diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..0351de3994a166179e9a175849f27bfc41e70ecd
--- /dev/null
+++ b/app.py
@@ -0,0 +1,274 @@
+import gradio as gr
+from PIL import Image
+from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline
+from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref
+from src.unet_hacked_tryon import UNet2DConditionModel
+from transformers import (
+    CLIPImageProcessor,
+    CLIPVisionModelWithProjection,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+)
+from diffusers import DDPMScheduler,AutoencoderKL
+from typing import List
+
+import torch
+import os
+from transformers import AutoTokenizer
+import spaces
+import numpy as np
+from utils_mask import get_mask_location
+from torchvision import transforms
+import apply_net
+from preprocess.humanparsing.run_parsing import Parsing
+from preprocess.openpose.run_openpose import OpenPose
+from detectron2.data.detection_utils import convert_PIL_to_numpy,_apply_exif_orientation
+
+
+
+def pil_to_binary_mask(pil_image, threshold=0):
+    np_image = np.array(pil_image)
+    grayscale_image = Image.fromarray(np_image).convert("L")
+    binary_mask = np.array(grayscale_image) > threshold
+    mask = np.zeros(binary_mask.shape, dtype=np.uint8)
+    for i in range(binary_mask.shape[0]):
+        for j in range(binary_mask.shape[1]):
+            if binary_mask[i,j] == True :
+                mask[i,j] = 1
+    mask = (mask*255).astype(np.uint8)
+    output_mask = Image.fromarray(mask)
+    return output_mask
+
+
+base_path = 'yisol/IDM-VTON'
+example_path = os.path.join(os.path.dirname(__file__), 'example')
+
+unet = UNet2DConditionModel.from_pretrained(
+    base_path,
+    subfolder="unet",
+    torch_dtype=torch.float16,
+)
+unet.requires_grad_(False)
+tokenizer_one = AutoTokenizer.from_pretrained(
+    base_path,
+    subfolder="tokenizer",
+    revision=None,
+    use_fast=False,
+)
+tokenizer_two = AutoTokenizer.from_pretrained(
+    base_path,
+    subfolder="tokenizer_2",
+    revision=None,
+    use_fast=False,
+)
+noise_scheduler = DDPMScheduler.from_pretrained(base_path, subfolder="scheduler")
+
+text_encoder_one = CLIPTextModel.from_pretrained(
+    base_path,
+    subfolder="text_encoder",
+    torch_dtype=torch.float16,
+)
+text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
+    base_path,
+    subfolder="text_encoder_2",
+    torch_dtype=torch.float16,
+)
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+    base_path,
+    subfolder="image_encoder",
+    torch_dtype=torch.float16,
+    )
+vae = AutoencoderKL.from_pretrained(base_path,
+                                    subfolder="vae",
+                                    torch_dtype=torch.float16,
+)
+
+# "stabilityai/stable-diffusion-xl-base-1.0",
+UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
+    base_path,
+    subfolder="unet_encoder",
+    torch_dtype=torch.float16,
+)
+
+parsing_model = Parsing(0)
+openpose_model = OpenPose(0)
+
+UNet_Encoder.requires_grad_(False)
+image_encoder.requires_grad_(False)
+vae.requires_grad_(False)
+unet.requires_grad_(False)
+text_encoder_one.requires_grad_(False)
+text_encoder_two.requires_grad_(False)
+tensor_transfrom = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+    )
+
+pipe = TryonPipeline.from_pretrained(
+        base_path,
+        unet=unet,
+        vae=vae,
+        feature_extractor= CLIPImageProcessor(),
+        text_encoder = text_encoder_one,
+        text_encoder_2 = text_encoder_two,
+        tokenizer = tokenizer_one,
+        tokenizer_2 = tokenizer_two,
+        scheduler = noise_scheduler,
+        image_encoder=image_encoder,
+        torch_dtype=torch.float16,
+)
+pipe.unet_encoder = UNet_Encoder
+
+@spaces.GPU
+def start_tryon(dict,garm_img,garment_des,is_checked,denoise_steps,seed):
+    device = "cuda"
+    
+    openpose_model.preprocessor.body_estimation.model.to(device)
+    pipe.to(device)
+    pipe.unet_encoder.to(device)
+
+    garm_img= garm_img.convert("RGB").resize((768,1024))
+    human_img = dict["background"].resize((768,1024)).convert("RGB")    
+    
+    if is_checked:
+        keypoints = openpose_model(human_img.resize((384,512)))
+        model_parse, _ = parsing_model(human_img.resize((384,512)))
+        mask, mask_gray = get_mask_location('hd', "upper_body", model_parse, keypoints)
+        mask = mask.resize((768,1024))
+    else:
+        mask = pil_to_binary_mask(dict['layers'][0].convert("RGB").resize((768, 1024)))
+        mask = transforms.ToTensor()(mask)
+        mask = mask.unsqueeze(0)
+
+
+    human_img_arg = _apply_exif_orientation(human_img.resize((384,512)))
+    human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
+     
+    
+
+    args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', 'cuda'))
+    # verbosity = getattr(args, "verbosity", None)
+    pose_img = args.func(args,human_img_arg)    
+    pose_img = pose_img[:,:,::-1]    
+    pose_img = Image.fromarray(pose_img).resize((768,1024))
+    
+    with torch.no_grad():
+        # Extract the images
+        with torch.cuda.amp.autocast():
+            with torch.no_grad():
+                prompt = "model is wearing " + garment_des
+                negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+                with torch.inference_mode():
+                    (
+                        prompt_embeds,
+                        negative_prompt_embeds,
+                        pooled_prompt_embeds,
+                        negative_pooled_prompt_embeds,
+                    ) = pipe.encode_prompt(
+                        prompt,
+                        num_images_per_prompt=1,
+                        do_classifier_free_guidance=True,
+                        negative_prompt=negative_prompt,
+                    )
+                                    
+                    prompt = "a photo of " + garment_des
+                    negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+                    if not isinstance(prompt, List):
+                        prompt = [prompt] * 1
+                    if not isinstance(negative_prompt, List):
+                        negative_prompt = [negative_prompt] * 1
+                    with torch.inference_mode():
+                        (
+                            prompt_embeds_c,
+                            _,
+                            _,
+                            _,
+                        ) = pipe.encode_prompt(
+                            prompt,
+                            num_images_per_prompt=1,
+                            do_classifier_free_guidance=False,
+                            negative_prompt=negative_prompt,
+                        )
+                        
+                    pose_img =  tensor_transfrom(pose_img).unsqueeze(0).to(device,torch.float16)
+                    garm_tensor =  tensor_transfrom(garm_img).unsqueeze(0).to(device,torch.float16)
+                    generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
+                    images = pipe(
+                        prompt_embeds=prompt_embeds.to(device,torch.float16),
+                        negative_prompt_embeds=negative_prompt_embeds.to(device,torch.float16),
+                        pooled_prompt_embeds=pooled_prompt_embeds.to(device,torch.float16),
+                        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device,torch.float16),
+                        num_inference_steps=denoise_steps,
+                        generator=generator,
+                        strength = 1.0,
+                        pose_img = pose_img.to(device,torch.float16),
+                        text_embeds_cloth=prompt_embeds_c.to(device,torch.float16),
+                        cloth = garm_tensor.to(device,torch.float16),
+                        mask_image=mask,
+                        image=human_img, 
+                        height=1024,
+                        width=768,
+                        ip_adapter_image = garm_img.resize((768,1024)),
+                        guidance_scale=2.0,
+                    )[0]
+    return images[0]
+
+garm_list = os.listdir(os.path.join(example_path,"cloth"))
+garm_list_path = [os.path.join(example_path,"cloth",garm) for garm in garm_list]
+
+human_list = os.listdir(os.path.join(example_path,"human"))
+human_list_path = [os.path.join(example_path,"human",human) for human in human_list]
+
+human_ex_list = []
+for ex_human in human_list_path:
+    ex_dict= {}
+    ex_dict['background'] = ex_human
+    ex_dict['layers'] = None
+    ex_dict['composite'] = None
+    human_ex_list.append(ex_dict)
+
+##default human
+
+image_blocks = gr.Blocks().queue()
+with image_blocks as demo:
+    with gr.Row():
+        with gr.Column():
+            imgs = gr.ImageMask(sources='upload', type="pil", label='Human. Mask with pen or use auto-masking', interactive=True)
+            with gr.Row():
+                is_checked = gr.Checkbox(label="Yes", info="Use auto-generated mask")
+            example = gr.Examples(
+                inputs=imgs,
+                examples_per_page=8,
+                examples=human_ex_list
+            )
+
+        with gr.Column():
+            garm_img = gr.Image(label="Garment", sources='upload', type="pil")
+            with gr.Row(elem_id="prompt-container"):
+                with gr.Row():
+                    prompt = gr.Textbox(placeholder="Description of garment ex) Short Sleeve Round Neck T-shirts", show_label=False, elem_id="prompt")
+            example = gr.Examples(
+                inputs=garm_img,
+                examples_per_page=8,
+                examples=garm_list_path)
+        with gr.Column():
+            # image_out = gr.Image(label="Output", elem_id="output-img", height=400)
+            image_out = gr.Image(label="Output", elem_id="output-img",show_share_button=False)
+
+    with gr.Column():
+        try_button = gr.Button(value="Try-on")
+        with gr.Accordion(label="Advanced Settings", open=False):
+            with gr.Row():
+                denoise_steps = gr.Number(label="Denoising Steps", minimum=20, maximum=40, value=30, step=1)
+                seed = gr.Number(label="Seed", minimum=-1, maximum=2147483647, step=1, value=42)
+
+
+    try_button.click(fn=start_tryon, inputs=[imgs, garm_img, prompt, is_checked, denoise_steps, seed], outputs=[image_out], api_name='tryon')
+
+            
+
+
+image_blocks.launch()
+
diff --git a/apply_net.py b/apply_net.py
new file mode 100755
index 0000000000000000000000000000000000000000..732539d616d7dd9d039302f97b5d6438a5cb4892
--- /dev/null
+++ b/apply_net.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import argparse
+import glob
+import logging
+import os
+import sys
+from typing import Any, ClassVar, Dict, List
+import torch
+
+from detectron2.config import CfgNode, get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.engine.defaults import DefaultPredictor
+from detectron2.structures.instances import Instances
+from detectron2.utils.logger import setup_logger
+
+from densepose import add_densepose_config
+from densepose.structures import DensePoseChartPredictorOutput, DensePoseEmbeddingPredictorOutput
+from densepose.utils.logger import verbosity_to_level
+from densepose.vis.base import CompoundVisualizer
+from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer
+from densepose.vis.densepose_outputs_vertex import (
+    DensePoseOutputsTextureVisualizer,
+    DensePoseOutputsVertexVisualizer,
+    get_texture_atlases,
+)
+from densepose.vis.densepose_results import (
+    DensePoseResultsContourVisualizer,
+    DensePoseResultsFineSegmentationVisualizer,
+    DensePoseResultsUVisualizer,
+    DensePoseResultsVVisualizer,
+)
+from densepose.vis.densepose_results_textures import (
+    DensePoseResultsVisualizerWithTexture,
+    get_texture_atlas,
+)
+from densepose.vis.extractor import (
+    CompoundExtractor,
+    DensePoseOutputsExtractor,
+    DensePoseResultExtractor,
+    create_extractor,
+)
+
+DOC = """Apply Net - a tool to print / visualize DensePose results
+"""
+
+LOGGER_NAME = "apply_net"
+logger = logging.getLogger(LOGGER_NAME)
+
+_ACTION_REGISTRY: Dict[str, "Action"] = {}
+
+
+class Action:
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        parser.add_argument(
+            "-v",
+            "--verbosity",
+            action="count",
+            help="Verbose mode. Multiple -v options increase the verbosity.",
+        )
+
+
+def register_action(cls: type):
+    """
+    Decorator for action classes to automate action registration
+    """
+    global _ACTION_REGISTRY
+    _ACTION_REGISTRY[cls.COMMAND] = cls
+    return cls
+
+
+class InferenceAction(Action):
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(InferenceAction, cls).add_arguments(parser)
+        parser.add_argument("cfg", metavar="<config>", help="Config file")
+        parser.add_argument("model", metavar="<model>", help="Model file")
+        parser.add_argument(
+            "--opts",
+            help="Modify config options using the command-line 'KEY VALUE' pairs",
+            default=[],
+            nargs=argparse.REMAINDER,
+        )
+
+    @classmethod
+    def execute(cls: type, args: argparse.Namespace, human_img):
+        logger.info(f"Loading config from {args.cfg}")
+        opts = []
+        cfg = cls.setup_config(args.cfg, args.model, args, opts)
+        logger.info(f"Loading model from {args.model}")
+        predictor = DefaultPredictor(cfg)
+        # logger.info(f"Loading data from {args.input}")
+        # file_list = cls._get_input_file_list(args.input)
+        # if len(file_list) == 0:
+        #     logger.warning(f"No input images for {args.input}")
+        #     return
+        context = cls.create_context(args, cfg)
+        # for file_name in file_list:
+        #     img = read_image(file_name, format="BGR")  # predictor expects BGR image.
+        with torch.no_grad():
+            outputs = predictor(human_img)["instances"]
+            out_pose = cls.execute_on_outputs(context, {"image": human_img}, outputs)
+        cls.postexecute(context)
+        return out_pose
+
+    @classmethod
+    def setup_config(
+        cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
+    ):
+        cfg = get_cfg()
+        add_densepose_config(cfg)
+        cfg.merge_from_file(config_fpath)
+        cfg.merge_from_list(args.opts)
+        if opts:
+            cfg.merge_from_list(opts)
+        cfg.MODEL.WEIGHTS = model_fpath
+        cfg.freeze()
+        return cfg
+
+    @classmethod
+    def _get_input_file_list(cls: type, input_spec: str):
+        if os.path.isdir(input_spec):
+            file_list = [
+                os.path.join(input_spec, fname)
+                for fname in os.listdir(input_spec)
+                if os.path.isfile(os.path.join(input_spec, fname))
+            ]
+        elif os.path.isfile(input_spec):
+            file_list = [input_spec]
+        else:
+            file_list = glob.glob(input_spec)
+        return file_list
+
+
+@register_action
+class DumpAction(InferenceAction):
+    """
+    Dump action that outputs results to a pickle file
+    """
+
+    COMMAND: ClassVar[str] = "dump"
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Dump model outputs to a file.")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(DumpAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "--output",
+            metavar="<dump_file>",
+            default="results.pkl",
+            help="File name to save dump to",
+        )
+
+    @classmethod
+    def execute_on_outputs(
+        cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
+    ):
+        image_fpath = entry["file_name"]
+        logger.info(f"Processing {image_fpath}")
+        result = {"file_name": image_fpath}
+        if outputs.has("scores"):
+            result["scores"] = outputs.get("scores").cpu()
+        if outputs.has("pred_boxes"):
+            result["pred_boxes_XYXY"] = outputs.get("pred_boxes").tensor.cpu()
+            if outputs.has("pred_densepose"):
+                if isinstance(outputs.pred_densepose, DensePoseChartPredictorOutput):
+                    extractor = DensePoseResultExtractor()
+                elif isinstance(outputs.pred_densepose, DensePoseEmbeddingPredictorOutput):
+                    extractor = DensePoseOutputsExtractor()
+                result["pred_densepose"] = extractor(outputs)[0]
+        context["results"].append(result)
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace, cfg: CfgNode):
+        context = {"results": [], "out_fname": args.output}
+        return context
+
+    @classmethod
+    def postexecute(cls: type, context: Dict[str, Any]):
+        out_fname = context["out_fname"]
+        out_dir = os.path.dirname(out_fname)
+        if len(out_dir) > 0 and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+        with open(out_fname, "wb") as hFile:
+            torch.save(context["results"], hFile)
+            logger.info(f"Output saved to {out_fname}")
+
+
+@register_action
+class ShowAction(InferenceAction):
+    """
+    Show action that visualizes selected entries on an image
+    """
+
+    COMMAND: ClassVar[str] = "show"
+    VISUALIZERS: ClassVar[Dict[str, object]] = {
+        "dp_contour": DensePoseResultsContourVisualizer,
+        "dp_segm": DensePoseResultsFineSegmentationVisualizer,
+        "dp_u": DensePoseResultsUVisualizer,
+        "dp_v": DensePoseResultsVVisualizer,
+        "dp_iuv_texture": DensePoseResultsVisualizerWithTexture,
+        "dp_cse_texture": DensePoseOutputsTextureVisualizer,
+        "dp_vertex": DensePoseOutputsVertexVisualizer,
+        "bbox": ScoredBoundingBoxVisualizer,
+    }
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(ShowAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "visualizations",
+            metavar="<visualizations>",
+            help="Comma separated list of visualizations, possible values: "
+            "[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
+        )
+        parser.add_argument(
+            "--min_score",
+            metavar="<score>",
+            default=0.8,
+            type=float,
+            help="Minimum detection score to visualize",
+        )
+        parser.add_argument(
+            "--nms_thresh", metavar="<threshold>", default=None, type=float, help="NMS threshold"
+        )
+        parser.add_argument(
+            "--texture_atlas",
+            metavar="<texture_atlas>",
+            default=None,
+            help="Texture atlas file (for IUV texture transfer)",
+        )
+        parser.add_argument(
+            "--texture_atlases_map",
+            metavar="<texture_atlases_map>",
+            default=None,
+            help="JSON string of a dict containing texture atlas files for each mesh",
+        )
+        parser.add_argument(
+            "--output",
+            metavar="<image_file>",
+            default="outputres.png",
+            help="File name to save output to",
+        )
+
+    @classmethod
+    def setup_config(
+        cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
+    ):
+        opts.append("MODEL.ROI_HEADS.SCORE_THRESH_TEST")
+        opts.append(str(args.min_score))
+        if args.nms_thresh is not None:
+            opts.append("MODEL.ROI_HEADS.NMS_THRESH_TEST")
+            opts.append(str(args.nms_thresh))
+        cfg = super(ShowAction, cls).setup_config(config_fpath, model_fpath, args, opts)
+        return cfg
+
+    @classmethod
+    def execute_on_outputs(
+        cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
+    ):
+        import cv2
+        import numpy as np
+        visualizer = context["visualizer"]
+        extractor = context["extractor"]
+        # image_fpath = entry["file_name"]
+        # logger.info(f"Processing {image_fpath}")
+        image = cv2.cvtColor(entry["image"], cv2.COLOR_BGR2GRAY)
+        image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
+        data = extractor(outputs)
+        image_vis = visualizer.visualize(image, data)
+
+        return image_vis
+        entry_idx = context["entry_idx"] + 1
+        out_fname = './image-densepose/' + image_fpath.split('/')[-1]
+        out_dir = './image-densepose'
+        out_dir = os.path.dirname(out_fname)
+        if len(out_dir) > 0 and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+        cv2.imwrite(out_fname, image_vis)
+        logger.info(f"Output saved to {out_fname}")
+        context["entry_idx"] += 1
+
+    @classmethod
+    def postexecute(cls: type, context: Dict[str, Any]):
+        pass
+# python ./apply_net.py show ./configs/densepose_rcnn_R_50_FPN_s1x.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl /home/alin0222/DressCode/upper_body/images dp_segm -v --opts MODEL.DEVICE cpu
+
+    @classmethod
+    def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
+        base, ext = os.path.splitext(fname_base)
+        return base + ".{0:04d}".format(entry_idx) + ext
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace, cfg: CfgNode) -> Dict[str, Any]:
+        vis_specs = args.visualizations.split(",")
+        visualizers = []
+        extractors = []
+        for vis_spec in vis_specs:
+            texture_atlas = get_texture_atlas(args.texture_atlas)
+            texture_atlases_dict = get_texture_atlases(args.texture_atlases_map)
+            vis = cls.VISUALIZERS[vis_spec](
+                cfg=cfg,
+                texture_atlas=texture_atlas,
+                texture_atlases_dict=texture_atlases_dict,
+            )
+            visualizers.append(vis)
+            extractor = create_extractor(vis)
+            extractors.append(extractor)
+        visualizer = CompoundVisualizer(visualizers)
+        extractor = CompoundExtractor(extractors)
+        context = {
+            "extractor": extractor,
+            "visualizer": visualizer,
+            "out_fname": args.output,
+            "entry_idx": 0,
+        }
+        return context
+
+
+def create_argument_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description=DOC,
+        formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
+    )
+    parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
+    subparsers = parser.add_subparsers(title="Actions")
+    for _, action in _ACTION_REGISTRY.items():
+        action.add_parser(subparsers)
+    return parser
+
+
+def main():
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    verbosity = getattr(args, "verbosity", None)
+    global logger
+    logger = setup_logger(name=LOGGER_NAME)
+    logger.setLevel(verbosity_to_level(verbosity))
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
+
+
+# python ./apply_net.py show ./configs/densepose_rcnn_R_50_FPN_s1x.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl /home/alin0222/Dresscode/dresses/humanonly dp_segm -v --opts MODEL.DEVICE cuda
diff --git a/ckpt/densepose/model_final_162be9.pkl b/ckpt/densepose/model_final_162be9.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1556c53af8286e4584b18130475c0ccdb0a61ad6
--- /dev/null
+++ b/ckpt/densepose/model_final_162be9.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8a7382001b16e453bad95ca9dbc68ae8f2b839b304cf90eaf5c27fbdb4dae91
+size 255757821
diff --git a/ckpt/humanparsing/parsing_atr.onnx b/ckpt/humanparsing/parsing_atr.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..28883cf4b0069c96f0f00930798428017425c3fa
--- /dev/null
+++ b/ckpt/humanparsing/parsing_atr.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04c7d1d070d0e0ae943d86b18cb5aaaea9e278d97462e9cfb270cbbe4cd977f4
+size 266859305
diff --git a/ckpt/humanparsing/parsing_lip.onnx b/ckpt/humanparsing/parsing_lip.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..7d1a879fa30fc002188b0c9fec3cc05064dd1093
--- /dev/null
+++ b/ckpt/humanparsing/parsing_lip.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8436e1dae96e2601c373d1ace29c8f0978b16357d9038c17a8ba756cca376dbc
+size 266863411
diff --git a/ckpt/openpose/.DS_Store b/ckpt/openpose/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..8964b8e00b7eb0b06929e41cc2161b887946c0af
Binary files /dev/null and b/ckpt/openpose/.DS_Store differ
diff --git a/ckpt/openpose/ckpts/body_pose_model.pth b/ckpt/openpose/ckpts/body_pose_model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9acb77e68f31906a8875f1daef2f3f7ef94acb1e
--- /dev/null
+++ b/ckpt/openpose/ckpts/body_pose_model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25a948c16078b0f08e236bda51a385d855ef4c153598947c28c0d47ed94bb746
+size 209267595
diff --git a/configs/Base-DensePose-RCNN-FPN.yaml b/configs/Base-DensePose-RCNN-FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1579187a7004e716eb3a86dbbfebb092d7aca84b
--- /dev/null
+++ b/configs/Base-DensePose-RCNN-FPN.yaml
@@ -0,0 +1,48 @@
+VERSION: 2
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+
+  DENSEPOSE_ON: True
+  ROI_HEADS:
+    NAME: "DensePoseROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    POOLER_TYPE: "ROIAlign"
+    NUM_COARSE_SEGM_CHANNELS: 2
+DATASETS:
+  TRAIN: ("densepose_coco_2014_train", "densepose_coco_2014_valminusminival")
+  TEST: ("densepose_coco_2014_minival",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  WARMUP_FACTOR: 0.1
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/configs/HRNet/densepose_rcnn_HRFPN_HRNet_w32_s1x.yaml b/configs/HRNet/densepose_rcnn_HRFPN_HRNet_w32_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..36eabfed984b360907f5782d4e8b0232784f8a40
--- /dev/null
+++ b/configs/HRNet/densepose_rcnn_HRFPN_HRNet_w32_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://1drv.ms/u/s!Aus8VCZ_C_33dYBMemi9xOUFR0w"
+  BACKBONE:
+    NAME: "build_hrfpn_backbone"
+  RPN:
+    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
+  ROI_HEADS:
+    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "norm"
+  BASE_LR: 0.03
diff --git a/configs/HRNet/densepose_rcnn_HRFPN_HRNet_w40_s1x.yaml b/configs/HRNet/densepose_rcnn_HRFPN_HRNet_w40_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0ca8085e154c40a5b0f42a17575d2d48328619f0
--- /dev/null
+++ b/configs/HRNet/densepose_rcnn_HRFPN_HRNet_w40_s1x.yaml
@@ -0,0 +1,23 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://1drv.ms/u/s!Aus8VCZ_C_33ck0gvo5jfoWBOPo"
+  BACKBONE:
+    NAME: "build_hrfpn_backbone"
+  RPN:
+    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
+  ROI_HEADS:
+    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
+  HRNET:
+    STAGE2:
+      NUM_CHANNELS: [40, 80]
+    STAGE3:
+      NUM_CHANNELS: [40, 80, 160]
+    STAGE4:
+      NUM_CHANNELS: [40, 80, 160, 320]
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "norm"
+  BASE_LR: 0.03
diff --git a/configs/HRNet/densepose_rcnn_HRFPN_HRNet_w48_s1x.yaml b/configs/HRNet/densepose_rcnn_HRFPN_HRNet_w48_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a3f437ab57ae0ff48cd4a97cbda987346f9a5a24
--- /dev/null
+++ b/configs/HRNet/densepose_rcnn_HRFPN_HRNet_w48_s1x.yaml
@@ -0,0 +1,23 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://1drv.ms/u/s!Aus8VCZ_C_33dKvqI6pBZlifgJk"
+  BACKBONE:
+    NAME: "build_hrfpn_backbone"
+  RPN:
+    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
+  ROI_HEADS:
+    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
+  HRNET:
+    STAGE2:
+      NUM_CHANNELS: [48, 96]
+    STAGE3:
+      NUM_CHANNELS: [48, 96, 192]
+    STAGE4:
+      NUM_CHANNELS: [48, 96, 192, 384]
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "norm"
+  BASE_LR: 0.03
diff --git a/configs/cse/Base-DensePose-RCNN-FPN-Human.yaml b/configs/cse/Base-DensePose-RCNN-FPN-Human.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e92340ee0cdba2abd0a35114cbf3e78b04435dfe
--- /dev/null
+++ b/configs/cse/Base-DensePose-RCNN-FPN-Human.yaml
@@ -0,0 +1,20 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  ROI_DENSEPOSE_HEAD:
+    CSE:
+      EMBEDDERS:
+        "smpl_27554":
+          TYPE: vertex_feature
+          NUM_VERTICES: 27554
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_smpl_27554_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_coco_2014_train_cse"
+    - "densepose_coco_2014_valminusminival_cse"
+  TEST:
+    - "densepose_coco_2014_minival_cse"
+  CLASS_TO_MESH_NAME_MAPPING:
+    "0": "smpl_27554"
diff --git a/configs/cse/Base-DensePose-RCNN-FPN.yaml b/configs/cse/Base-DensePose-RCNN-FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..de3b26009bdee95666248f99cd243fe37e7fd8bd
--- /dev/null
+++ b/configs/cse/Base-DensePose-RCNN-FPN.yaml
@@ -0,0 +1,60 @@
+VERSION: 2
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+
+  DENSEPOSE_ON: True
+  ROI_HEADS:
+    NAME: "DensePoseROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    POOLER_TYPE: "ROIAlign"
+    NUM_COARSE_SEGM_CHANNELS: 2
+    PREDICTOR_NAME: "DensePoseEmbeddingPredictor"
+    LOSS_NAME: "DensePoseCseLoss"
+    CSE:
+      # embedding loss, possible values:
+      # - "EmbeddingLoss"
+      # - "SoftEmbeddingLoss"
+      EMBED_LOSS_NAME: "EmbeddingLoss"
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  WARMUP_FACTOR: 0.1
+  CLIP_GRADIENTS:
+    CLIP_TYPE: norm
+    CLIP_VALUE: 1.0
+    ENABLED: true
+    NORM_TYPE: 2.0
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+DENSEPOSE_EVALUATION:
+  TYPE: cse
+  STORAGE: file
diff --git a/configs/cse/densepose_rcnn_R_101_FPN_DL_s1x.yaml b/configs/cse/densepose_rcnn_R_101_FPN_DL_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..69d858902671e683b884b32c3c1448a44dc3995e
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_101_FPN_DL_s1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-DensePose-RCNN-FPN-Human.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    CSE:
+      EMBED_LOSS_NAME: "EmbeddingLoss"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/cse/densepose_rcnn_R_101_FPN_DL_soft_s1x.yaml b/configs/cse/densepose_rcnn_R_101_FPN_DL_soft_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..141657cdab24a2f591eeef763aef29543c43108e
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_101_FPN_DL_soft_s1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-DensePose-RCNN-FPN-Human.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/cse/densepose_rcnn_R_101_FPN_s1x.yaml b/configs/cse/densepose_rcnn_R_101_FPN_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d2eea1e2c3cecc7bba1bfd6f2332227bd3d0f5ed
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_101_FPN_s1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-DensePose-RCNN-FPN-Human.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    CSE:
+      EMBED_LOSS_NAME: "EmbeddingLoss"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/cse/densepose_rcnn_R_101_FPN_soft_s1x.yaml b/configs/cse/densepose_rcnn_R_101_FPN_soft_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1c362e1f9e93f9b9b458532f5318518396404d9f
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_101_FPN_soft_s1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-DensePose-RCNN-FPN-Human.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_DL_s1x.yaml b/configs/cse/densepose_rcnn_R_50_FPN_DL_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..26684deaa9c72aab1408dbe3abb6ac3a9b6a17ac
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_DL_s1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-DensePose-RCNN-FPN-Human.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    CSE:
+      EMBED_LOSS_NAME: "EmbeddingLoss"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_DL_soft_s1x.yaml b/configs/cse/densepose_rcnn_R_50_FPN_DL_soft_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b53501d29b84e9ff4088ce98bc83688e89e546ed
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_DL_soft_s1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-DensePose-RCNN-FPN-Human.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_s1x.yaml b/configs/cse/densepose_rcnn_R_50_FPN_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c186625a86cc76441b9edeefeabd7caf44af7755
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_s1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-DensePose-RCNN-FPN-Human.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    CSE:
+      EMBED_LOSS_NAME: "EmbeddingLoss"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_CA_finetune_16k.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_CA_finetune_16k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..69ab22669e2176b6ec661fc982be7412abb5e0e8
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_CA_finetune_16k.yaml
@@ -0,0 +1,133 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/cse/densepose_rcnn_R_50_FPN_soft_s1x/250533982/model_final_2c4512.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 1
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      GEODESIC_DIST_GAUSS_SIGMA: 0.1
+      EMBEDDERS:
+        "cat_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cat_7466_256.pkl"
+        "dog_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_dog_7466_256.pkl"
+        "sheep_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_sheep_5004_256.pkl"
+        "horse_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_horse_5004_256.pkl"
+        "zebra_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_zebra_5002_256.pkl"
+        "giraffe_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_giraffe_5002_256.pkl"
+        "elephant_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_elephant_5002_256.pkl"
+        "cow_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cow_5002_256.pkl"
+        "bear_4936":
+          TYPE: vertex_feature
+          NUM_VERTICES: 4936
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_bear_4936_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_lvis_v1_ds2_train_v1"
+  TEST:
+    - "densepose_lvis_v1_ds2_val_v1"
+  WHITELISTED_CATEGORIES:
+    "densepose_lvis_v1_ds2_train_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_ds2_val_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+  CATEGORY_MAPS:
+    "densepose_lvis_v1_ds2_train_v1":
+      "1202": 943 # zebra -> sheep
+      "569": 943  # horse -> sheep
+      "496": 943  # giraffe -> sheep
+      "422": 943  # elephant -> sheep
+      "80": 943   # cow -> sheep
+      "76": 943   # bear -> sheep
+      "225": 943  # cat -> sheep
+      "378": 943  # dog -> sheep
+    "densepose_lvis_v1_ds2_val_v1":
+      "1202": 943 # zebra -> sheep
+      "569": 943  # horse -> sheep
+      "496": 943  # giraffe -> sheep
+      "422": 943  # elephant -> sheep
+      "80": 943   # cow -> sheep
+      "76": 943   # bear -> sheep
+      "225": 943  # cat -> sheep
+      "378": 943  # dog -> sheep
+  CLASS_TO_MESH_NAME_MAPPING:
+    # Note: different classes are mapped to a single class
+    # mesh is chosen based on GT data, so this is just some
+    # value which has no particular meaning
+    "0": "sheep_5004"
+SOLVER:
+  MAX_ITER: 16000
+  STEPS: (12000, 14000)
+DENSEPOSE_EVALUATION:
+  EVALUATE_MESH_ALIGNMENT: True
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_CA_finetune_4k.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_CA_finetune_4k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..921a9c125d9da982fb88172acc7825ba3c583370
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_CA_finetune_4k.yaml
@@ -0,0 +1,133 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/cse/densepose_rcnn_R_50_FPN_soft_s1x/250533982/model_final_2c4512.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 1
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      GEODESIC_DIST_GAUSS_SIGMA: 0.1
+      EMBEDDERS:
+        "cat_5001":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5001
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cat_5001_256.pkl"
+        "dog_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_dog_5002_256.pkl"
+        "sheep_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_sheep_5004_256.pkl"
+        "horse_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_horse_5004_256.pkl"
+        "zebra_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_zebra_5002_256.pkl"
+        "giraffe_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_giraffe_5002_256.pkl"
+        "elephant_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_elephant_5002_256.pkl"
+        "cow_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cow_5002_256.pkl"
+        "bear_4936":
+          TYPE: vertex_feature
+          NUM_VERTICES: 4936
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_bear_4936_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_lvis_v1_ds1_train_v1"
+  TEST:
+    - "densepose_lvis_v1_ds1_val_v1"
+  WHITELISTED_CATEGORIES:
+    "densepose_lvis_v1_ds1_train_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_ds1_val_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+  CATEGORY_MAPS:
+    "densepose_lvis_v1_ds1_train_v1":
+      "1202": 943 # zebra -> sheep
+      "569": 943  # horse -> sheep
+      "496": 943  # giraffe -> sheep
+      "422": 943  # elephant -> sheep
+      "80": 943   # cow -> sheep
+      "76": 943   # bear -> sheep
+      "225": 943  # cat -> sheep
+      "378": 943  # dog -> sheep
+    "densepose_lvis_v1_ds1_val_v1":
+      "1202": 943 # zebra -> sheep
+      "569": 943  # horse -> sheep
+      "496": 943  # giraffe -> sheep
+      "422": 943  # elephant -> sheep
+      "80": 943   # cow -> sheep
+      "76": 943   # bear -> sheep
+      "225": 943  # cat -> sheep
+      "378": 943  # dog -> sheep
+  CLASS_TO_MESH_NAME_MAPPING:
+    # Note: different classes are mapped to a single class
+    # mesh is chosen based on GT data, so this is just some
+    # value which has no particular meaning
+    "0": "sheep_5004"
+SOLVER:
+  MAX_ITER: 4000
+  STEPS: (3000, 3500)
+DENSEPOSE_EVALUATION:
+  EVALUATE_MESH_ALIGNMENT: True
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_I0_finetune_16k.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_I0_finetune_16k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1b5a098d171e508fcb9dd8088ecc1799c3068efc
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_I0_finetune_16k.yaml
@@ -0,0 +1,119 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_maskonly_24k/270668502/model_final_21b1d2.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 9
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      GEODESIC_DIST_GAUSS_SIGMA: 0.1
+      EMBEDDERS:
+        "cat_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cat_7466_256.pkl"
+        "dog_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_dog_7466_256.pkl"
+        "sheep_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_sheep_5004_256.pkl"
+        "horse_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_horse_5004_256.pkl"
+        "zebra_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_zebra_5002_256.pkl"
+        "giraffe_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_giraffe_5002_256.pkl"
+        "elephant_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_elephant_5002_256.pkl"
+        "cow_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cow_5002_256.pkl"
+        "bear_4936":
+          TYPE: vertex_feature
+          NUM_VERTICES: 4936
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_bear_4936_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_lvis_v1_ds2_train_v1"
+  TEST:
+    - "densepose_lvis_v1_ds2_val_v1"
+  WHITELISTED_CATEGORIES:
+    "densepose_lvis_v1_ds2_train_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_ds2_val_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+  CLASS_TO_MESH_NAME_MAPPING:
+    "0": "bear_4936"
+    "1": "cow_5002"
+    "2": "cat_7466"
+    "3": "dog_7466"
+    "4": "elephant_5002"
+    "5": "giraffe_5002"
+    "6": "horse_5004"
+    "7": "sheep_5004"
+    "8": "zebra_5002"
+SOLVER:
+  MAX_ITER: 16000
+  STEPS: (12000, 14000)
+DENSEPOSE_EVALUATION:
+  EVALUATE_MESH_ALIGNMENT: True
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_I0_finetune_i2m_16k.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_I0_finetune_i2m_16k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..18d6dacf4b62e609aa85735a87daa8d2506000d7
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_I0_finetune_i2m_16k.yaml
@@ -0,0 +1,121 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_maskonly_24k/270668502/model_final_21b1d2.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 9
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      GEODESIC_DIST_GAUSS_SIGMA: 0.1
+      PIX_TO_SHAPE_CYCLE_LOSS:
+        ENABLED: True
+      EMBEDDERS:
+        "cat_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cat_7466_256.pkl"
+        "dog_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_dog_7466_256.pkl"
+        "sheep_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_sheep_5004_256.pkl"
+        "horse_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_horse_5004_256.pkl"
+        "zebra_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_zebra_5002_256.pkl"
+        "giraffe_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_giraffe_5002_256.pkl"
+        "elephant_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_elephant_5002_256.pkl"
+        "cow_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cow_5002_256.pkl"
+        "bear_4936":
+          TYPE: vertex_feature
+          NUM_VERTICES: 4936
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_bear_4936_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_lvis_v1_ds2_train_v1"
+  TEST:
+    - "densepose_lvis_v1_ds2_val_v1"
+  WHITELISTED_CATEGORIES:
+    "densepose_lvis_v1_ds2_train_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_ds2_val_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+  CLASS_TO_MESH_NAME_MAPPING:
+    "0": "bear_4936"
+    "1": "cow_5002"
+    "2": "cat_7466"
+    "3": "dog_7466"
+    "4": "elephant_5002"
+    "5": "giraffe_5002"
+    "6": "horse_5004"
+    "7": "sheep_5004"
+    "8": "zebra_5002"
+SOLVER:
+  MAX_ITER: 16000
+  STEPS: (12000, 14000)
+DENSEPOSE_EVALUATION:
+  EVALUATE_MESH_ALIGNMENT: True
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_I0_finetune_m2m_16k.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_I0_finetune_m2m_16k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6b798ae21204b9310adae33040c870253edc68ee
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_I0_finetune_m2m_16k.yaml
@@ -0,0 +1,138 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_maskonly_24k/267687159/model_final_354e61.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 9
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      GEODESIC_DIST_GAUSS_SIGMA: 0.1
+      SHAPE_TO_SHAPE_CYCLE_LOSS:
+        ENABLED: True
+      EMBEDDERS:
+        "cat_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cat_7466_256.pkl"
+        "dog_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_dog_7466_256.pkl"
+        "sheep_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_sheep_5004_256.pkl"
+        "horse_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_horse_5004_256.pkl"
+        "zebra_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_zebra_5002_256.pkl"
+        "giraffe_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_giraffe_5002_256.pkl"
+        "elephant_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_elephant_5002_256.pkl"
+        "cow_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cow_5002_256.pkl"
+        "bear_4936":
+          TYPE: vertex_feature
+          NUM_VERTICES: 4936
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_bear_4936_256.pkl"
+        "smpl_27554":
+          TYPE: vertex_feature
+          NUM_VERTICES: 27554
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_smpl_27554_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_lvis_v1_ds2_train_v1"
+  TEST:
+    - "densepose_lvis_v1_ds2_val_v1"
+  WHITELISTED_CATEGORIES:
+    "densepose_lvis_v1_ds2_train_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_ds2_val_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+  CLASS_TO_MESH_NAME_MAPPING:
+    "0": "bear_4936"
+    "1": "cow_5002"
+    "2": "cat_7466"
+    "3": "dog_7466"
+    "4": "elephant_5002"
+    "5": "giraffe_5002"
+    "6": "horse_5004"
+    "7": "sheep_5004"
+    "8": "zebra_5002"
+SOLVER:
+  MAX_ITER: 16000
+  STEPS: (12000, 14000)
+DENSEPOSE_EVALUATION:
+  EVALUATE_MESH_ALIGNMENT: True
+  MESH_ALIGNMENT_MESH_NAMES:
+    - bear_4936
+    - cow_5002
+    - cat_7466
+    - dog_7466
+    - elephant_5002
+    - giraffe_5002
+    - horse_5004
+    - sheep_5004
+    - zebra_5002
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_16k.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_16k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b1462e374377fbf448e176951794face175b5002
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_16k.yaml
@@ -0,0 +1,119 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/cse/densepose_rcnn_R_50_FPN_soft_s1x/250533982/model_final_2c4512.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 9
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      GEODESIC_DIST_GAUSS_SIGMA: 0.1
+      EMBEDDERS:
+        "cat_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cat_7466_256.pkl"
+        "dog_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_dog_7466_256.pkl"
+        "sheep_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_sheep_5004_256.pkl"
+        "horse_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_horse_5004_256.pkl"
+        "zebra_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_zebra_5002_256.pkl"
+        "giraffe_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_giraffe_5002_256.pkl"
+        "elephant_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_elephant_5002_256.pkl"
+        "cow_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cow_5002_256.pkl"
+        "bear_4936":
+          TYPE: vertex_feature
+          NUM_VERTICES: 4936
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_bear_4936_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_lvis_v1_ds2_train_v1"
+  TEST:
+    - "densepose_lvis_v1_ds2_val_v1"
+  WHITELISTED_CATEGORIES:
+    "densepose_lvis_v1_ds2_train_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_ds2_val_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+  CLASS_TO_MESH_NAME_MAPPING:
+    "0": "bear_4936"
+    "1": "cow_5002"
+    "2": "cat_7466"
+    "3": "dog_7466"
+    "4": "elephant_5002"
+    "5": "giraffe_5002"
+    "6": "horse_5004"
+    "7": "sheep_5004"
+    "8": "zebra_5002"
+SOLVER:
+  MAX_ITER: 16000
+  STEPS: (12000, 14000)
+DENSEPOSE_EVALUATION:
+  EVALUATE_MESH_ALIGNMENT: True
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_4k.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_4k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ba4b81dde2ef53749b096f137ac658563fdad857
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_4k.yaml
@@ -0,0 +1,119 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/cse/densepose_rcnn_R_50_FPN_soft_s1x/250533982/model_final_2c4512.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 9
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      GEODESIC_DIST_GAUSS_SIGMA: 0.1
+      EMBEDDERS:
+        "cat_5001":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5001
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cat_5001_256.pkl"
+        "dog_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_dog_5002_256.pkl"
+        "sheep_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_sheep_5004_256.pkl"
+        "horse_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_horse_5004_256.pkl"
+        "zebra_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_zebra_5002_256.pkl"
+        "giraffe_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_giraffe_5002_256.pkl"
+        "elephant_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_elephant_5002_256.pkl"
+        "cow_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cow_5002_256.pkl"
+        "bear_4936":
+          TYPE: vertex_feature
+          NUM_VERTICES: 4936
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_bear_4936_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_lvis_v1_ds1_train_v1"
+  TEST:
+    - "densepose_lvis_v1_ds1_val_v1"
+  WHITELISTED_CATEGORIES:
+    "densepose_lvis_v1_ds1_train_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_ds1_val_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+  CLASS_TO_MESH_NAME_MAPPING:
+    "0": "bear_4936"
+    "1": "cow_5002"
+    "2": "cat_5001"
+    "3": "dog_5002"
+    "4": "elephant_5002"
+    "5": "giraffe_5002"
+    "6": "horse_5004"
+    "7": "sheep_5004"
+    "8": "zebra_5002"
+SOLVER:
+  MAX_ITER: 4000
+  STEPS: (3000, 3500)
+DENSEPOSE_EVALUATION:
+  EVALUATE_MESH_ALIGNMENT: True
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_maskonly_24k.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_maskonly_24k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bb6136e274ca64aa2285698664d3243519d1979f
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_maskonly_24k.yaml
@@ -0,0 +1,118 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/cse/densepose_rcnn_R_50_FPN_soft_s1x/250533982/model_final_2c4512.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 9
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBED_LOSS_WEIGHT: 0.0
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      GEODESIC_DIST_GAUSS_SIGMA: 0.1
+      EMBEDDERS:
+        "cat_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cat_7466_256.pkl"
+        "dog_7466":
+          TYPE: vertex_feature
+          NUM_VERTICES: 7466
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_dog_7466_256.pkl"
+        "sheep_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_sheep_5004_256.pkl"
+        "horse_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_horse_5004_256.pkl"
+        "zebra_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_zebra_5002_256.pkl"
+        "giraffe_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_giraffe_5002_256.pkl"
+        "elephant_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_elephant_5002_256.pkl"
+        "cow_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cow_5002_256.pkl"
+        "bear_4936":
+          TYPE: vertex_feature
+          NUM_VERTICES: 4936
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_bear_4936_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_lvis_v1_ds2_train_v1"
+  TEST:
+    - "densepose_lvis_v1_ds2_val_v1"
+  WHITELISTED_CATEGORIES:
+    "densepose_lvis_v1_ds2_train_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_ds2_val_v1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+  CLASS_TO_MESH_NAME_MAPPING:
+    "0": "bear_4936"
+    "1": "cow_5002"
+    "2": "cat_7466"
+    "3": "dog_7466"
+    "4": "elephant_5002"
+    "5": "giraffe_5002"
+    "6": "horse_5004"
+    "7": "sheep_5004"
+    "8": "zebra_5002"
+SOLVER:
+  MAX_ITER: 24000
+  STEPS: (20000, 22000)
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_chimps_finetune_4k.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_chimps_finetune_4k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3bccb7837a2e4b905b4e3c7af465c3be3a44452d
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_chimps_finetune_4k.yaml
@@ -0,0 +1,29 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/cse/densepose_rcnn_R_50_FPN_soft_s1x/250533982/model_final_2c4512.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      GEODESIC_DIST_GAUSS_SIGMA: 0.1
+      EMBEDDERS:
+        "chimp_5029":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5029
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_chimp_5029_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_chimps_cse_train"
+  TEST:
+    - "densepose_chimps_cse_val"
+  CLASS_TO_MESH_NAME_MAPPING:
+    "0": "chimp_5029"
+SOLVER:
+  MAX_ITER: 4000
+  STEPS: (3000, 3500)
diff --git a/configs/cse/densepose_rcnn_R_50_FPN_soft_s1x.yaml b/configs/cse/densepose_rcnn_R_50_FPN_soft_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9662fb8f8a4e9f7b01f41ddb79a3469ecab7032b
--- /dev/null
+++ b/configs/cse/densepose_rcnn_R_50_FPN_soft_s1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-DensePose-RCNN-FPN-Human.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_101_FPN_DL_WC1M_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_DL_WC1M_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3c16763c532499c1a0c62fb8c81a2ab97be3a1ec
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_DL_WC1M_s1x.yaml
@@ -0,0 +1,18 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..15475b1ac3bb7272a7ebc0061a55119ffd2591b9
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_101_FPN_DL_WC2M_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_DL_WC2M_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0cbe07f3bb0027bb7ecdc86f96d60790382b477b
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_DL_WC2M_s1x.yaml
@@ -0,0 +1,18 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7546b967ab89129c9a276f19b1cf2d6b59f1a462
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..045f7f02f1b4eb0c0ef1733c3ac65e3aa70168de
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml
@@ -0,0 +1,10 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_101_FPN_WC1M_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_WC1M_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9334e18655d4451457a58c6ce945e01855f95105
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_WC1M_s1x.yaml
@@ -0,0 +1,18 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ace62094fbc4ce2024810333c11c7a955d8eeb22
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/densepose_rcnn_R_101_FPN_WC2M_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_WC2M_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..90f0be2805cd04e83c25d041d35ae66c90ce2b95
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_WC2M_s1x.yaml
@@ -0,0 +1,18 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..766c098f6dcdd1fb3f67957d7d1d982b37747b96
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/densepose_rcnn_R_101_FPN_s1x.yaml b/configs/densepose_rcnn_R_101_FPN_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..af44fb767edf9bf093463e62f93e070d0d019c5a
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_s1x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml b/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8e79a1b9549cf19ed4a43cf9caf3dc88f6133310
--- /dev/null
+++ b/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml
@@ -0,0 +1,17 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NUM_COARSE_SEGM_CHANNELS: 15
+    POOLER_RESOLUTION: 14
+    HEATMAP_SIZE: 56
+    INDEX_WEIGHTS: 2.0
+    PART_WEIGHTS: 0.3
+    POINT_REGRESSION_WEIGHTS: 0.1
+    DECODER_ON: False
+SOLVER:
+  BASE_LR: 0.002
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_50_FPN_DL_WC1M_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_DL_WC1M_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..18a417a9a76d388810d46d1ee738d8b19abf0db0
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_DL_WC1M_s1x.yaml
@@ -0,0 +1,18 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f3720eff56ce042a68da6c99f484b963cae2c7d9
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_50_FPN_DL_WC2M_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_DL_WC2M_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8a413d2a0d1549702fb45a2e50056fe0abde941f
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_DL_WC2M_s1x.yaml
@@ -0,0 +1,18 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5a47cc05e6e9dc882778c6b502d93cbcec88fb88
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..52a170b4a28289ad943314f77256e34800d23121
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml
@@ -0,0 +1,10 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_50_FPN_WC1M_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_WC1M_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8a81f2a143cbfcd2dbc92f0fc5c86f951b9b7adf
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_WC1M_s1x.yaml
@@ -0,0 +1,20 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: norm
+    CLIP_VALUE: 100.0
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d36e54256ac22f1b01604e54430da24972f06eeb
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/densepose_rcnn_R_50_FPN_WC2M_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_WC2M_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5cf29eacd57626c676ed4c960a3e97e552b6dbdf
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_WC2M_s1x.yaml
@@ -0,0 +1,18 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e880d469564a3757ba3f4d708054074cefda49b6
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/densepose_rcnn_R_50_FPN_s1x.yaml b/configs/densepose_rcnn_R_50_FPN_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d2dd14c6f92f3850b99e6f1c828c0fcee52120e1
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_s1x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml b/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6c5391f3b3c3d437312a290d29b0656cb3804b25
--- /dev/null
+++ b/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml
@@ -0,0 +1,17 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NUM_COARSE_SEGM_CHANNELS: 15
+    POOLER_RESOLUTION: 14
+    HEATMAP_SIZE: 56
+    INDEX_WEIGHTS: 2.0
+    PART_WEIGHTS: 0.3
+    POINT_REGRESSION_WEIGHTS: 0.1
+    DECODER_ON: False
+SOLVER:
+  BASE_LR: 0.002
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/configs/evolution/Base-RCNN-FPN-Atop10P_CA.yaml b/configs/evolution/Base-RCNN-FPN-Atop10P_CA.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f09d723f3cb9eef94223c5926dbb7731397304c9
--- /dev/null
+++ b/configs/evolution/Base-RCNN-FPN-Atop10P_CA.yaml
@@ -0,0 +1,91 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "StandardROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+  ROI_MASK_HEAD:
+    NAME: "MaskRCNNConvUpsampleHead"
+    NUM_CONV: 4
+    POOLER_RESOLUTION: 14
+DATASETS:
+  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
+  TEST: ("densepose_chimps",)
+  CATEGORY_MAPS:
+    "base_coco_2017_train":
+      "16": 1 # bird -> person
+      "17": 1 # cat -> person
+      "18": 1 # dog -> person
+      "19": 1 # horse -> person
+      "20": 1 # sheep -> person
+      "21": 1 # cow -> person
+      "22": 1 # elephant -> person
+      "23": 1 # bear -> person
+      "24": 1 # zebra -> person
+      "25": 1 # girafe -> person
+    "base_coco_2017_val":
+      "16": 1 # bird -> person
+      "17": 1 # cat -> person
+      "18": 1 # dog -> person
+      "19": 1 # horse -> person
+      "20": 1 # sheep -> person
+      "21": 1 # cow -> person
+      "22": 1 # elephant -> person
+      "23": 1 # bear -> person
+      "24": 1 # zebra -> person
+      "25": 1 # girafe -> person
+  WHITELISTED_CATEGORIES:
+    "base_coco_2017_train":
+      - 1  # person
+      - 16 # bird
+      - 17 # cat
+      - 18 # dog
+      - 19 # horse
+      - 20 # sheep
+      - 21 # cow
+      - 22 # elephant
+      - 23 # bear
+      - 24 # zebra
+      - 25 # girafe
+    "base_coco_2017_val":
+      - 1  # person
+      - 16 # bird
+      - 17 # cat
+      - 18 # dog
+      - 19 # horse
+      - 20 # sheep
+      - 21 # cow
+      - 22 # elephant
+      - 23 # bear
+      - 24 # zebra
+      - 25 # girafe
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
diff --git a/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA.yaml b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6296692d5ff15da24f87adb6327a62d9f4a34892
--- /dev/null
+++ b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA.yaml
@@ -0,0 +1,28 @@
+_BASE_: "Base-RCNN-FPN-Atop10P_CA.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  DENSEPOSE_ON: True
+  ROI_HEADS:
+    NAME: "DensePoseROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+    POOLER_TYPE: "ROIAlign"
+    NUM_COARSE_SEGM_CHANNELS: 2
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+    INDEX_WEIGHTS: 1.0
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  WARMUP_FACTOR: 0.025
+  MAX_ITER: 270000
+  STEPS: (210000, 250000)
diff --git a/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_coarsesegm.yaml b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_coarsesegm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..033918e0daec8c225306dafac3a5fe9923189e53
--- /dev/null
+++ b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_coarsesegm.yaml
@@ -0,0 +1,56 @@
+_BASE_: "Base-RCNN-FPN-Atop10P_CA.yaml"
+MODEL:
+  WEIGHTS: https://dl.fbaipublicfiles.com/densepose/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA/217578784/model_final_9fe1cc.pkl
+  RESNETS:
+    DEPTH: 50
+  DENSEPOSE_ON: True
+  ROI_HEADS:
+    NAME: "DensePoseROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+    POOLER_TYPE: "ROIAlign"
+    NUM_COARSE_SEGM_CHANNELS: 2
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+BOOTSTRAP_DATASETS:
+  - DATASET: "chimpnsee"
+    RATIO: 1.0
+    IMAGE_LOADER:
+      TYPE: "video_keyframe"
+      SELECT:
+        STRATEGY: "random_k"
+        NUM_IMAGES: 4
+      TRANSFORM:
+        TYPE: "resize"
+        MIN_SIZE: 800
+        MAX_SIZE: 1333
+      BATCH_SIZE: 8
+      NUM_WORKERS: 1
+    INFERENCE:
+      INPUT_BATCH_SIZE: 1
+      OUTPUT_BATCH_SIZE: 1
+    DATA_SAMPLER:
+      # supported types:
+      #   densepose_uniform
+      #   densepose_UV_confidence
+      #   densepose_fine_segm_confidence
+      #   densepose_coarse_segm_confidence
+      TYPE: "densepose_coarse_segm_confidence"
+      COUNT_PER_CLASS: 8
+    FILTER:
+      TYPE: "detection_score"
+      MIN_VALUE: 0.8
+BOOTSTRAP_MODEL:
+  WEIGHTS: https://dl.fbaipublicfiles.com/densepose/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA/217578784/model_final_9fe1cc.pkl
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 270000
+  STEPS: (210000, 250000)
diff --git a/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_finesegm.yaml b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_finesegm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5814a4a01fd772674fa40c0cba34666aed87b33a
--- /dev/null
+++ b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_finesegm.yaml
@@ -0,0 +1,56 @@
+_BASE_: "Base-RCNN-FPN-Atop10P_CA.yaml"
+MODEL:
+  WEIGHTS: https://dl.fbaipublicfiles.com/densepose/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA/217578784/model_final_9fe1cc.pkl
+  RESNETS:
+    DEPTH: 50
+  DENSEPOSE_ON: True
+  ROI_HEADS:
+    NAME: "DensePoseROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+    POOLER_TYPE: "ROIAlign"
+    NUM_COARSE_SEGM_CHANNELS: 2
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+BOOTSTRAP_DATASETS:
+  - DATASET: "chimpnsee"
+    RATIO: 1.0
+    IMAGE_LOADER:
+      TYPE: "video_keyframe"
+      SELECT:
+        STRATEGY: "random_k"
+        NUM_IMAGES: 4
+      TRANSFORM:
+        TYPE: "resize"
+        MIN_SIZE: 800
+        MAX_SIZE: 1333
+      BATCH_SIZE: 8
+      NUM_WORKERS: 1
+    INFERENCE:
+      INPUT_BATCH_SIZE: 1
+      OUTPUT_BATCH_SIZE: 1
+    DATA_SAMPLER:
+      # supported types:
+      #   densepose_uniform
+      #   densepose_UV_confidence
+      #   densepose_fine_segm_confidence
+      #   densepose_coarse_segm_confidence
+      TYPE: "densepose_fine_segm_confidence"
+      COUNT_PER_CLASS: 8
+    FILTER:
+      TYPE: "detection_score"
+      MIN_VALUE: 0.8
+BOOTSTRAP_MODEL:
+  WEIGHTS: https://dl.fbaipublicfiles.com/densepose/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA/217578784/model_final_9fe1cc.pkl
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 270000
+  STEPS: (210000, 250000)
diff --git a/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_uniform.yaml b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_uniform.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d591ea6e22282f43fff0b44131e0913aa7261276
--- /dev/null
+++ b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_uniform.yaml
@@ -0,0 +1,56 @@
+_BASE_: "Base-RCNN-FPN-Atop10P_CA.yaml"
+MODEL:
+  WEIGHTS: https://dl.fbaipublicfiles.com/densepose/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA/217578784/model_final_9fe1cc.pkl
+  RESNETS:
+    DEPTH: 50
+  DENSEPOSE_ON: True
+  ROI_HEADS:
+    NAME: "DensePoseROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+    POOLER_TYPE: "ROIAlign"
+    NUM_COARSE_SEGM_CHANNELS: 2
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+BOOTSTRAP_DATASETS:
+  - DATASET: "chimpnsee"
+    RATIO: 1.0
+    IMAGE_LOADER:
+      TYPE: "video_keyframe"
+      SELECT:
+        STRATEGY: "random_k"
+        NUM_IMAGES: 4
+      TRANSFORM:
+        TYPE: "resize"
+        MIN_SIZE: 800
+        MAX_SIZE: 1333
+      BATCH_SIZE: 8
+      NUM_WORKERS: 1
+    INFERENCE:
+      INPUT_BATCH_SIZE: 1
+      OUTPUT_BATCH_SIZE: 1
+    DATA_SAMPLER:
+      # supported types:
+      #   densepose_uniform
+      #   densepose_UV_confidence
+      #   densepose_fine_segm_confidence
+      #   densepose_coarse_segm_confidence
+      TYPE: "densepose_uniform"
+      COUNT_PER_CLASS: 8
+    FILTER:
+      TYPE: "detection_score"
+      MIN_VALUE: 0.8
+BOOTSTRAP_MODEL:
+  WEIGHTS: https://dl.fbaipublicfiles.com/densepose/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA/217578784/model_final_9fe1cc.pkl
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 270000
+  STEPS: (210000, 250000)
diff --git a/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_uv.yaml b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_uv.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..110acff5a54247abb7b344672038b71e24167f33
--- /dev/null
+++ b/configs/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA_B_uv.yaml
@@ -0,0 +1,56 @@
+_BASE_: "Base-RCNN-FPN-Atop10P_CA.yaml"
+MODEL:
+  WEIGHTS: https://dl.fbaipublicfiles.com/densepose/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA/217578784/model_final_9fe1cc.pkl
+  RESNETS:
+    DEPTH: 50
+  DENSEPOSE_ON: True
+  ROI_HEADS:
+    NAME: "DensePoseROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    SEGM_CONFIDENCE:
+      ENABLED: True
+    POINT_REGRESSION_WEIGHTS: 0.0005
+    POOLER_TYPE: "ROIAlign"
+    NUM_COARSE_SEGM_CHANNELS: 2
+    COARSE_SEGM_TRAINED_BY_MASKS: True
+BOOTSTRAP_DATASETS:
+  - DATASET: "chimpnsee"
+    RATIO: 1.0
+    IMAGE_LOADER:
+      TYPE: "video_keyframe"
+      SELECT:
+        STRATEGY: "random_k"
+        NUM_IMAGES: 4
+      TRANSFORM:
+        TYPE: "resize"
+        MIN_SIZE: 800
+        MAX_SIZE: 1333
+      BATCH_SIZE: 8
+      NUM_WORKERS: 1
+    INFERENCE:
+      INPUT_BATCH_SIZE: 1
+      OUTPUT_BATCH_SIZE: 1
+    DATA_SAMPLER:
+      # supported types:
+      #   densepose_uniform
+      #   densepose_UV_confidence
+      #   densepose_fine_segm_confidence
+      #   densepose_coarse_segm_confidence
+      TYPE: "densepose_UV_confidence"
+      COUNT_PER_CLASS: 8
+    FILTER:
+      TYPE: "detection_score"
+      MIN_VALUE: 0.8
+BOOTSTRAP_MODEL:
+  WEIGHTS: https://dl.fbaipublicfiles.com/densepose/evolution/densepose_R_50_FPN_DL_WC1M_3x_Atop10P_CA/217578784/model_final_9fe1cc.pkl
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 270000
+  STEPS: (210000, 250000)
diff --git a/configs/quick_schedules/cse/densepose_rcnn_R_50_FPN_DL_instant_test.yaml b/configs/quick_schedules/cse/densepose_rcnn_R_50_FPN_DL_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3b43f75da549a9e5148c8528b5d375317680d738
--- /dev/null
+++ b/configs/quick_schedules/cse/densepose_rcnn_R_50_FPN_DL_instant_test.yaml
@@ -0,0 +1,11 @@
+_BASE_: "../../cse/Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100_cse",)
+  TEST: ("densepose_coco_2014_minival_100_cse",)
+SOLVER:
+  MAX_ITER: 40
+  STEPS: (30,)
diff --git a/configs/quick_schedules/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_instant_test.yaml b/configs/quick_schedules/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a2c49a2d14e5665af117972d126e25422e37b2b9
--- /dev/null
+++ b/configs/quick_schedules/cse/densepose_rcnn_R_50_FPN_soft_animals_finetune_instant_test.yaml
@@ -0,0 +1,126 @@
+_BASE_: "../../cse/Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 9
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    CSE:
+      EMBED_LOSS_NAME: "SoftEmbeddingLoss"
+      EMBEDDING_DIST_GAUSS_SIGMA: 0.1
+      EMBEDDERS:
+        "cat_5001":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5001
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cat_5001_256.pkl"
+        "dog_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_dog_5002_256.pkl"
+        "sheep_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_sheep_5004_256.pkl"
+        "horse_5004":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5004
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_horse_5004_256.pkl"
+        "zebra_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_zebra_5002_256.pkl"
+        "giraffe_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_giraffe_5002_256.pkl"
+        "elephant_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_elephant_5002_256.pkl"
+        "cow_5002":
+          TYPE: vertex_feature
+          NUM_VERTICES: 5002
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_cow_5002_256.pkl"
+        "bear_4936":
+          TYPE: vertex_feature
+          NUM_VERTICES: 4936
+          FEATURE_DIM: 256
+          FEATURES_TRAINABLE: False
+          IS_TRAINABLE: True
+          INIT_FILE: "https://dl.fbaipublicfiles.com/densepose/data/cse/lbo/phi_bear_4936_256.pkl"
+DATASETS:
+  TRAIN:
+    - "densepose_lvis_v1_train1"
+    - "densepose_lvis_v1_train2"
+  TEST:
+    - "densepose_lvis_v1_val_animals_100"
+  WHITELISTED_CATEGORIES:
+    "densepose_lvis_v1_train1":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_train2":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+    "densepose_lvis_v1_val_animals_100":
+      - 943  # sheep
+      - 1202 # zebra
+      - 569  # horse
+      - 496  # giraffe
+      - 422  # elephant
+      - 80   # cow
+      - 76   # bear
+      - 225  # cat
+      - 378  # dog
+  CLASS_TO_MESH_NAME_MAPPING:
+    "0": "bear_4936"
+    "1": "cow_5002"
+    "2": "cat_5001"
+    "3": "dog_5002"
+    "4": "elephant_5002"
+    "5": "giraffe_5002"
+    "6": "horse_5004"
+    "7": "sheep_5004"
+    "8": "zebra_5002"
+SOLVER:
+  MAX_ITER: 40
+  STEPS: (30,)
diff --git a/configs/quick_schedules/densepose_rcnn_HRFPN_HRNet_w32_instant_test.yaml b/configs/quick_schedules/densepose_rcnn_HRFPN_HRNet_w32_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..95677ce9a7ff426a9051737876e7424908b1423f
--- /dev/null
+++ b/configs/quick_schedules/densepose_rcnn_HRFPN_HRNet_w32_instant_test.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../HRNet/densepose_rcnn_HRFPN_HRNet_w32_s1x.yaml"
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  MAX_ITER: 40
+  STEPS: (30,)
+  IMS_PER_BATCH: 2
diff --git a/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml b/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b90989eef81e27d23119d2cd4627e8cea211ac51
--- /dev/null
+++ b/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml
@@ -0,0 +1,11 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  MAX_ITER: 40
+  STEPS: (30,)
diff --git a/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml b/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b124da19140f564258b583ec109eeeeaff8fd78a
--- /dev/null
+++ b/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
+DATASETS:
+  TRAIN: ()
+  TEST: ("densepose_coco_2014_minival_100",)
+TEST:
+  AUG:
+    ENABLED: True
+    MIN_SIZES: (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
+    MAX_SIZE: 4000
+    FLIP: True
+  EXPECTED_RESULTS: [["bbox_TTA", "AP", 61.74, 0.03], ["densepose_gps_TTA", "AP",  60.22, 0.03], ["densepose_gpsm_TTA", "AP", 63.59, 0.03]]
diff --git a/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml b/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f0fe61151adf255baba717f3e65ff6fab52829a6
--- /dev/null
+++ b/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 40
+  STEPS: (30,)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml b/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f0d9358c8846452314697a19b5e2ea9e075ddaeb
--- /dev/null
+++ b/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 40 
+  STEPS: (30,)
+  WARMUP_FACTOR: 0.025
diff --git a/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d607c98813d045c1e19875bdfe45fbc1c3fdb292
--- /dev/null
+++ b/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
+DATASETS:
+  TRAIN: ()
+  TEST: ("densepose_coco_2014_minival_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 59.27, 0.025], ["densepose_gps", "AP",  60.11, 0.02], ["densepose_gpsm", "AP", 64.09, 0.02]]
diff --git a/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..057c8768186e8a818228aa2f028ba3007374c571
--- /dev/null
+++ b/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  MAX_ITER: 40
+  STEPS: (30,)
diff --git a/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml b/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0053c9d7d41af0ee7262804838d8edcde10ed40d
--- /dev/null
+++ b/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml
@@ -0,0 +1,18 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_HEADS:
+    NUM_CLASSES: 1
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival",)
+  TEST: ("densepose_coco_2014_minival",)
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: norm
+    CLIP_VALUE: 1.0
+  MAX_ITER: 6000
+  STEPS: (5500, 5800)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 76.2477, 1.0], ["densepose_gps", "AP", 79.6090, 1.5], ["densepose_gpsm", "AP", 80.0061, 1.5]]
+
diff --git a/densepose/__init__.py b/densepose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b50a3da91dd0d2a69502af9d5d62f2f4280d973f
--- /dev/null
+++ b/densepose/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .data.datasets import builtin  # just to register data
+from .converters import builtin as builtin_converters  # register converters
+from .config import (
+    add_densepose_config,
+    add_densepose_head_config,
+    add_hrnet_config,
+    add_dataset_category_config,
+    add_bootstrap_config,
+    load_bootstrap_config,
+)
+from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
+from .evaluation import DensePoseCOCOEvaluator
+from .modeling.roi_heads import DensePoseROIHeads
+from .modeling.test_time_augmentation import (
+    DensePoseGeneralizedRCNNWithTTA,
+    DensePoseDatasetMapperTTA,
+)
+from .utils.transform import load_from_cfg
+from .modeling.hrfpn import build_hrfpn_backbone
diff --git a/densepose/__pycache__/__init__.cpython-310.pyc b/densepose/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d516775b9dacd3a9788c6042b24a0df9f936d5f
Binary files /dev/null and b/densepose/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/__pycache__/__init__.cpython-39.pyc b/densepose/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53668ac9fd42a5153ac8b22398070540cdf2c843
Binary files /dev/null and b/densepose/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/__pycache__/config.cpython-310.pyc b/densepose/__pycache__/config.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62a3c65fe36542426872bdb088d88dcfeed25751
Binary files /dev/null and b/densepose/__pycache__/config.cpython-310.pyc differ
diff --git a/densepose/__pycache__/config.cpython-39.pyc b/densepose/__pycache__/config.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e1945a501b37e6138aa693f7007ec24c8deb5b3
Binary files /dev/null and b/densepose/__pycache__/config.cpython-39.pyc differ
diff --git a/densepose/config.py b/densepose/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a06a09c80865ab987773511b2acc71e232b26ac
--- /dev/null
+++ b/densepose/config.py
@@ -0,0 +1,277 @@
+# -*- coding = utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+# pyre-ignore-all-errors
+
+from detectron2.config import CfgNode as CN
+
+
+def add_dataset_category_config(cfg: CN) -> None:
+    """
+    Add config for additional category-related dataset options
+     - category whitelisting
+     - category mapping
+    """
+    _C = cfg
+    _C.DATASETS.CATEGORY_MAPS = CN(new_allowed=True)
+    _C.DATASETS.WHITELISTED_CATEGORIES = CN(new_allowed=True)
+    # class to mesh mapping
+    _C.DATASETS.CLASS_TO_MESH_NAME_MAPPING = CN(new_allowed=True)
+
+
+def add_evaluation_config(cfg: CN) -> None:
+    _C = cfg
+    _C.DENSEPOSE_EVALUATION = CN()
+    # evaluator type, possible values:
+    #  - "iou": evaluator for models that produce iou data
+    #  - "cse": evaluator for models that produce cse data
+    _C.DENSEPOSE_EVALUATION.TYPE = "iou"
+    # storage for DensePose results, possible values:
+    #  - "none": no explicit storage, all the results are stored in the
+    #            dictionary with predictions, memory intensive;
+    #            historically the default storage type
+    #  - "ram": RAM storage, uses per-process RAM storage, which is
+    #           reduced to a single process storage on later stages,
+    #           less memory intensive
+    #  - "file": file storage, uses per-process file-based storage,
+    #            the least memory intensive, but may create bottlenecks
+    #            on file system accesses
+    _C.DENSEPOSE_EVALUATION.STORAGE = "none"
+    # minimum threshold for IOU values: the lower its values is,
+    # the more matches are produced (and the higher the AP score)
+    _C.DENSEPOSE_EVALUATION.MIN_IOU_THRESHOLD = 0.5
+    # Non-distributed inference is slower (at inference time) but can avoid RAM OOM
+    _C.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE = True
+    # evaluate mesh alignment based on vertex embeddings, only makes sense in CSE context
+    _C.DENSEPOSE_EVALUATION.EVALUATE_MESH_ALIGNMENT = False
+    # meshes to compute mesh alignment for
+    _C.DENSEPOSE_EVALUATION.MESH_ALIGNMENT_MESH_NAMES = []
+
+
+def add_bootstrap_config(cfg: CN) -> None:
+    """ """
+    _C = cfg
+    _C.BOOTSTRAP_DATASETS = []
+    _C.BOOTSTRAP_MODEL = CN()
+    _C.BOOTSTRAP_MODEL.WEIGHTS = ""
+    _C.BOOTSTRAP_MODEL.DEVICE = "cuda"
+
+
+def get_bootstrap_dataset_config() -> CN:
+    _C = CN()
+    _C.DATASET = ""
+    # ratio used to mix data loaders
+    _C.RATIO = 0.1
+    # image loader
+    _C.IMAGE_LOADER = CN(new_allowed=True)
+    _C.IMAGE_LOADER.TYPE = ""
+    _C.IMAGE_LOADER.BATCH_SIZE = 4
+    _C.IMAGE_LOADER.NUM_WORKERS = 4
+    _C.IMAGE_LOADER.CATEGORIES = []
+    _C.IMAGE_LOADER.MAX_COUNT_PER_CATEGORY = 1_000_000
+    _C.IMAGE_LOADER.CATEGORY_TO_CLASS_MAPPING = CN(new_allowed=True)
+    # inference
+    _C.INFERENCE = CN()
+    # batch size for model inputs
+    _C.INFERENCE.INPUT_BATCH_SIZE = 4
+    # batch size to group model outputs
+    _C.INFERENCE.OUTPUT_BATCH_SIZE = 2
+    # sampled data
+    _C.DATA_SAMPLER = CN(new_allowed=True)
+    _C.DATA_SAMPLER.TYPE = ""
+    _C.DATA_SAMPLER.USE_GROUND_TRUTH_CATEGORIES = False
+    # filter
+    _C.FILTER = CN(new_allowed=True)
+    _C.FILTER.TYPE = ""
+    return _C
+
+
+def load_bootstrap_config(cfg: CN) -> None:
+    """
+    Bootstrap datasets are given as a list of `dict` that are not automatically
+    converted into CfgNode. This method processes all bootstrap dataset entries
+    and ensures that they are in CfgNode format and comply with the specification
+    """
+    if not cfg.BOOTSTRAP_DATASETS:
+        return
+
+    bootstrap_datasets_cfgnodes = []
+    for dataset_cfg in cfg.BOOTSTRAP_DATASETS:
+        _C = get_bootstrap_dataset_config().clone()
+        _C.merge_from_other_cfg(CN(dataset_cfg))
+        bootstrap_datasets_cfgnodes.append(_C)
+    cfg.BOOTSTRAP_DATASETS = bootstrap_datasets_cfgnodes
+
+
+def add_densepose_head_cse_config(cfg: CN) -> None:
+    """
+    Add configuration options for Continuous Surface Embeddings (CSE)
+    """
+    _C = cfg
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE = CN()
+    # Dimensionality D of the embedding space
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE = 16
+    # Embedder specifications for various mesh IDs
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS = CN(new_allowed=True)
+    # normalization coefficient for embedding distances
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_DIST_GAUSS_SIGMA = 0.01
+    # normalization coefficient for geodesic distances
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.GEODESIC_DIST_GAUSS_SIGMA = 0.01
+    # embedding loss weight
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_WEIGHT = 0.6
+    # embedding loss name, currently the following options are supported:
+    # - EmbeddingLoss: cross-entropy on vertex labels
+    # - SoftEmbeddingLoss: cross-entropy on vertex label combined with
+    #    Gaussian penalty on distance between vertices
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_NAME = "EmbeddingLoss"
+    # optimizer hyperparameters
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.FEATURES_LR_FACTOR = 1.0
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_LR_FACTOR = 1.0
+    # Shape to shape cycle consistency loss parameters:
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS = CN({"ENABLED": False})
+    # shape to shape cycle consistency loss weight
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.WEIGHT = 0.025
+    # norm type used for loss computation
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.NORM_P = 2
+    # normalization term for embedding similarity matrices
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.TEMPERATURE = 0.05
+    # maximum number of vertices to include into shape to shape cycle loss
+    # if negative or zero, all vertices are considered
+    # if positive, random subset of vertices of given size is considered
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.MAX_NUM_VERTICES = 4936
+    # Pixel to shape cycle consistency loss parameters:
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS = CN({"ENABLED": False})
+    # pixel to shape cycle consistency loss weight
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.WEIGHT = 0.0001
+    # norm type used for loss computation
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NORM_P = 2
+    # map images to all meshes and back (if false, use only gt meshes from the batch)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.USE_ALL_MESHES_NOT_GT_ONLY = False
+    # Randomly select at most this number of pixels from every instance
+    # if negative or zero, all vertices are considered
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NUM_PIXELS_TO_SAMPLE = 100
+    # normalization factor for pixel to pixel distances (higher value = smoother distribution)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.PIXEL_SIGMA = 5.0
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_PIXEL_TO_VERTEX = 0.05
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_VERTEX_TO_PIXEL = 0.05
+
+
+def add_densepose_head_config(cfg: CN) -> None:
+    """
+    Add config for densepose head.
+    """
+    _C = cfg
+
+    _C.MODEL.DENSEPOSE_ON = True
+
+    _C.MODEL.ROI_DENSEPOSE_HEAD = CN()
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NAME = ""
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8
+    # Number of parts used for point labels
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3
+    _C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2
+    _C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 112
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2"
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 28
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2  # 15 or 2
+    # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7
+    # Loss weights for annotation masks.(14 Parts)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 5.0
+    # Loss weights for surface parts. (24 Parts)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 1.0
+    # Loss weights for UV regression.
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.01
+    # Coarse segmentation is trained using instance segmentation task data
+    _C.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS = False
+    # For Decoder
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = ""
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4
+    # For DeepLab head
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB = CN()
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM = "GN"
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON = 0
+    # Predictor class name, must be registered in DENSEPOSE_PREDICTOR_REGISTRY
+    # Some registered predictors:
+    #   "DensePoseChartPredictor": predicts segmentation and UV coordinates for predefined charts
+    #   "DensePoseChartWithConfidencePredictor": predicts segmentation, UV coordinates
+    #       and associated confidences for predefined charts (default)
+    #   "DensePoseEmbeddingWithConfidencePredictor": predicts segmentation, embeddings
+    #       and associated confidences for CSE
+    _C.MODEL.ROI_DENSEPOSE_HEAD.PREDICTOR_NAME = "DensePoseChartWithConfidencePredictor"
+    # Loss class name, must be registered in DENSEPOSE_LOSS_REGISTRY
+    # Some registered losses:
+    #   "DensePoseChartLoss": loss for chart-based models that estimate
+    #      segmentation and UV coordinates
+    #   "DensePoseChartWithConfidenceLoss": loss for chart-based models that estimate
+    #      segmentation, UV coordinates and the corresponding confidences (default)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.LOSS_NAME = "DensePoseChartWithConfidenceLoss"
+    # Confidences
+    # Enable learning UV confidences (variances) along with the actual values
+    _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE = CN({"ENABLED": False})
+    # UV confidence lower bound
+    _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON = 0.01
+    # Enable learning segmentation confidences (variances) along with the actual values
+    _C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE = CN({"ENABLED": False})
+    # Segmentation confidence lower bound
+    _C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.EPSILON = 0.01
+    # Statistical model type for confidence learning, possible values:
+    # - "iid_iso": statistically independent identically distributed residuals
+    #    with isotropic covariance
+    # - "indep_aniso": statistically independent residuals with anisotropic
+    #    covariances
+    _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE = "iid_iso"
+    # List of angles for rotation in data augmentation during training
+    _C.INPUT.ROTATION_ANGLES = [0]
+    _C.TEST.AUG.ROTATION_ANGLES = ()  # Rotation TTA
+
+    add_densepose_head_cse_config(cfg)
+
+
+def add_hrnet_config(cfg: CN) -> None:
+    """
+    Add config for HRNet backbone.
+    """
+    _C = cfg
+
+    # For HigherHRNet w32
+    _C.MODEL.HRNET = CN()
+    _C.MODEL.HRNET.STEM_INPLANES = 64
+    _C.MODEL.HRNET.STAGE2 = CN()
+    _C.MODEL.HRNET.STAGE2.NUM_MODULES = 1
+    _C.MODEL.HRNET.STAGE2.NUM_BRANCHES = 2
+    _C.MODEL.HRNET.STAGE2.BLOCK = "BASIC"
+    _C.MODEL.HRNET.STAGE2.NUM_BLOCKS = [4, 4]
+    _C.MODEL.HRNET.STAGE2.NUM_CHANNELS = [32, 64]
+    _C.MODEL.HRNET.STAGE2.FUSE_METHOD = "SUM"
+    _C.MODEL.HRNET.STAGE3 = CN()
+    _C.MODEL.HRNET.STAGE3.NUM_MODULES = 4
+    _C.MODEL.HRNET.STAGE3.NUM_BRANCHES = 3
+    _C.MODEL.HRNET.STAGE3.BLOCK = "BASIC"
+    _C.MODEL.HRNET.STAGE3.NUM_BLOCKS = [4, 4, 4]
+    _C.MODEL.HRNET.STAGE3.NUM_CHANNELS = [32, 64, 128]
+    _C.MODEL.HRNET.STAGE3.FUSE_METHOD = "SUM"
+    _C.MODEL.HRNET.STAGE4 = CN()
+    _C.MODEL.HRNET.STAGE4.NUM_MODULES = 3
+    _C.MODEL.HRNET.STAGE4.NUM_BRANCHES = 4
+    _C.MODEL.HRNET.STAGE4.BLOCK = "BASIC"
+    _C.MODEL.HRNET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
+    _C.MODEL.HRNET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
+    _C.MODEL.HRNET.STAGE4.FUSE_METHOD = "SUM"
+
+    _C.MODEL.HRNET.HRFPN = CN()
+    _C.MODEL.HRNET.HRFPN.OUT_CHANNELS = 256
+
+
+def add_densepose_config(cfg: CN) -> None:
+    add_densepose_head_config(cfg)
+    add_hrnet_config(cfg)
+    add_bootstrap_config(cfg)
+    add_dataset_category_config(cfg)
+    add_evaluation_config(cfg)
diff --git a/densepose/converters/__init__.py b/densepose/converters/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..930339e13f408ad46d0504fac557ef8cf0a57a56
--- /dev/null
+++ b/densepose/converters/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .hflip import HFlipConverter
+from .to_mask import ToMaskConverter
+from .to_chart_result import ToChartResultConverter, ToChartResultConverterWithConfidences
+from .segm_to_mask import (
+    predictor_output_with_fine_and_coarse_segm_to_mask,
+    predictor_output_with_coarse_segm_to_mask,
+    resample_fine_and_coarse_segm_to_bbox,
+)
+from .chart_output_to_chart_result import (
+    densepose_chart_predictor_output_to_result,
+    densepose_chart_predictor_output_to_result_with_confidences,
+)
+from .chart_output_hflip import densepose_chart_predictor_output_hflip
diff --git a/densepose/converters/__pycache__/__init__.cpython-310.pyc b/densepose/converters/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d51493d45c6fdb6a8eb86cb6b1dfa32149eab63f
Binary files /dev/null and b/densepose/converters/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/converters/__pycache__/__init__.cpython-39.pyc b/densepose/converters/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b397079c9271a1fb18c6d891cb9a909de21722e8
Binary files /dev/null and b/densepose/converters/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/base.cpython-310.pyc b/densepose/converters/__pycache__/base.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d48fb70a00a420822e22fdcf1dd5ad926e11a92
Binary files /dev/null and b/densepose/converters/__pycache__/base.cpython-310.pyc differ
diff --git a/densepose/converters/__pycache__/base.cpython-39.pyc b/densepose/converters/__pycache__/base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0cfe39944da98c398a82d2e5056aa7333118cb3f
Binary files /dev/null and b/densepose/converters/__pycache__/base.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/builtin.cpython-310.pyc b/densepose/converters/__pycache__/builtin.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd07e31cd50859f1cf05ee5f82a59439cdd6127f
Binary files /dev/null and b/densepose/converters/__pycache__/builtin.cpython-310.pyc differ
diff --git a/densepose/converters/__pycache__/builtin.cpython-39.pyc b/densepose/converters/__pycache__/builtin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29ca8c18f001f1482f6e134f18cc499156909d8f
Binary files /dev/null and b/densepose/converters/__pycache__/builtin.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/chart_output_hflip.cpython-310.pyc b/densepose/converters/__pycache__/chart_output_hflip.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..46d3030a40d3623ed7543ea6f9571cd8d6ad44de
Binary files /dev/null and b/densepose/converters/__pycache__/chart_output_hflip.cpython-310.pyc differ
diff --git a/densepose/converters/__pycache__/chart_output_hflip.cpython-39.pyc b/densepose/converters/__pycache__/chart_output_hflip.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b10ee3fbbe2045135510dd5361fc65ab396ec27
Binary files /dev/null and b/densepose/converters/__pycache__/chart_output_hflip.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/chart_output_to_chart_result.cpython-310.pyc b/densepose/converters/__pycache__/chart_output_to_chart_result.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0ae9e1170f765c132e0718773e71684058f545e
Binary files /dev/null and b/densepose/converters/__pycache__/chart_output_to_chart_result.cpython-310.pyc differ
diff --git a/densepose/converters/__pycache__/chart_output_to_chart_result.cpython-39.pyc b/densepose/converters/__pycache__/chart_output_to_chart_result.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2a89edca68b6e3f9f060711cc41c6862f8816cc
Binary files /dev/null and b/densepose/converters/__pycache__/chart_output_to_chart_result.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/hflip.cpython-310.pyc b/densepose/converters/__pycache__/hflip.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50b542803ec84068b3fbe259d5e24a3715f4583a
Binary files /dev/null and b/densepose/converters/__pycache__/hflip.cpython-310.pyc differ
diff --git a/densepose/converters/__pycache__/hflip.cpython-39.pyc b/densepose/converters/__pycache__/hflip.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4dfa9764962964dfd9a082cd4bc111d11a996f5d
Binary files /dev/null and b/densepose/converters/__pycache__/hflip.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/segm_to_mask.cpython-310.pyc b/densepose/converters/__pycache__/segm_to_mask.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01c63f5a8114c0e066b13df424bded8b0cd31a53
Binary files /dev/null and b/densepose/converters/__pycache__/segm_to_mask.cpython-310.pyc differ
diff --git a/densepose/converters/__pycache__/segm_to_mask.cpython-39.pyc b/densepose/converters/__pycache__/segm_to_mask.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c109ddaef4474185ef2a8f6ceb5382d5cf1564de
Binary files /dev/null and b/densepose/converters/__pycache__/segm_to_mask.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/to_chart_result.cpython-310.pyc b/densepose/converters/__pycache__/to_chart_result.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b75c46f8b88c876f1db9010ec0b0ee9a6df095bc
Binary files /dev/null and b/densepose/converters/__pycache__/to_chart_result.cpython-310.pyc differ
diff --git a/densepose/converters/__pycache__/to_chart_result.cpython-39.pyc b/densepose/converters/__pycache__/to_chart_result.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb47e393e3eda25f25b2ac4b31ca1b297f7b2ebf
Binary files /dev/null and b/densepose/converters/__pycache__/to_chart_result.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/to_mask.cpython-310.pyc b/densepose/converters/__pycache__/to_mask.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49a8a3d9ccddde5c9e406bcdd7c376645c7d41d6
Binary files /dev/null and b/densepose/converters/__pycache__/to_mask.cpython-310.pyc differ
diff --git a/densepose/converters/__pycache__/to_mask.cpython-39.pyc b/densepose/converters/__pycache__/to_mask.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43ffab372f616e4178a0eb91ca0636ff50b98026
Binary files /dev/null and b/densepose/converters/__pycache__/to_mask.cpython-39.pyc differ
diff --git a/densepose/converters/base.py b/densepose/converters/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9dbe56cecff6dbbc1a1fda5a89c5f917513dcd8
--- /dev/null
+++ b/densepose/converters/base.py
@@ -0,0 +1,93 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any, Tuple, Type
+import torch
+
+
+class BaseConverter:
+    """
+    Converter base class to be reused by various converters.
+    Converter allows one to convert data from various source types to a particular
+    destination type. Each source type needs to register its converter. The
+    registration for each source type is valid for all descendants of that type.
+    """
+
+    @classmethod
+    def register(cls, from_type: Type, converter: Any = None):
+        """
+        Registers a converter for the specified type.
+        Can be used as a decorator (if converter is None), or called as a method.
+
+        Args:
+            from_type (type): type to register the converter for;
+                all instances of this type will use the same converter
+            converter (callable): converter to be registered for the given
+                type; if None, this method is assumed to be a decorator for the converter
+        """
+
+        if converter is not None:
+            cls._do_register(from_type, converter)
+
+        def wrapper(converter: Any) -> Any:
+            cls._do_register(from_type, converter)
+            return converter
+
+        return wrapper
+
+    @classmethod
+    def _do_register(cls, from_type: Type, converter: Any):
+        cls.registry[from_type] = converter  # pyre-ignore[16]
+
+    @classmethod
+    def _lookup_converter(cls, from_type: Type) -> Any:
+        """
+        Perform recursive lookup for the given type
+        to find registered converter. If a converter was found for some base
+        class, it gets registered for this class to save on further lookups.
+
+        Args:
+            from_type: type for which to find a converter
+        Return:
+            callable or None - registered converter or None
+                if no suitable entry was found in the registry
+        """
+        if from_type in cls.registry:  # pyre-ignore[16]
+            return cls.registry[from_type]
+        for base in from_type.__bases__:
+            converter = cls._lookup_converter(base)
+            if converter is not None:
+                cls._do_register(from_type, converter)
+                return converter
+        return None
+
+    @classmethod
+    def convert(cls, instance: Any, *args, **kwargs):
+        """
+        Convert an instance to the destination type using some registered
+        converter. Does recursive lookup for base classes, so there's no need
+        for explicit registration for derived classes.
+
+        Args:
+            instance: source instance to convert to the destination type
+        Return:
+            An instance of the destination type obtained from the source instance
+            Raises KeyError, if no suitable converter found
+        """
+        instance_type = type(instance)
+        converter = cls._lookup_converter(instance_type)
+        if converter is None:
+            if cls.dst_type is None:  # pyre-ignore[16]
+                output_type_str = "itself"
+            else:
+                output_type_str = cls.dst_type
+            raise KeyError(f"Could not find converter from {instance_type} to {output_type_str}")
+        return converter(instance, *args, **kwargs)
+
+
+IntTupleBox = Tuple[int, int, int, int]
+
+
+def make_int_box(box: torch.Tensor) -> IntTupleBox:
+    int_box = [0, 0, 0, 0]
+    int_box[0], int_box[1], int_box[2], int_box[3] = tuple(box.long().tolist())
+    return int_box[0], int_box[1], int_box[2], int_box[3]
diff --git a/densepose/converters/builtin.py b/densepose/converters/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bd48f8f7afc49cf38bf410f01bc673d446f37d7
--- /dev/null
+++ b/densepose/converters/builtin.py
@@ -0,0 +1,31 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from ..structures import DensePoseChartPredictorOutput, DensePoseEmbeddingPredictorOutput
+from . import (
+    HFlipConverter,
+    ToChartResultConverter,
+    ToChartResultConverterWithConfidences,
+    ToMaskConverter,
+    densepose_chart_predictor_output_hflip,
+    densepose_chart_predictor_output_to_result,
+    densepose_chart_predictor_output_to_result_with_confidences,
+    predictor_output_with_coarse_segm_to_mask,
+    predictor_output_with_fine_and_coarse_segm_to_mask,
+)
+
+ToMaskConverter.register(
+    DensePoseChartPredictorOutput, predictor_output_with_fine_and_coarse_segm_to_mask
+)
+ToMaskConverter.register(
+    DensePoseEmbeddingPredictorOutput, predictor_output_with_coarse_segm_to_mask
+)
+
+ToChartResultConverter.register(
+    DensePoseChartPredictorOutput, densepose_chart_predictor_output_to_result
+)
+
+ToChartResultConverterWithConfidences.register(
+    DensePoseChartPredictorOutput, densepose_chart_predictor_output_to_result_with_confidences
+)
+
+HFlipConverter.register(DensePoseChartPredictorOutput, densepose_chart_predictor_output_hflip)
diff --git a/densepose/converters/chart_output_hflip.py b/densepose/converters/chart_output_hflip.py
new file mode 100644
index 0000000000000000000000000000000000000000..17d294841264c248cf7fa9e3d2d2b4efdbb9a5e8
--- /dev/null
+++ b/densepose/converters/chart_output_hflip.py
@@ -0,0 +1,71 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from dataclasses import fields
+import torch
+
+from densepose.structures import DensePoseChartPredictorOutput, DensePoseTransformData
+
+
+def densepose_chart_predictor_output_hflip(
+    densepose_predictor_output: DensePoseChartPredictorOutput,
+    transform_data: DensePoseTransformData,
+) -> DensePoseChartPredictorOutput:
+    """
+    Change  to take into account a Horizontal flip.
+    """
+    if len(densepose_predictor_output) > 0:
+
+        PredictorOutput = type(densepose_predictor_output)
+        output_dict = {}
+
+        for field in fields(densepose_predictor_output):
+            field_value = getattr(densepose_predictor_output, field.name)
+            # flip tensors
+            if isinstance(field_value, torch.Tensor):
+                setattr(densepose_predictor_output, field.name, torch.flip(field_value, [3]))
+
+        densepose_predictor_output = _flip_iuv_semantics_tensor(
+            densepose_predictor_output, transform_data
+        )
+        densepose_predictor_output = _flip_segm_semantics_tensor(
+            densepose_predictor_output, transform_data
+        )
+
+        for field in fields(densepose_predictor_output):
+            output_dict[field.name] = getattr(densepose_predictor_output, field.name)
+
+        return PredictorOutput(**output_dict)
+    else:
+        return densepose_predictor_output
+
+
+def _flip_iuv_semantics_tensor(
+    densepose_predictor_output: DensePoseChartPredictorOutput,
+    dp_transform_data: DensePoseTransformData,
+) -> DensePoseChartPredictorOutput:
+    point_label_symmetries = dp_transform_data.point_label_symmetries
+    uv_symmetries = dp_transform_data.uv_symmetries
+
+    N, C, H, W = densepose_predictor_output.u.shape
+    u_loc = (densepose_predictor_output.u[:, 1:, :, :].clamp(0, 1) * 255).long()
+    v_loc = (densepose_predictor_output.v[:, 1:, :, :].clamp(0, 1) * 255).long()
+    Iindex = torch.arange(C - 1, device=densepose_predictor_output.u.device)[
+        None, :, None, None
+    ].expand(N, C - 1, H, W)
+    densepose_predictor_output.u[:, 1:, :, :] = uv_symmetries["U_transforms"][Iindex, v_loc, u_loc]
+    densepose_predictor_output.v[:, 1:, :, :] = uv_symmetries["V_transforms"][Iindex, v_loc, u_loc]
+
+    for el in ["fine_segm", "u", "v"]:
+        densepose_predictor_output.__dict__[el] = densepose_predictor_output.__dict__[el][
+            :, point_label_symmetries, :, :
+        ]
+    return densepose_predictor_output
+
+
+def _flip_segm_semantics_tensor(
+    densepose_predictor_output: DensePoseChartPredictorOutput, dp_transform_data
+):
+    if densepose_predictor_output.coarse_segm.shape[1] > 2:
+        densepose_predictor_output.coarse_segm = densepose_predictor_output.coarse_segm[
+            :, dp_transform_data.mask_label_symmetries, :, :
+        ]
+    return densepose_predictor_output
diff --git a/densepose/converters/chart_output_to_chart_result.py b/densepose/converters/chart_output_to_chart_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..4248f6c91b641a4ad1d00d0316ee82d701f9152f
--- /dev/null
+++ b/densepose/converters/chart_output_to_chart_result.py
@@ -0,0 +1,188 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Dict
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures.boxes import Boxes, BoxMode
+
+from ..structures import (
+    DensePoseChartPredictorOutput,
+    DensePoseChartResult,
+    DensePoseChartResultWithConfidences,
+)
+from . import resample_fine_and_coarse_segm_to_bbox
+from .base import IntTupleBox, make_int_box
+
+
+def resample_uv_tensors_to_bbox(
+    u: torch.Tensor,
+    v: torch.Tensor,
+    labels: torch.Tensor,
+    box_xywh_abs: IntTupleBox,
+) -> torch.Tensor:
+    """
+    Resamples U and V coordinate estimates for the given bounding box
+
+    Args:
+        u (tensor [1, C, H, W] of float): U coordinates
+        v (tensor [1, C, H, W] of float): V coordinates
+        labels (tensor [H, W] of long): labels obtained by resampling segmentation
+            outputs for the given bounding box
+        box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs
+    Return:
+       Resampled U and V coordinates - a tensor [2, H, W] of float
+    """
+    x, y, w, h = box_xywh_abs
+    w = max(int(w), 1)
+    h = max(int(h), 1)
+    u_bbox = F.interpolate(u, (h, w), mode="bilinear", align_corners=False)
+    v_bbox = F.interpolate(v, (h, w), mode="bilinear", align_corners=False)
+    uv = torch.zeros([2, h, w], dtype=torch.float32, device=u.device)
+    for part_id in range(1, u_bbox.size(1)):
+        uv[0][labels == part_id] = u_bbox[0, part_id][labels == part_id]
+        uv[1][labels == part_id] = v_bbox[0, part_id][labels == part_id]
+    return uv
+
+
+def resample_uv_to_bbox(
+    predictor_output: DensePoseChartPredictorOutput,
+    labels: torch.Tensor,
+    box_xywh_abs: IntTupleBox,
+) -> torch.Tensor:
+    """
+    Resamples U and V coordinate estimates for the given bounding box
+
+    Args:
+        predictor_output (DensePoseChartPredictorOutput): DensePose predictor
+            output to be resampled
+        labels (tensor [H, W] of long): labels obtained by resampling segmentation
+            outputs for the given bounding box
+        box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs
+    Return:
+       Resampled U and V coordinates - a tensor [2, H, W] of float
+    """
+    return resample_uv_tensors_to_bbox(
+        predictor_output.u,
+        predictor_output.v,
+        labels,
+        box_xywh_abs,
+    )
+
+
+def densepose_chart_predictor_output_to_result(
+    predictor_output: DensePoseChartPredictorOutput, boxes: Boxes
+) -> DensePoseChartResult:
+    """
+    Convert densepose chart predictor outputs to results
+
+    Args:
+        predictor_output (DensePoseChartPredictorOutput): DensePose predictor
+            output to be converted to results, must contain only 1 output
+        boxes (Boxes): bounding box that corresponds to the predictor output,
+            must contain only 1 bounding box
+    Return:
+       DensePose chart-based result (DensePoseChartResult)
+    """
+    assert len(predictor_output) == 1 and len(boxes) == 1, (
+        f"Predictor output to result conversion can operate only single outputs"
+        f", got {len(predictor_output)} predictor outputs and {len(boxes)} boxes"
+    )
+
+    boxes_xyxy_abs = boxes.tensor.clone()
+    boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    box_xywh = make_int_box(boxes_xywh_abs[0])
+
+    labels = resample_fine_and_coarse_segm_to_bbox(predictor_output, box_xywh).squeeze(0)
+    uv = resample_uv_to_bbox(predictor_output, labels, box_xywh)
+    return DensePoseChartResult(labels=labels, uv=uv)
+
+
+def resample_confidences_to_bbox(
+    predictor_output: DensePoseChartPredictorOutput,
+    labels: torch.Tensor,
+    box_xywh_abs: IntTupleBox,
+) -> Dict[str, torch.Tensor]:
+    """
+    Resamples confidences for the given bounding box
+
+    Args:
+        predictor_output (DensePoseChartPredictorOutput): DensePose predictor
+            output to be resampled
+        labels (tensor [H, W] of long): labels obtained by resampling segmentation
+            outputs for the given bounding box
+        box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs
+    Return:
+       Resampled confidences - a dict of [H, W] tensors of float
+    """
+
+    x, y, w, h = box_xywh_abs
+    w = max(int(w), 1)
+    h = max(int(h), 1)
+
+    confidence_names = [
+        "sigma_1",
+        "sigma_2",
+        "kappa_u",
+        "kappa_v",
+        "fine_segm_confidence",
+        "coarse_segm_confidence",
+    ]
+    confidence_results = {key: None for key in confidence_names}
+    confidence_names = [
+        key for key in confidence_names if getattr(predictor_output, key) is not None
+    ]
+    confidence_base = torch.zeros([h, w], dtype=torch.float32, device=predictor_output.u.device)
+
+    # assign data from channels that correspond to the labels
+    for key in confidence_names:
+        resampled_confidence = F.interpolate(
+            getattr(predictor_output, key),
+            (h, w),
+            mode="bilinear",
+            align_corners=False,
+        )
+        result = confidence_base.clone()
+        for part_id in range(1, predictor_output.u.size(1)):
+            if resampled_confidence.size(1) != predictor_output.u.size(1):
+                # confidence is not part-based, don't try to fill it part by part
+                continue
+            result[labels == part_id] = resampled_confidence[0, part_id][labels == part_id]
+
+        if resampled_confidence.size(1) != predictor_output.u.size(1):
+            # confidence is not part-based, fill the data with the first channel
+            # (targeted for segmentation confidences that have only 1 channel)
+            result = resampled_confidence[0, 0]
+
+        confidence_results[key] = result
+
+    return confidence_results  # pyre-ignore[7]
+
+
+def densepose_chart_predictor_output_to_result_with_confidences(
+    predictor_output: DensePoseChartPredictorOutput, boxes: Boxes
+) -> DensePoseChartResultWithConfidences:
+    """
+    Convert densepose chart predictor outputs to results
+
+    Args:
+        predictor_output (DensePoseChartPredictorOutput): DensePose predictor
+            output with confidences to be converted to results, must contain only 1 output
+        boxes (Boxes): bounding box that corresponds to the predictor output,
+            must contain only 1 bounding box
+    Return:
+       DensePose chart-based result with confidences (DensePoseChartResultWithConfidences)
+    """
+    assert len(predictor_output) == 1 and len(boxes) == 1, (
+        f"Predictor output to result conversion can operate only single outputs"
+        f", got {len(predictor_output)} predictor outputs and {len(boxes)} boxes"
+    )
+
+    boxes_xyxy_abs = boxes.tensor.clone()
+    boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    box_xywh = make_int_box(boxes_xywh_abs[0])
+
+    labels = resample_fine_and_coarse_segm_to_bbox(predictor_output, box_xywh).squeeze(0)
+    uv = resample_uv_to_bbox(predictor_output, labels, box_xywh)
+    confidences = resample_confidences_to_bbox(predictor_output, labels, box_xywh)
+    return DensePoseChartResultWithConfidences(labels=labels, uv=uv, **confidences)
diff --git a/densepose/converters/hflip.py b/densepose/converters/hflip.py
new file mode 100644
index 0000000000000000000000000000000000000000..6df144280b2b84308acbb607e3313d0992faa68c
--- /dev/null
+++ b/densepose/converters/hflip.py
@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any
+
+from .base import BaseConverter
+
+
+class HFlipConverter(BaseConverter):
+    """
+    Converts various DensePose predictor outputs to DensePose results.
+    Each DensePose predictor output type has to register its convertion strategy.
+    """
+
+    registry = {}
+    dst_type = None
+
+    @classmethod
+    # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
+    #  inconsistently.
+    def convert(cls, predictor_outputs: Any, transform_data: Any, *args, **kwargs):
+        """
+        Performs an horizontal flip on DensePose predictor outputs.
+        Does recursive lookup for base classes, so there's no need
+        for explicit registration for derived classes.
+
+        Args:
+            predictor_outputs: DensePose predictor output to be converted to BitMasks
+            transform_data: Anything useful for the flip
+        Return:
+            An instance of the same type as predictor_outputs
+        """
+        return super(HFlipConverter, cls).convert(
+            predictor_outputs, transform_data, *args, **kwargs
+        )
diff --git a/densepose/converters/segm_to_mask.py b/densepose/converters/segm_to_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..6433d5dec75c3d6141252af144b61d8999077bb7
--- /dev/null
+++ b/densepose/converters/segm_to_mask.py
@@ -0,0 +1,150 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures import BitMasks, Boxes, BoxMode
+
+from .base import IntTupleBox, make_int_box
+from .to_mask import ImageSizeType
+
+
+def resample_coarse_segm_tensor_to_bbox(coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox):
+    """
+    Resample coarse segmentation tensor to the given
+    bounding box and derive labels for each pixel of the bounding box
+
+    Args:
+        coarse_segm: float tensor of shape [1, K, Hout, Wout]
+        box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
+            corner coordinates, width (W) and height (H)
+    Return:
+        Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
+    """
+    x, y, w, h = box_xywh_abs
+    w = max(int(w), 1)
+    h = max(int(h), 1)
+    labels = F.interpolate(coarse_segm, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
+    return labels
+
+
+def resample_fine_and_coarse_segm_tensors_to_bbox(
+    fine_segm: torch.Tensor, coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox
+):
+    """
+    Resample fine and coarse segmentation tensors to the given
+    bounding box and derive labels for each pixel of the bounding box
+
+    Args:
+        fine_segm: float tensor of shape [1, C, Hout, Wout]
+        coarse_segm: float tensor of shape [1, K, Hout, Wout]
+        box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
+            corner coordinates, width (W) and height (H)
+    Return:
+        Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
+    """
+    x, y, w, h = box_xywh_abs
+    w = max(int(w), 1)
+    h = max(int(h), 1)
+    # coarse segmentation
+    coarse_segm_bbox = F.interpolate(
+        coarse_segm,
+        (h, w),
+        mode="bilinear",
+        align_corners=False,
+    ).argmax(dim=1)
+    # combined coarse and fine segmentation
+    labels = (
+        F.interpolate(fine_segm, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
+        * (coarse_segm_bbox > 0).long()
+    )
+    return labels
+
+
+def resample_fine_and_coarse_segm_to_bbox(predictor_output: Any, box_xywh_abs: IntTupleBox):
+    """
+    Resample fine and coarse segmentation outputs from a predictor to the given
+    bounding box and derive labels for each pixel of the bounding box
+
+    Args:
+        predictor_output: DensePose predictor output that contains segmentation
+            results to be resampled
+        box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
+            corner coordinates, width (W) and height (H)
+    Return:
+        Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
+    """
+    return resample_fine_and_coarse_segm_tensors_to_bbox(
+        predictor_output.fine_segm,
+        predictor_output.coarse_segm,
+        box_xywh_abs,
+    )
+
+
+def predictor_output_with_coarse_segm_to_mask(
+    predictor_output: Any, boxes: Boxes, image_size_hw: ImageSizeType
+) -> BitMasks:
+    """
+    Convert predictor output with coarse and fine segmentation to a mask.
+    Assumes that predictor output has the following attributes:
+     - coarse_segm (tensor of size [N, D, H, W]): coarse segmentation
+         unnormalized scores for N instances; D is the number of coarse
+         segmentation labels, H and W is the resolution of the estimate
+
+    Args:
+        predictor_output: DensePose predictor output to be converted to mask
+        boxes (Boxes): bounding boxes that correspond to the DensePose
+            predictor outputs
+        image_size_hw (tuple [int, int]): image height Himg and width Wimg
+    Return:
+        BitMasks that contain a bool tensor of size [N, Himg, Wimg] with
+        a mask of the size of the image for each instance
+    """
+    H, W = image_size_hw
+    boxes_xyxy_abs = boxes.tensor.clone()
+    boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    N = len(boxes_xywh_abs)
+    masks = torch.zeros((N, H, W), dtype=torch.bool, device=boxes.tensor.device)
+    for i in range(len(boxes_xywh_abs)):
+        box_xywh = make_int_box(boxes_xywh_abs[i])
+        box_mask = resample_coarse_segm_tensor_to_bbox(predictor_output[i].coarse_segm, box_xywh)
+        x, y, w, h = box_xywh
+        masks[i, y : y + h, x : x + w] = box_mask
+
+    return BitMasks(masks)
+
+
+def predictor_output_with_fine_and_coarse_segm_to_mask(
+    predictor_output: Any, boxes: Boxes, image_size_hw: ImageSizeType
+) -> BitMasks:
+    """
+    Convert predictor output with coarse and fine segmentation to a mask.
+    Assumes that predictor output has the following attributes:
+     - coarse_segm (tensor of size [N, D, H, W]): coarse segmentation
+         unnormalized scores for N instances; D is the number of coarse
+         segmentation labels, H and W is the resolution of the estimate
+     - fine_segm (tensor of size [N, C, H, W]): fine segmentation
+         unnormalized scores for N instances; C is the number of fine
+         segmentation labels, H and W is the resolution of the estimate
+
+    Args:
+        predictor_output: DensePose predictor output to be converted to mask
+        boxes (Boxes): bounding boxes that correspond to the DensePose
+            predictor outputs
+        image_size_hw (tuple [int, int]): image height Himg and width Wimg
+    Return:
+        BitMasks that contain a bool tensor of size [N, Himg, Wimg] with
+        a mask of the size of the image for each instance
+    """
+    H, W = image_size_hw
+    boxes_xyxy_abs = boxes.tensor.clone()
+    boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    N = len(boxes_xywh_abs)
+    masks = torch.zeros((N, H, W), dtype=torch.bool, device=boxes.tensor.device)
+    for i in range(len(boxes_xywh_abs)):
+        box_xywh = make_int_box(boxes_xywh_abs[i])
+        labels_i = resample_fine_and_coarse_segm_to_bbox(predictor_output[i], box_xywh)
+        x, y, w, h = box_xywh
+        masks[i, y : y + h, x : x + w] = labels_i > 0
+    return BitMasks(masks)
diff --git a/densepose/converters/to_chart_result.py b/densepose/converters/to_chart_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..3eabd2614c285e8ea39d241b73f0d4b5762e6baa
--- /dev/null
+++ b/densepose/converters/to_chart_result.py
@@ -0,0 +1,70 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any
+
+from detectron2.structures import Boxes
+
+from ..structures import DensePoseChartResult, DensePoseChartResultWithConfidences
+from .base import BaseConverter
+
+
+class ToChartResultConverter(BaseConverter):
+    """
+    Converts various DensePose predictor outputs to DensePose results.
+    Each DensePose predictor output type has to register its convertion strategy.
+    """
+
+    registry = {}
+    dst_type = DensePoseChartResult
+
+    @classmethod
+    # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
+    #  inconsistently.
+    def convert(cls, predictor_outputs: Any, boxes: Boxes, *args, **kwargs) -> DensePoseChartResult:
+        """
+        Convert DensePose predictor outputs to DensePoseResult using some registered
+        converter. Does recursive lookup for base classes, so there's no need
+        for explicit registration for derived classes.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor output to be
+                converted to BitMasks
+            boxes (Boxes): bounding boxes that correspond to the DensePose
+                predictor outputs
+        Return:
+            An instance of DensePoseResult. If no suitable converter was found, raises KeyError
+        """
+        return super(ToChartResultConverter, cls).convert(predictor_outputs, boxes, *args, **kwargs)
+
+
+class ToChartResultConverterWithConfidences(BaseConverter):
+    """
+    Converts various DensePose predictor outputs to DensePose results.
+    Each DensePose predictor output type has to register its convertion strategy.
+    """
+
+    registry = {}
+    dst_type = DensePoseChartResultWithConfidences
+
+    @classmethod
+    # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
+    #  inconsistently.
+    def convert(
+        cls, predictor_outputs: Any, boxes: Boxes, *args, **kwargs
+    ) -> DensePoseChartResultWithConfidences:
+        """
+        Convert DensePose predictor outputs to DensePoseResult with confidences
+        using some registered converter. Does recursive lookup for base classes,
+        so there's no need for explicit registration for derived classes.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor output with confidences
+                to be converted to BitMasks
+            boxes (Boxes): bounding boxes that correspond to the DensePose
+                predictor outputs
+        Return:
+            An instance of DensePoseResult. If no suitable converter was found, raises KeyError
+        """
+        return super(ToChartResultConverterWithConfidences, cls).convert(
+            predictor_outputs, boxes, *args, **kwargs
+        )
diff --git a/densepose/converters/to_mask.py b/densepose/converters/to_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..a57fd71afc448a7d269a8a38c2014b14c8c5074f
--- /dev/null
+++ b/densepose/converters/to_mask.py
@@ -0,0 +1,49 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any, Tuple
+
+from detectron2.structures import BitMasks, Boxes
+
+from .base import BaseConverter
+
+ImageSizeType = Tuple[int, int]
+
+
+class ToMaskConverter(BaseConverter):
+    """
+    Converts various DensePose predictor outputs to masks
+    in bit mask format (see `BitMasks`). Each DensePose predictor output type
+    has to register its convertion strategy.
+    """
+
+    registry = {}
+    dst_type = BitMasks
+
+    @classmethod
+    # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
+    #  inconsistently.
+    def convert(
+        cls,
+        densepose_predictor_outputs: Any,
+        boxes: Boxes,
+        image_size_hw: ImageSizeType,
+        *args,
+        **kwargs
+    ) -> BitMasks:
+        """
+        Convert DensePose predictor outputs to BitMasks using some registered
+        converter. Does recursive lookup for base classes, so there's no need
+        for explicit registration for derived classes.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor output to be
+                converted to BitMasks
+            boxes (Boxes): bounding boxes that correspond to the DensePose
+                predictor outputs
+            image_size_hw (tuple [int, int]): image height and width
+        Return:
+            An instance of `BitMasks`. If no suitable converter was found, raises KeyError
+        """
+        return super(ToMaskConverter, cls).convert(
+            densepose_predictor_outputs, boxes, image_size_hw, *args, **kwargs
+        )
diff --git a/densepose/data/__init__.py b/densepose/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf21ba75306970fd6a44069b49107320a84182b8
--- /dev/null
+++ b/densepose/data/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .meshes import builtin
+from .build import (
+    build_detection_test_loader,
+    build_detection_train_loader,
+    build_combined_loader,
+    build_frame_selector,
+    build_inference_based_loaders,
+    has_inference_based_loaders,
+    BootstrapDatasetFactoryCatalog,
+)
+from .combined_loader import CombinedDataLoader
+from .dataset_mapper import DatasetMapper
+from .inference_based_loader import InferenceBasedLoader, ScoreBasedFilter
+from .image_list_dataset import ImageListDataset
+from .utils import is_relative_local_path, maybe_prepend_base_path
+
+# ensure the builtin datasets are registered
+from . import datasets
+
+# ensure the bootstrap datasets builders are registered
+from . import build
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/densepose/data/__pycache__/__init__.cpython-310.pyc b/densepose/data/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b37ddb00d02913b693f64dce9eaca902ce101664
Binary files /dev/null and b/densepose/data/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/data/__pycache__/__init__.cpython-39.pyc b/densepose/data/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed776ed4dec58c07e0b1c0650bc914fb9901e68a
Binary files /dev/null and b/densepose/data/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/build.cpython-310.pyc b/densepose/data/__pycache__/build.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22a3ee55fd8a40782169e733dccbef0a0c5af4bd
Binary files /dev/null and b/densepose/data/__pycache__/build.cpython-310.pyc differ
diff --git a/densepose/data/__pycache__/build.cpython-39.pyc b/densepose/data/__pycache__/build.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f11bc95f7b64fb701d42a1289e2d9c642817c22
Binary files /dev/null and b/densepose/data/__pycache__/build.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/combined_loader.cpython-310.pyc b/densepose/data/__pycache__/combined_loader.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bcca7cd057d7baeab82694bc05f15e4fdc0e6cec
Binary files /dev/null and b/densepose/data/__pycache__/combined_loader.cpython-310.pyc differ
diff --git a/densepose/data/__pycache__/combined_loader.cpython-39.pyc b/densepose/data/__pycache__/combined_loader.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d00b3d2a2fd19d486fdb1aa6ed1afb530fd3ab9
Binary files /dev/null and b/densepose/data/__pycache__/combined_loader.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/dataset_mapper.cpython-310.pyc b/densepose/data/__pycache__/dataset_mapper.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33fbd771f763fd646bc00c9fdbe081c98dd0dce7
Binary files /dev/null and b/densepose/data/__pycache__/dataset_mapper.cpython-310.pyc differ
diff --git a/densepose/data/__pycache__/dataset_mapper.cpython-39.pyc b/densepose/data/__pycache__/dataset_mapper.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c790b4f0d00560658d398990bb5b00cade5924f8
Binary files /dev/null and b/densepose/data/__pycache__/dataset_mapper.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/image_list_dataset.cpython-310.pyc b/densepose/data/__pycache__/image_list_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d924941c22322d9f559cf2f75a721ea55fafc44d
Binary files /dev/null and b/densepose/data/__pycache__/image_list_dataset.cpython-310.pyc differ
diff --git a/densepose/data/__pycache__/image_list_dataset.cpython-39.pyc b/densepose/data/__pycache__/image_list_dataset.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba57cd2eec88f3ae63759c92d13285926b3141e9
Binary files /dev/null and b/densepose/data/__pycache__/image_list_dataset.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/inference_based_loader.cpython-310.pyc b/densepose/data/__pycache__/inference_based_loader.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e0030b44dce03080366230416e32fe16e5ccc9b
Binary files /dev/null and b/densepose/data/__pycache__/inference_based_loader.cpython-310.pyc differ
diff --git a/densepose/data/__pycache__/inference_based_loader.cpython-39.pyc b/densepose/data/__pycache__/inference_based_loader.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b38482df39304d6b1f20edb67e62b786a399b40
Binary files /dev/null and b/densepose/data/__pycache__/inference_based_loader.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/utils.cpython-310.pyc b/densepose/data/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17080e02c7c545863e6274118cc3cd603eb31fd7
Binary files /dev/null and b/densepose/data/__pycache__/utils.cpython-310.pyc differ
diff --git a/densepose/data/__pycache__/utils.cpython-39.pyc b/densepose/data/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbdf74068a4cfc896bce23abec0f7a527a08ccc7
Binary files /dev/null and b/densepose/data/__pycache__/utils.cpython-39.pyc differ
diff --git a/densepose/data/build.py b/densepose/data/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..39edbd89d88b7f66e4952add5d23289c8e7b9348
--- /dev/null
+++ b/densepose/data/build.py
@@ -0,0 +1,736 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import itertools
+import logging
+import numpy as np
+from collections import UserDict, defaultdict
+from dataclasses import dataclass
+from typing import Any, Callable, Collection, Dict, Iterable, List, Optional, Sequence, Tuple
+import torch
+from torch.utils.data.dataset import Dataset
+
+from detectron2.config import CfgNode
+from detectron2.data.build import build_detection_test_loader as d2_build_detection_test_loader
+from detectron2.data.build import build_detection_train_loader as d2_build_detection_train_loader
+from detectron2.data.build import (
+    load_proposals_into_dataset,
+    print_instances_class_histogram,
+    trivial_batch_collator,
+    worker_init_reset_seed,
+)
+from detectron2.data.catalog import DatasetCatalog, Metadata, MetadataCatalog
+from detectron2.data.samplers import TrainingSampler
+from detectron2.utils.comm import get_world_size
+
+from densepose.config import get_bootstrap_dataset_config
+from densepose.modeling import build_densepose_embedder
+
+from .combined_loader import CombinedDataLoader, Loader
+from .dataset_mapper import DatasetMapper
+from .datasets.coco import DENSEPOSE_CSE_KEYS_WITHOUT_MASK, DENSEPOSE_IUV_KEYS_WITHOUT_MASK
+from .datasets.dataset_type import DatasetType
+from .inference_based_loader import InferenceBasedLoader, ScoreBasedFilter
+from .samplers import (
+    DensePoseConfidenceBasedSampler,
+    DensePoseCSEConfidenceBasedSampler,
+    DensePoseCSEUniformSampler,
+    DensePoseUniformSampler,
+    MaskFromDensePoseSampler,
+    PredictionToGroundTruthSampler,
+)
+from .transform import ImageResizeTransform
+from .utils import get_category_to_class_mapping, get_class_to_mesh_name_mapping
+from .video import (
+    FirstKFramesSelector,
+    FrameSelectionStrategy,
+    LastKFramesSelector,
+    RandomKFramesSelector,
+    VideoKeyframeDataset,
+    video_list_from_file,
+)
+
+__all__ = ["build_detection_train_loader", "build_detection_test_loader"]
+
+
+Instance = Dict[str, Any]
+InstancePredicate = Callable[[Instance], bool]
+
+
+def _compute_num_images_per_worker(cfg: CfgNode) -> int:
+    num_workers = get_world_size()
+    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
+    assert (
+        images_per_batch % num_workers == 0
+    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
+        images_per_batch, num_workers
+    )
+    assert (
+        images_per_batch >= num_workers
+    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
+        images_per_batch, num_workers
+    )
+    images_per_worker = images_per_batch // num_workers
+    return images_per_worker
+
+
+def _map_category_id_to_contiguous_id(dataset_name: str, dataset_dicts: Iterable[Instance]) -> None:
+    meta = MetadataCatalog.get(dataset_name)
+    for dataset_dict in dataset_dicts:
+        for ann in dataset_dict["annotations"]:
+            ann["category_id"] = meta.thing_dataset_id_to_contiguous_id[ann["category_id"]]
+
+
+@dataclass
+class _DatasetCategory:
+    """
+    Class representing category data in a dataset:
+     - id: category ID, as specified in the dataset annotations file
+     - name: category name, as specified in the dataset annotations file
+     - mapped_id: category ID after applying category maps (DATASETS.CATEGORY_MAPS config option)
+     - mapped_name: category name after applying category maps
+     - dataset_name: dataset in which the category is defined
+
+    For example, when training models in a class-agnostic manner, one could take LVIS 1.0
+    dataset and map the animal categories to the same category as human data from COCO:
+     id = 225
+     name = "cat"
+     mapped_id = 1
+     mapped_name = "person"
+     dataset_name = "lvis_v1_animals_dp_train"
+    """
+
+    id: int
+    name: str
+    mapped_id: int
+    mapped_name: str
+    dataset_name: str
+
+
+_MergedCategoriesT = Dict[int, List[_DatasetCategory]]
+
+
+def _add_category_id_to_contiguous_id_maps_to_metadata(
+    merged_categories: _MergedCategoriesT,
+) -> None:
+    merged_categories_per_dataset = {}
+    for contiguous_cat_id, cat_id in enumerate(sorted(merged_categories.keys())):
+        for cat in merged_categories[cat_id]:
+            if cat.dataset_name not in merged_categories_per_dataset:
+                merged_categories_per_dataset[cat.dataset_name] = defaultdict(list)
+            merged_categories_per_dataset[cat.dataset_name][cat_id].append(
+                (
+                    contiguous_cat_id,
+                    cat,
+                )
+            )
+
+    logger = logging.getLogger(__name__)
+    for dataset_name, merged_categories in merged_categories_per_dataset.items():
+        meta = MetadataCatalog.get(dataset_name)
+        if not hasattr(meta, "thing_classes"):
+            meta.thing_classes = []
+            meta.thing_dataset_id_to_contiguous_id = {}
+            meta.thing_dataset_id_to_merged_id = {}
+        else:
+            meta.thing_classes.clear()
+            meta.thing_dataset_id_to_contiguous_id.clear()
+            meta.thing_dataset_id_to_merged_id.clear()
+        logger.info(f"Dataset {dataset_name}: category ID to contiguous ID mapping:")
+        for _cat_id, categories in sorted(merged_categories.items()):
+            added_to_thing_classes = False
+            for contiguous_cat_id, cat in categories:
+                if not added_to_thing_classes:
+                    meta.thing_classes.append(cat.mapped_name)
+                    added_to_thing_classes = True
+                meta.thing_dataset_id_to_contiguous_id[cat.id] = contiguous_cat_id
+                meta.thing_dataset_id_to_merged_id[cat.id] = cat.mapped_id
+                logger.info(f"{cat.id} ({cat.name}) -> {contiguous_cat_id}")
+
+
+def _maybe_create_general_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+    def has_annotations(instance: Instance) -> bool:
+        return "annotations" in instance
+
+    def has_only_crowd_anotations(instance: Instance) -> bool:
+        for ann in instance["annotations"]:
+            if ann.get("is_crowd", 0) == 0:
+                return False
+        return True
+
+    def general_keep_instance_predicate(instance: Instance) -> bool:
+        return has_annotations(instance) and not has_only_crowd_anotations(instance)
+
+    if not cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS:
+        return None
+    return general_keep_instance_predicate
+
+
+def _maybe_create_keypoints_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+
+    min_num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
+
+    def has_sufficient_num_keypoints(instance: Instance) -> bool:
+        num_kpts = sum(
+            (np.array(ann["keypoints"][2::3]) > 0).sum()
+            for ann in instance["annotations"]
+            if "keypoints" in ann
+        )
+        return num_kpts >= min_num_keypoints
+
+    if cfg.MODEL.KEYPOINT_ON and (min_num_keypoints > 0):
+        return has_sufficient_num_keypoints
+    return None
+
+
+def _maybe_create_mask_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+    if not cfg.MODEL.MASK_ON:
+        return None
+
+    def has_mask_annotations(instance: Instance) -> bool:
+        return any("segmentation" in ann for ann in instance["annotations"])
+
+    return has_mask_annotations
+
+
+def _maybe_create_densepose_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+    if not cfg.MODEL.DENSEPOSE_ON:
+        return None
+
+    use_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+
+    def has_densepose_annotations(instance: Instance) -> bool:
+        for ann in instance["annotations"]:
+            if all(key in ann for key in DENSEPOSE_IUV_KEYS_WITHOUT_MASK) or all(
+                key in ann for key in DENSEPOSE_CSE_KEYS_WITHOUT_MASK
+            ):
+                return True
+            if use_masks and "segmentation" in ann:
+                return True
+        return False
+
+    return has_densepose_annotations
+
+
+def _maybe_create_specific_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+    specific_predicate_creators = [
+        _maybe_create_keypoints_keep_instance_predicate,
+        _maybe_create_mask_keep_instance_predicate,
+        _maybe_create_densepose_keep_instance_predicate,
+    ]
+    predicates = [creator(cfg) for creator in specific_predicate_creators]
+    predicates = [p for p in predicates if p is not None]
+    if not predicates:
+        return None
+
+    def combined_predicate(instance: Instance) -> bool:
+        return any(p(instance) for p in predicates)
+
+    return combined_predicate
+
+
+def _get_train_keep_instance_predicate(cfg: CfgNode):
+    general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
+    combined_specific_keep_predicate = _maybe_create_specific_keep_instance_predicate(cfg)
+
+    def combined_general_specific_keep_predicate(instance: Instance) -> bool:
+        return general_keep_predicate(instance) and combined_specific_keep_predicate(instance)
+
+    if (general_keep_predicate is None) and (combined_specific_keep_predicate is None):
+        return None
+    if general_keep_predicate is None:
+        return combined_specific_keep_predicate
+    if combined_specific_keep_predicate is None:
+        return general_keep_predicate
+    return combined_general_specific_keep_predicate
+
+
+def _get_test_keep_instance_predicate(cfg: CfgNode):
+    general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
+    return general_keep_predicate
+
+
+def _maybe_filter_and_map_categories(
+    dataset_name: str, dataset_dicts: List[Instance]
+) -> List[Instance]:
+    meta = MetadataCatalog.get(dataset_name)
+    category_id_map = meta.thing_dataset_id_to_contiguous_id
+    filtered_dataset_dicts = []
+    for dataset_dict in dataset_dicts:
+        anns = []
+        for ann in dataset_dict["annotations"]:
+            cat_id = ann["category_id"]
+            if cat_id not in category_id_map:
+                continue
+            ann["category_id"] = category_id_map[cat_id]
+            anns.append(ann)
+        dataset_dict["annotations"] = anns
+        filtered_dataset_dicts.append(dataset_dict)
+    return filtered_dataset_dicts
+
+
+def _add_category_whitelists_to_metadata(cfg: CfgNode) -> None:
+    for dataset_name, whitelisted_cat_ids in cfg.DATASETS.WHITELISTED_CATEGORIES.items():
+        meta = MetadataCatalog.get(dataset_name)
+        meta.whitelisted_categories = whitelisted_cat_ids
+        logger = logging.getLogger(__name__)
+        logger.info(
+            "Whitelisted categories for dataset {}: {}".format(
+                dataset_name, meta.whitelisted_categories
+            )
+        )
+
+
+def _add_category_maps_to_metadata(cfg: CfgNode) -> None:
+    for dataset_name, category_map in cfg.DATASETS.CATEGORY_MAPS.items():
+        category_map = {
+            int(cat_id_src): int(cat_id_dst) for cat_id_src, cat_id_dst in category_map.items()
+        }
+        meta = MetadataCatalog.get(dataset_name)
+        meta.category_map = category_map
+        logger = logging.getLogger(__name__)
+        logger.info("Category maps for dataset {}: {}".format(dataset_name, meta.category_map))
+
+
+def _add_category_info_to_bootstrapping_metadata(dataset_name: str, dataset_cfg: CfgNode) -> None:
+    meta = MetadataCatalog.get(dataset_name)
+    meta.category_to_class_mapping = get_category_to_class_mapping(dataset_cfg)
+    meta.categories = dataset_cfg.CATEGORIES
+    meta.max_count_per_category = dataset_cfg.MAX_COUNT_PER_CATEGORY
+    logger = logging.getLogger(__name__)
+    logger.info(
+        "Category to class mapping for dataset {}: {}".format(
+            dataset_name, meta.category_to_class_mapping
+        )
+    )
+
+
+def _maybe_add_class_to_mesh_name_map_to_metadata(dataset_names: List[str], cfg: CfgNode) -> None:
+    for dataset_name in dataset_names:
+        meta = MetadataCatalog.get(dataset_name)
+        if not hasattr(meta, "class_to_mesh_name"):
+            meta.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg)
+
+
+def _merge_categories(dataset_names: Collection[str]) -> _MergedCategoriesT:
+    merged_categories = defaultdict(list)
+    category_names = {}
+    for dataset_name in dataset_names:
+        meta = MetadataCatalog.get(dataset_name)
+        whitelisted_categories = meta.get("whitelisted_categories")
+        category_map = meta.get("category_map", {})
+        cat_ids = (
+            whitelisted_categories if whitelisted_categories is not None else meta.categories.keys()
+        )
+        for cat_id in cat_ids:
+            cat_name = meta.categories[cat_id]
+            cat_id_mapped = category_map.get(cat_id, cat_id)
+            if cat_id_mapped == cat_id or cat_id_mapped in cat_ids:
+                category_names[cat_id] = cat_name
+            else:
+                category_names[cat_id] = str(cat_id_mapped)
+            # assign temporary mapped category name, this name can be changed
+            # during the second pass, since mapped ID can correspond to a category
+            # from a different dataset
+            cat_name_mapped = meta.categories[cat_id_mapped]
+            merged_categories[cat_id_mapped].append(
+                _DatasetCategory(
+                    id=cat_id,
+                    name=cat_name,
+                    mapped_id=cat_id_mapped,
+                    mapped_name=cat_name_mapped,
+                    dataset_name=dataset_name,
+                )
+            )
+    # second pass to assign proper mapped category names
+    for cat_id, categories in merged_categories.items():
+        for cat in categories:
+            if cat_id in category_names and cat.mapped_name != category_names[cat_id]:
+                cat.mapped_name = category_names[cat_id]
+
+    return merged_categories
+
+
+def _warn_if_merged_different_categories(merged_categories: _MergedCategoriesT) -> None:
+    logger = logging.getLogger(__name__)
+    for cat_id in merged_categories:
+        merged_categories_i = merged_categories[cat_id]
+        first_cat_name = merged_categories_i[0].name
+        if len(merged_categories_i) > 1 and not all(
+            cat.name == first_cat_name for cat in merged_categories_i[1:]
+        ):
+            cat_summary_str = ", ".join(
+                [f"{cat.id} ({cat.name}) from {cat.dataset_name}" for cat in merged_categories_i]
+            )
+            logger.warning(
+                f"Merged category {cat_id} corresponds to the following categories: "
+                f"{cat_summary_str}"
+            )
+
+
+def combine_detection_dataset_dicts(
+    dataset_names: Collection[str],
+    keep_instance_predicate: Optional[InstancePredicate] = None,
+    proposal_files: Optional[Collection[str]] = None,
+) -> List[Instance]:
+    """
+    Load and prepare dataset dicts for training / testing
+
+    Args:
+        dataset_names (Collection[str]): a list of dataset names
+        keep_instance_predicate (Callable: Dict[str, Any] -> bool): predicate
+            applied to instance dicts which defines whether to keep the instance
+        proposal_files (Collection[str]): if given, a list of object proposal files
+            that match each dataset in `dataset_names`.
+    """
+    assert len(dataset_names)
+    if proposal_files is None:
+        proposal_files = [None] * len(dataset_names)
+    assert len(dataset_names) == len(proposal_files)
+    # load datasets and metadata
+    dataset_name_to_dicts = {}
+    for dataset_name in dataset_names:
+        dataset_name_to_dicts[dataset_name] = DatasetCatalog.get(dataset_name)
+        assert len(dataset_name_to_dicts), f"Dataset '{dataset_name}' is empty!"
+    # merge categories, requires category metadata to be loaded
+    # cat_id -> [(orig_cat_id, cat_name, dataset_name)]
+    merged_categories = _merge_categories(dataset_names)
+    _warn_if_merged_different_categories(merged_categories)
+    merged_category_names = [
+        merged_categories[cat_id][0].mapped_name for cat_id in sorted(merged_categories)
+    ]
+    # map to contiguous category IDs
+    _add_category_id_to_contiguous_id_maps_to_metadata(merged_categories)
+    # load annotations and dataset metadata
+    for dataset_name, proposal_file in zip(dataset_names, proposal_files):
+        dataset_dicts = dataset_name_to_dicts[dataset_name]
+        assert len(dataset_dicts), f"Dataset '{dataset_name}' is empty!"
+        if proposal_file is not None:
+            dataset_dicts = load_proposals_into_dataset(dataset_dicts, proposal_file)
+        dataset_dicts = _maybe_filter_and_map_categories(dataset_name, dataset_dicts)
+        print_instances_class_histogram(dataset_dicts, merged_category_names)
+        dataset_name_to_dicts[dataset_name] = dataset_dicts
+
+    if keep_instance_predicate is not None:
+        all_datasets_dicts_plain = [
+            d
+            for d in itertools.chain.from_iterable(dataset_name_to_dicts.values())
+            if keep_instance_predicate(d)
+        ]
+    else:
+        all_datasets_dicts_plain = list(
+            itertools.chain.from_iterable(dataset_name_to_dicts.values())
+        )
+    return all_datasets_dicts_plain
+
+
+def build_detection_train_loader(cfg: CfgNode, mapper=None):
+    """
+    A data loader is created in a way similar to that of Detectron2.
+    The main differences are:
+     - it allows to combine datasets with different but compatible object category sets
+
+    The data loader is created by the following steps:
+    1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
+    2. Start workers to work on the dicts. Each worker will:
+        * Map each metadata dict into another format to be consumed by the model.
+        * Batch them by simply putting dicts into a list.
+    The batched ``list[mapped_dict]`` is what this dataloader will return.
+
+    Args:
+        cfg (CfgNode): the config
+        mapper (callable): a callable which takes a sample (dict) from dataset and
+            returns the format to be consumed by the model.
+            By default it will be `DatasetMapper(cfg, True)`.
+
+    Returns:
+        an infinite iterator of training data
+    """
+
+    _add_category_whitelists_to_metadata(cfg)
+    _add_category_maps_to_metadata(cfg)
+    _maybe_add_class_to_mesh_name_map_to_metadata(cfg.DATASETS.TRAIN, cfg)
+    dataset_dicts = combine_detection_dataset_dicts(
+        cfg.DATASETS.TRAIN,
+        keep_instance_predicate=_get_train_keep_instance_predicate(cfg),
+        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
+    )
+    if mapper is None:
+        mapper = DatasetMapper(cfg, True)
+    return d2_build_detection_train_loader(cfg, dataset=dataset_dicts, mapper=mapper)
+
+
+def build_detection_test_loader(cfg, dataset_name, mapper=None):
+    """
+    Similar to `build_detection_train_loader`.
+    But this function uses the given `dataset_name` argument (instead of the names in cfg),
+    and uses batch size 1.
+
+    Args:
+        cfg: a detectron2 CfgNode
+        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
+        mapper (callable): a callable which takes a sample (dict) from dataset
+            and returns the format to be consumed by the model.
+            By default it will be `DatasetMapper(cfg, False)`.
+
+    Returns:
+        DataLoader: a torch DataLoader, that loads the given detection
+            dataset, with test-time transformation and batching.
+    """
+    _add_category_whitelists_to_metadata(cfg)
+    _add_category_maps_to_metadata(cfg)
+    _maybe_add_class_to_mesh_name_map_to_metadata([dataset_name], cfg)
+    dataset_dicts = combine_detection_dataset_dicts(
+        [dataset_name],
+        keep_instance_predicate=_get_test_keep_instance_predicate(cfg),
+        proposal_files=[
+            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)]
+        ]
+        if cfg.MODEL.LOAD_PROPOSALS
+        else None,
+    )
+    sampler = None
+    if not cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE:
+        sampler = torch.utils.data.SequentialSampler(dataset_dicts)
+    if mapper is None:
+        mapper = DatasetMapper(cfg, False)
+    return d2_build_detection_test_loader(
+        dataset_dicts, mapper=mapper, num_workers=cfg.DATALOADER.NUM_WORKERS, sampler=sampler
+    )
+
+
+def build_frame_selector(cfg: CfgNode):
+    strategy = FrameSelectionStrategy(cfg.STRATEGY)
+    if strategy == FrameSelectionStrategy.RANDOM_K:
+        frame_selector = RandomKFramesSelector(cfg.NUM_IMAGES)
+    elif strategy == FrameSelectionStrategy.FIRST_K:
+        frame_selector = FirstKFramesSelector(cfg.NUM_IMAGES)
+    elif strategy == FrameSelectionStrategy.LAST_K:
+        frame_selector = LastKFramesSelector(cfg.NUM_IMAGES)
+    elif strategy == FrameSelectionStrategy.ALL:
+        frame_selector = None
+    # pyre-fixme[61]: `frame_selector` may not be initialized here.
+    return frame_selector
+
+
+def build_transform(cfg: CfgNode, data_type: str):
+    if cfg.TYPE == "resize":
+        if data_type == "image":
+            return ImageResizeTransform(cfg.MIN_SIZE, cfg.MAX_SIZE)
+    raise ValueError(f"Unknown transform {cfg.TYPE} for data type {data_type}")
+
+
+def build_combined_loader(cfg: CfgNode, loaders: Collection[Loader], ratios: Sequence[float]):
+    images_per_worker = _compute_num_images_per_worker(cfg)
+    return CombinedDataLoader(loaders, images_per_worker, ratios)
+
+
+def build_bootstrap_dataset(dataset_name: str, cfg: CfgNode) -> Sequence[torch.Tensor]:
+    """
+    Build dataset that provides data to bootstrap on
+
+    Args:
+        dataset_name (str): Name of the dataset, needs to have associated metadata
+            to load the data
+        cfg (CfgNode): bootstrapping config
+    Returns:
+        Sequence[Tensor] - dataset that provides image batches, Tensors of size
+            [N, C, H, W] of type float32
+    """
+    logger = logging.getLogger(__name__)
+    _add_category_info_to_bootstrapping_metadata(dataset_name, cfg)
+    meta = MetadataCatalog.get(dataset_name)
+    factory = BootstrapDatasetFactoryCatalog.get(meta.dataset_type)
+    dataset = None
+    if factory is not None:
+        dataset = factory(meta, cfg)
+    if dataset is None:
+        logger.warning(f"Failed to create dataset {dataset_name} of type {meta.dataset_type}")
+    return dataset
+
+
+def build_data_sampler(cfg: CfgNode, sampler_cfg: CfgNode, embedder: Optional[torch.nn.Module]):
+    if sampler_cfg.TYPE == "densepose_uniform":
+        data_sampler = PredictionToGroundTruthSampler()
+        # transform densepose pred -> gt
+        data_sampler.register_sampler(
+            "pred_densepose",
+            "gt_densepose",
+            DensePoseUniformSampler(count_per_class=sampler_cfg.COUNT_PER_CLASS),
+        )
+        data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+        return data_sampler
+    elif sampler_cfg.TYPE == "densepose_UV_confidence":
+        data_sampler = PredictionToGroundTruthSampler()
+        # transform densepose pred -> gt
+        data_sampler.register_sampler(
+            "pred_densepose",
+            "gt_densepose",
+            DensePoseConfidenceBasedSampler(
+                confidence_channel="sigma_2",
+                count_per_class=sampler_cfg.COUNT_PER_CLASS,
+                search_proportion=0.5,
+            ),
+        )
+        data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+        return data_sampler
+    elif sampler_cfg.TYPE == "densepose_fine_segm_confidence":
+        data_sampler = PredictionToGroundTruthSampler()
+        # transform densepose pred -> gt
+        data_sampler.register_sampler(
+            "pred_densepose",
+            "gt_densepose",
+            DensePoseConfidenceBasedSampler(
+                confidence_channel="fine_segm_confidence",
+                count_per_class=sampler_cfg.COUNT_PER_CLASS,
+                search_proportion=0.5,
+            ),
+        )
+        data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+        return data_sampler
+    elif sampler_cfg.TYPE == "densepose_coarse_segm_confidence":
+        data_sampler = PredictionToGroundTruthSampler()
+        # transform densepose pred -> gt
+        data_sampler.register_sampler(
+            "pred_densepose",
+            "gt_densepose",
+            DensePoseConfidenceBasedSampler(
+                confidence_channel="coarse_segm_confidence",
+                count_per_class=sampler_cfg.COUNT_PER_CLASS,
+                search_proportion=0.5,
+            ),
+        )
+        data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+        return data_sampler
+    elif sampler_cfg.TYPE == "densepose_cse_uniform":
+        assert embedder is not None
+        data_sampler = PredictionToGroundTruthSampler()
+        # transform densepose pred -> gt
+        data_sampler.register_sampler(
+            "pred_densepose",
+            "gt_densepose",
+            DensePoseCSEUniformSampler(
+                cfg=cfg,
+                use_gt_categories=sampler_cfg.USE_GROUND_TRUTH_CATEGORIES,
+                embedder=embedder,
+                count_per_class=sampler_cfg.COUNT_PER_CLASS,
+            ),
+        )
+        data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+        return data_sampler
+    elif sampler_cfg.TYPE == "densepose_cse_coarse_segm_confidence":
+        assert embedder is not None
+        data_sampler = PredictionToGroundTruthSampler()
+        # transform densepose pred -> gt
+        data_sampler.register_sampler(
+            "pred_densepose",
+            "gt_densepose",
+            DensePoseCSEConfidenceBasedSampler(
+                cfg=cfg,
+                use_gt_categories=sampler_cfg.USE_GROUND_TRUTH_CATEGORIES,
+                embedder=embedder,
+                confidence_channel="coarse_segm_confidence",
+                count_per_class=sampler_cfg.COUNT_PER_CLASS,
+                search_proportion=0.5,
+            ),
+        )
+        data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+        return data_sampler
+
+    raise ValueError(f"Unknown data sampler type {sampler_cfg.TYPE}")
+
+
+def build_data_filter(cfg: CfgNode):
+    if cfg.TYPE == "detection_score":
+        min_score = cfg.MIN_VALUE
+        return ScoreBasedFilter(min_score=min_score)
+    raise ValueError(f"Unknown data filter type {cfg.TYPE}")
+
+
+def build_inference_based_loader(
+    cfg: CfgNode,
+    dataset_cfg: CfgNode,
+    model: torch.nn.Module,
+    embedder: Optional[torch.nn.Module] = None,
+) -> InferenceBasedLoader:
+    """
+    Constructs data loader based on inference results of a model.
+    """
+    dataset = build_bootstrap_dataset(dataset_cfg.DATASET, dataset_cfg.IMAGE_LOADER)
+    meta = MetadataCatalog.get(dataset_cfg.DATASET)
+    training_sampler = TrainingSampler(len(dataset))
+    data_loader = torch.utils.data.DataLoader(
+        dataset,  # pyre-ignore[6]
+        batch_size=dataset_cfg.IMAGE_LOADER.BATCH_SIZE,
+        sampler=training_sampler,
+        num_workers=dataset_cfg.IMAGE_LOADER.NUM_WORKERS,
+        collate_fn=trivial_batch_collator,
+        worker_init_fn=worker_init_reset_seed,
+    )
+    return InferenceBasedLoader(
+        model,
+        data_loader=data_loader,
+        data_sampler=build_data_sampler(cfg, dataset_cfg.DATA_SAMPLER, embedder),
+        data_filter=build_data_filter(dataset_cfg.FILTER),
+        shuffle=True,
+        batch_size=dataset_cfg.INFERENCE.OUTPUT_BATCH_SIZE,
+        inference_batch_size=dataset_cfg.INFERENCE.INPUT_BATCH_SIZE,
+        category_to_class_mapping=meta.category_to_class_mapping,
+    )
+
+
+def has_inference_based_loaders(cfg: CfgNode) -> bool:
+    """
+    Returns True, if at least one inferense-based loader must
+    be instantiated for training
+    """
+    return len(cfg.BOOTSTRAP_DATASETS) > 0
+
+
+def build_inference_based_loaders(
+    cfg: CfgNode, model: torch.nn.Module
+) -> Tuple[List[InferenceBasedLoader], List[float]]:
+    loaders = []
+    ratios = []
+    embedder = build_densepose_embedder(cfg).to(device=model.device)  # pyre-ignore[16]
+    for dataset_spec in cfg.BOOTSTRAP_DATASETS:
+        dataset_cfg = get_bootstrap_dataset_config().clone()
+        dataset_cfg.merge_from_other_cfg(CfgNode(dataset_spec))
+        loader = build_inference_based_loader(cfg, dataset_cfg, model, embedder)
+        loaders.append(loader)
+        ratios.append(dataset_cfg.RATIO)
+    return loaders, ratios
+
+
+def build_video_list_dataset(meta: Metadata, cfg: CfgNode):
+    video_list_fpath = meta.video_list_fpath
+    video_base_path = meta.video_base_path
+    category = meta.category
+    if cfg.TYPE == "video_keyframe":
+        frame_selector = build_frame_selector(cfg.SELECT)
+        transform = build_transform(cfg.TRANSFORM, data_type="image")
+        video_list = video_list_from_file(video_list_fpath, video_base_path)
+        keyframe_helper_fpath = getattr(cfg, "KEYFRAME_HELPER", None)
+        return VideoKeyframeDataset(
+            video_list, category, frame_selector, transform, keyframe_helper_fpath
+        )
+
+
+class _BootstrapDatasetFactoryCatalog(UserDict):
+    """
+    A global dictionary that stores information about bootstrapped datasets creation functions
+    from metadata and config, for diverse DatasetType
+    """
+
+    def register(self, dataset_type: DatasetType, factory: Callable[[Metadata, CfgNode], Dataset]):
+        """
+        Args:
+            dataset_type (DatasetType): a DatasetType e.g. DatasetType.VIDEO_LIST
+            factory (Callable[Metadata, CfgNode]): a callable which takes Metadata and cfg
+            arguments and returns a dataset object.
+        """
+        assert dataset_type not in self, "Dataset '{}' is already registered!".format(dataset_type)
+        self[dataset_type] = factory
+
+
+BootstrapDatasetFactoryCatalog = _BootstrapDatasetFactoryCatalog()
+BootstrapDatasetFactoryCatalog.register(DatasetType.VIDEO_LIST, build_video_list_dataset)
diff --git a/densepose/data/combined_loader.py b/densepose/data/combined_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bfbbdeaf53e184b83a6e0f951867b79d3d9f1fd
--- /dev/null
+++ b/densepose/data/combined_loader.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import random
+from collections import deque
+from typing import Any, Collection, Deque, Iterable, Iterator, List, Sequence
+
+Loader = Iterable[Any]
+
+
+def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]):
+    if not pool:
+        pool.extend(next(iterator))
+    return pool.popleft()
+
+
+class CombinedDataLoader:
+    """
+    Combines data loaders using the provided sampling ratios
+    """
+
+    BATCH_COUNT = 100
+
+    def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]):
+        self.loaders = loaders
+        self.batch_size = batch_size
+        self.ratios = ratios
+
+    def __iter__(self) -> Iterator[List[Any]]:
+        iters = [iter(loader) for loader in self.loaders]
+        indices = []
+        pool = [deque()] * len(iters)
+        # infinite iterator, as in D2
+        while True:
+            if not indices:
+                # just a buffer of indices, its size doesn't matter
+                # as long as it's a multiple of batch_size
+                k = self.batch_size * self.BATCH_COUNT
+                indices = random.choices(range(len(self.loaders)), self.ratios, k=k)
+            try:
+                batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]]
+            except StopIteration:
+                break
+            indices = indices[self.batch_size :]
+            yield batch
diff --git a/densepose/data/dataset_mapper.py b/densepose/data/dataset_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..3229c4d7b9eab3e8e2d4f895d5209dd655d716a5
--- /dev/null
+++ b/densepose/data/dataset_mapper.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import copy
+import logging
+from typing import Any, Dict, List, Tuple
+import torch
+
+from detectron2.data import MetadataCatalog
+from detectron2.data import detection_utils as utils
+from detectron2.data import transforms as T
+from detectron2.layers import ROIAlign
+from detectron2.structures import BoxMode
+from detectron2.utils.file_io import PathManager
+
+from densepose.structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
+
+
+def build_augmentation(cfg, is_train):
+    logger = logging.getLogger(__name__)
+    result = utils.build_augmentation(cfg, is_train)
+    if is_train:
+        random_rotation = T.RandomRotation(
+            cfg.INPUT.ROTATION_ANGLES, expand=False, sample_style="choice"
+        )
+        result.append(random_rotation)
+        logger.info("DensePose-specific augmentation used in training: " + str(random_rotation))
+    return result
+
+
+class DatasetMapper:
+    """
+    A customized version of `detectron2.data.DatasetMapper`
+    """
+
+    def __init__(self, cfg, is_train=True):
+        self.augmentation = build_augmentation(cfg, is_train)
+
+        # fmt: off
+        self.img_format     = cfg.INPUT.FORMAT
+        self.mask_on        = (
+            cfg.MODEL.MASK_ON or (
+                cfg.MODEL.DENSEPOSE_ON
+                and cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS)
+        )
+        self.keypoint_on    = cfg.MODEL.KEYPOINT_ON
+        self.densepose_on   = cfg.MODEL.DENSEPOSE_ON
+        assert not cfg.MODEL.LOAD_PROPOSALS, "not supported yet"
+        # fmt: on
+        if self.keypoint_on and is_train:
+            # Flip only makes sense in training
+            self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
+        else:
+            self.keypoint_hflip_indices = None
+
+        if self.densepose_on:
+            densepose_transform_srcs = [
+                MetadataCatalog.get(ds).densepose_transform_src
+                for ds in cfg.DATASETS.TRAIN + cfg.DATASETS.TEST
+            ]
+            assert len(densepose_transform_srcs) > 0
+            # TODO: check that DensePose transformation data is the same for
+            # all the datasets. Otherwise one would have to pass DB ID with
+            # each entry to select proper transformation data. For now, since
+            # all DensePose annotated data uses the same data semantics, we
+            # omit this check.
+            densepose_transform_data_fpath = PathManager.get_local_path(densepose_transform_srcs[0])
+            self.densepose_transform_data = DensePoseTransformData.load(
+                densepose_transform_data_fpath
+            )
+
+        self.is_train = is_train
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
+        utils.check_image_size(dataset_dict, image)
+
+        image, transforms = T.apply_transform_gens(self.augmentation, image)
+        image_shape = image.shape[:2]  # h, w
+        dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
+
+        if not self.is_train:
+            dataset_dict.pop("annotations", None)
+            return dataset_dict
+
+        for anno in dataset_dict["annotations"]:
+            if not self.mask_on:
+                anno.pop("segmentation", None)
+            if not self.keypoint_on:
+                anno.pop("keypoints", None)
+
+        # USER: Implement additional transformations if you have other types of data
+        # USER: Don't call transpose_densepose if you don't need
+        annos = [
+            self._transform_densepose(
+                utils.transform_instance_annotations(
+                    obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+                ),
+                transforms,
+            )
+            for obj in dataset_dict.pop("annotations")
+            if obj.get("iscrowd", 0) == 0
+        ]
+
+        if self.mask_on:
+            self._add_densepose_masks_as_segmentation(annos, image_shape)
+
+        instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
+        densepose_annotations = [obj.get("densepose") for obj in annos]
+        if densepose_annotations and not all(v is None for v in densepose_annotations):
+            instances.gt_densepose = DensePoseList(
+                densepose_annotations, instances.gt_boxes, image_shape
+            )
+
+        dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()]
+        return dataset_dict
+
+    def _transform_densepose(self, annotation, transforms):
+        if not self.densepose_on:
+            return annotation
+
+        # Handle densepose annotations
+        is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
+        if is_valid:
+            densepose_data = DensePoseDataRelative(annotation, cleanup=True)
+            densepose_data.apply_transform(transforms, self.densepose_transform_data)
+            annotation["densepose"] = densepose_data
+        else:
+            # logger = logging.getLogger(__name__)
+            # logger.debug("Could not load DensePose annotation: {}".format(reason_not_valid))
+            DensePoseDataRelative.cleanup_annotation(annotation)
+            # NOTE: annotations for certain instances may be unavailable.
+            # 'None' is accepted by the DensePostList data structure.
+            annotation["densepose"] = None
+        return annotation
+
+    def _add_densepose_masks_as_segmentation(
+        self, annotations: List[Dict[str, Any]], image_shape_hw: Tuple[int, int]
+    ):
+        for obj in annotations:
+            if ("densepose" not in obj) or ("segmentation" in obj):
+                continue
+            # DP segmentation: torch.Tensor [S, S] of float32, S=256
+            segm_dp = torch.zeros_like(obj["densepose"].segm)
+            segm_dp[obj["densepose"].segm > 0] = 1
+            segm_h, segm_w = segm_dp.shape
+            bbox_segm_dp = torch.tensor((0, 0, segm_h - 1, segm_w - 1), dtype=torch.float32)
+            # image bbox
+            x0, y0, x1, y1 = (
+                v.item() for v in BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
+            )
+            segm_aligned = (
+                ROIAlign((y1 - y0, x1 - x0), 1.0, 0, aligned=True)
+                .forward(segm_dp.view(1, 1, *segm_dp.shape), bbox_segm_dp)
+                .squeeze()
+            )
+            image_mask = torch.zeros(*image_shape_hw, dtype=torch.float32)
+            image_mask[y0:y1, x0:x1] = segm_aligned
+            # segmentation for BitMask: np.array [H, W] of bool
+            obj["segmentation"] = image_mask >= 0.5
diff --git a/densepose/data/datasets/__init__.py b/densepose/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..260ccb9c43e5aa2d0f1fd28cfcbdd4f31913d16a
--- /dev/null
+++ b/densepose/data/datasets/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from . import builtin  # ensure the builtin datasets are registered
+
+__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
diff --git a/densepose/data/datasets/__pycache__/__init__.cpython-310.pyc b/densepose/data/datasets/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82f9e4bf2e0fea47d7a4310402894238c431b817
Binary files /dev/null and b/densepose/data/datasets/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/data/datasets/__pycache__/__init__.cpython-39.pyc b/densepose/data/datasets/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32fe0eed5c95808a9d36d1491350b22a6db3b00b
Binary files /dev/null and b/densepose/data/datasets/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/builtin.cpython-310.pyc b/densepose/data/datasets/__pycache__/builtin.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ad7abdb3296d44561b80b8e58556bb47e82993b
Binary files /dev/null and b/densepose/data/datasets/__pycache__/builtin.cpython-310.pyc differ
diff --git a/densepose/data/datasets/__pycache__/builtin.cpython-39.pyc b/densepose/data/datasets/__pycache__/builtin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e3e2f71c4f5d8ff3346759879baff0a9c7e4184
Binary files /dev/null and b/densepose/data/datasets/__pycache__/builtin.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/chimpnsee.cpython-310.pyc b/densepose/data/datasets/__pycache__/chimpnsee.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2635d9fcd5a7dcab7ef3a878087683c7ff2ea013
Binary files /dev/null and b/densepose/data/datasets/__pycache__/chimpnsee.cpython-310.pyc differ
diff --git a/densepose/data/datasets/__pycache__/chimpnsee.cpython-39.pyc b/densepose/data/datasets/__pycache__/chimpnsee.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1705202e0b3da77d07a48ba60e618bc8ab4c3178
Binary files /dev/null and b/densepose/data/datasets/__pycache__/chimpnsee.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/coco.cpython-310.pyc b/densepose/data/datasets/__pycache__/coco.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc370c892a9c2a9cd7839ef916453887b9b3a8e3
Binary files /dev/null and b/densepose/data/datasets/__pycache__/coco.cpython-310.pyc differ
diff --git a/densepose/data/datasets/__pycache__/coco.cpython-39.pyc b/densepose/data/datasets/__pycache__/coco.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a9216c6c75a2cfd1c261d9d101b59d62ea9bb0a
Binary files /dev/null and b/densepose/data/datasets/__pycache__/coco.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/dataset_type.cpython-310.pyc b/densepose/data/datasets/__pycache__/dataset_type.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e0b089bddbcfef86b4bf0465b54f61edbb918f0
Binary files /dev/null and b/densepose/data/datasets/__pycache__/dataset_type.cpython-310.pyc differ
diff --git a/densepose/data/datasets/__pycache__/dataset_type.cpython-39.pyc b/densepose/data/datasets/__pycache__/dataset_type.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..078063496e4a331dac079c3872e0e741259c0bc2
Binary files /dev/null and b/densepose/data/datasets/__pycache__/dataset_type.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/lvis.cpython-310.pyc b/densepose/data/datasets/__pycache__/lvis.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e737d182bcc24e88e2245d138e2e3fdd3bf255a1
Binary files /dev/null and b/densepose/data/datasets/__pycache__/lvis.cpython-310.pyc differ
diff --git a/densepose/data/datasets/__pycache__/lvis.cpython-39.pyc b/densepose/data/datasets/__pycache__/lvis.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40da40748ae96ea7f0e758e5c3edda001e90f3d0
Binary files /dev/null and b/densepose/data/datasets/__pycache__/lvis.cpython-39.pyc differ
diff --git a/densepose/data/datasets/builtin.py b/densepose/data/datasets/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..7572cd6abc550fdce9d1fd079a7af4870de303bb
--- /dev/null
+++ b/densepose/data/datasets/builtin.py
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .chimpnsee import register_dataset as register_chimpnsee_dataset
+from .coco import BASE_DATASETS as BASE_COCO_DATASETS
+from .coco import DATASETS as COCO_DATASETS
+from .coco import register_datasets as register_coco_datasets
+from .lvis import DATASETS as LVIS_DATASETS
+from .lvis import register_datasets as register_lvis_datasets
+
+DEFAULT_DATASETS_ROOT = "datasets"
+
+
+register_coco_datasets(COCO_DATASETS, DEFAULT_DATASETS_ROOT)
+register_coco_datasets(BASE_COCO_DATASETS, DEFAULT_DATASETS_ROOT)
+register_lvis_datasets(LVIS_DATASETS, DEFAULT_DATASETS_ROOT)
+
+register_chimpnsee_dataset(DEFAULT_DATASETS_ROOT)  # pyre-ignore[19]
diff --git a/densepose/data/datasets/chimpnsee.py b/densepose/data/datasets/chimpnsee.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e0b506dc4ed6ad78c9c4ce4677415a27f5f6cd
--- /dev/null
+++ b/densepose/data/datasets/chimpnsee.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Optional
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+
+from ..utils import maybe_prepend_base_path
+from .dataset_type import DatasetType
+
+CHIMPNSEE_DATASET_NAME = "chimpnsee"
+
+
+def register_dataset(datasets_root: Optional[str] = None) -> None:
+    def empty_load_callback():
+        pass
+
+    video_list_fpath = maybe_prepend_base_path(
+        datasets_root,
+        "chimpnsee/cdna.eva.mpg.de/video_list.txt",
+    )
+    video_base_path = maybe_prepend_base_path(datasets_root, "chimpnsee/cdna.eva.mpg.de")
+
+    DatasetCatalog.register(CHIMPNSEE_DATASET_NAME, empty_load_callback)
+    MetadataCatalog.get(CHIMPNSEE_DATASET_NAME).set(
+        dataset_type=DatasetType.VIDEO_LIST,
+        video_list_fpath=video_list_fpath,
+        video_base_path=video_base_path,
+        category="chimpanzee",
+    )
diff --git a/densepose/data/datasets/coco.py b/densepose/data/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c19f7b034b1641c9ccd88634f12fcdc3013bce09
--- /dev/null
+++ b/densepose/data/datasets/coco.py
@@ -0,0 +1,432 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import contextlib
+import io
+import logging
+import os
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional
+from fvcore.common.timer import Timer
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from detectron2.utils.file_io import PathManager
+
+from ..utils import maybe_prepend_base_path
+
+DENSEPOSE_MASK_KEY = "dp_masks"
+DENSEPOSE_IUV_KEYS_WITHOUT_MASK = ["dp_x", "dp_y", "dp_I", "dp_U", "dp_V"]
+DENSEPOSE_CSE_KEYS_WITHOUT_MASK = ["dp_x", "dp_y", "dp_vertex", "ref_model"]
+DENSEPOSE_ALL_POSSIBLE_KEYS = set(
+    DENSEPOSE_IUV_KEYS_WITHOUT_MASK + DENSEPOSE_CSE_KEYS_WITHOUT_MASK + [DENSEPOSE_MASK_KEY]
+)
+DENSEPOSE_METADATA_URL_PREFIX = "https://dl.fbaipublicfiles.com/densepose/data/"
+
+
+@dataclass
+class CocoDatasetInfo:
+    name: str
+    images_root: str
+    annotations_fpath: str
+
+
+DATASETS = [
+    CocoDatasetInfo(
+        name="densepose_coco_2014_train",
+        images_root="coco/train2014",
+        annotations_fpath="coco/annotations/densepose_train2014.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_minival",
+        images_root="coco/val2014",
+        annotations_fpath="coco/annotations/densepose_minival2014.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_minival_100",
+        images_root="coco/val2014",
+        annotations_fpath="coco/annotations/densepose_minival2014_100.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_valminusminival",
+        images_root="coco/val2014",
+        annotations_fpath="coco/annotations/densepose_valminusminival2014.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_train_cse",
+        images_root="coco/train2014",
+        annotations_fpath="coco_cse/densepose_train2014_cse.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_minival_cse",
+        images_root="coco/val2014",
+        annotations_fpath="coco_cse/densepose_minival2014_cse.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_minival_100_cse",
+        images_root="coco/val2014",
+        annotations_fpath="coco_cse/densepose_minival2014_100_cse.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_valminusminival_cse",
+        images_root="coco/val2014",
+        annotations_fpath="coco_cse/densepose_valminusminival2014_cse.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_chimps",
+        images_root="densepose_chimps/images",
+        annotations_fpath="densepose_chimps/densepose_chimps_densepose.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_chimps_cse_train",
+        images_root="densepose_chimps/images",
+        annotations_fpath="densepose_chimps/densepose_chimps_cse_train.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_chimps_cse_val",
+        images_root="densepose_chimps/images",
+        annotations_fpath="densepose_chimps/densepose_chimps_cse_val.json",
+    ),
+    CocoDatasetInfo(
+        name="posetrack2017_train",
+        images_root="posetrack2017/posetrack_data_2017",
+        annotations_fpath="posetrack2017/densepose_posetrack_train2017.json",
+    ),
+    CocoDatasetInfo(
+        name="posetrack2017_val",
+        images_root="posetrack2017/posetrack_data_2017",
+        annotations_fpath="posetrack2017/densepose_posetrack_val2017.json",
+    ),
+    CocoDatasetInfo(
+        name="lvis_v05_train",
+        images_root="coco/train2017",
+        annotations_fpath="lvis/lvis_v0.5_plus_dp_train.json",
+    ),
+    CocoDatasetInfo(
+        name="lvis_v05_val",
+        images_root="coco/val2017",
+        annotations_fpath="lvis/lvis_v0.5_plus_dp_val.json",
+    ),
+]
+
+
+BASE_DATASETS = [
+    CocoDatasetInfo(
+        name="base_coco_2017_train",
+        images_root="coco/train2017",
+        annotations_fpath="coco/annotations/instances_train2017.json",
+    ),
+    CocoDatasetInfo(
+        name="base_coco_2017_val",
+        images_root="coco/val2017",
+        annotations_fpath="coco/annotations/instances_val2017.json",
+    ),
+    CocoDatasetInfo(
+        name="base_coco_2017_val_100",
+        images_root="coco/val2017",
+        annotations_fpath="coco/annotations/instances_val2017_100.json",
+    ),
+]
+
+
+def get_metadata(base_path: Optional[str]) -> Dict[str, Any]:
+    """
+    Returns metadata associated with COCO DensePose datasets
+
+    Args:
+    base_path: Optional[str]
+        Base path used to load metadata from
+
+    Returns:
+    Dict[str, Any]
+        Metadata in the form of a dictionary
+    """
+    meta = {
+        "densepose_transform_src": maybe_prepend_base_path(base_path, "UV_symmetry_transforms.mat"),
+        "densepose_smpl_subdiv": maybe_prepend_base_path(base_path, "SMPL_subdiv.mat"),
+        "densepose_smpl_subdiv_transform": maybe_prepend_base_path(
+            base_path,
+            "SMPL_SUBDIV_TRANSFORM.mat",
+        ),
+    }
+    return meta
+
+
+def _load_coco_annotations(json_file: str):
+    """
+    Load COCO annotations from a JSON file
+
+    Args:
+        json_file: str
+            Path to the file to load annotations from
+    Returns:
+        Instance of `pycocotools.coco.COCO` that provides access to annotations
+        data
+    """
+    from pycocotools.coco import COCO
+
+    logger = logging.getLogger(__name__)
+    timer = Timer()
+    with contextlib.redirect_stdout(io.StringIO()):
+        coco_api = COCO(json_file)
+    if timer.seconds() > 1:
+        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+    return coco_api
+
+
+def _add_categories_metadata(dataset_name: str, categories: List[Dict[str, Any]]):
+    meta = MetadataCatalog.get(dataset_name)
+    meta.categories = {c["id"]: c["name"] for c in categories}
+    logger = logging.getLogger(__name__)
+    logger.info("Dataset {} categories: {}".format(dataset_name, meta.categories))
+
+
+def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]):
+    if "minival" in json_file:
+        # Skip validation on COCO2014 valminusminival and minival annotations
+        # The ratio of buggy annotations there is tiny and does not affect accuracy
+        # Therefore we explicitly white-list them
+        return
+    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+    assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
+        json_file
+    )
+
+
+def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+    if "bbox" not in ann_dict:
+        return
+    obj["bbox"] = ann_dict["bbox"]
+    obj["bbox_mode"] = BoxMode.XYWH_ABS
+
+
+def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+    if "segmentation" not in ann_dict:
+        return
+    segm = ann_dict["segmentation"]
+    if not isinstance(segm, dict):
+        # filter out invalid polygons (< 3 points)
+        segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+        if len(segm) == 0:
+            return
+    obj["segmentation"] = segm
+
+
+def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+    if "keypoints" not in ann_dict:
+        return
+    keypts = ann_dict["keypoints"]  # list[int]
+    for idx, v in enumerate(keypts):
+        if idx % 3 != 2:
+            # COCO's segmentation coordinates are floating points in [0, H or W],
+            # but keypoint coordinates are integers in [0, H-1 or W-1]
+            # Therefore we assume the coordinates are "pixel indices" and
+            # add 0.5 to convert to floating point coordinates.
+            keypts[idx] = v + 0.5
+    obj["keypoints"] = keypts
+
+
+def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+    for key in DENSEPOSE_ALL_POSSIBLE_KEYS:
+        if key in ann_dict:
+            obj[key] = ann_dict[key]
+
+
+def _combine_images_with_annotations(
+    dataset_name: str,
+    image_root: str,
+    img_datas: Iterable[Dict[str, Any]],
+    ann_datas: Iterable[Iterable[Dict[str, Any]]],
+):
+
+    ann_keys = ["iscrowd", "category_id"]
+    dataset_dicts = []
+    contains_video_frame_info = False
+
+    for img_dict, ann_dicts in zip(img_datas, ann_datas):
+        record = {}
+        record["file_name"] = os.path.join(image_root, img_dict["file_name"])
+        record["height"] = img_dict["height"]
+        record["width"] = img_dict["width"]
+        record["image_id"] = img_dict["id"]
+        record["dataset"] = dataset_name
+        if "frame_id" in img_dict:
+            record["frame_id"] = img_dict["frame_id"]
+            record["video_id"] = img_dict.get("vid_id", None)
+            contains_video_frame_info = True
+        objs = []
+        for ann_dict in ann_dicts:
+            assert ann_dict["image_id"] == record["image_id"]
+            assert ann_dict.get("ignore", 0) == 0
+            obj = {key: ann_dict[key] for key in ann_keys if key in ann_dict}
+            _maybe_add_bbox(obj, ann_dict)
+            _maybe_add_segm(obj, ann_dict)
+            _maybe_add_keypoints(obj, ann_dict)
+            _maybe_add_densepose(obj, ann_dict)
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+    if contains_video_frame_info:
+        create_video_frame_mapping(dataset_name, dataset_dicts)
+    return dataset_dicts
+
+
+def get_contiguous_id_to_category_id_map(metadata):
+    cat_id_2_cont_id = metadata.thing_dataset_id_to_contiguous_id
+    cont_id_2_cat_id = {}
+    for cat_id, cont_id in cat_id_2_cont_id.items():
+        if cont_id in cont_id_2_cat_id:
+            continue
+        cont_id_2_cat_id[cont_id] = cat_id
+    return cont_id_2_cat_id
+
+
+def maybe_filter_categories_cocoapi(dataset_name, coco_api):
+    meta = MetadataCatalog.get(dataset_name)
+    cont_id_2_cat_id = get_contiguous_id_to_category_id_map(meta)
+    cat_id_2_cont_id = meta.thing_dataset_id_to_contiguous_id
+    # filter categories
+    cats = []
+    for cat in coco_api.dataset["categories"]:
+        cat_id = cat["id"]
+        if cat_id not in cat_id_2_cont_id:
+            continue
+        cont_id = cat_id_2_cont_id[cat_id]
+        if (cont_id in cont_id_2_cat_id) and (cont_id_2_cat_id[cont_id] == cat_id):
+            cats.append(cat)
+    coco_api.dataset["categories"] = cats
+    # filter annotations, if multiple categories are mapped to a single
+    # contiguous ID, use only one category ID and map all annotations to that category ID
+    anns = []
+    for ann in coco_api.dataset["annotations"]:
+        cat_id = ann["category_id"]
+        if cat_id not in cat_id_2_cont_id:
+            continue
+        cont_id = cat_id_2_cont_id[cat_id]
+        ann["category_id"] = cont_id_2_cat_id[cont_id]
+        anns.append(ann)
+    coco_api.dataset["annotations"] = anns
+    # recreate index
+    coco_api.createIndex()
+
+
+def maybe_filter_and_map_categories_cocoapi(dataset_name, coco_api):
+    meta = MetadataCatalog.get(dataset_name)
+    category_id_map = meta.thing_dataset_id_to_contiguous_id
+    # map categories
+    cats = []
+    for cat in coco_api.dataset["categories"]:
+        cat_id = cat["id"]
+        if cat_id not in category_id_map:
+            continue
+        cat["id"] = category_id_map[cat_id]
+        cats.append(cat)
+    coco_api.dataset["categories"] = cats
+    # map annotation categories
+    anns = []
+    for ann in coco_api.dataset["annotations"]:
+        cat_id = ann["category_id"]
+        if cat_id not in category_id_map:
+            continue
+        ann["category_id"] = category_id_map[cat_id]
+        anns.append(ann)
+    coco_api.dataset["annotations"] = anns
+    # recreate index
+    coco_api.createIndex()
+
+
+def create_video_frame_mapping(dataset_name, dataset_dicts):
+    mapping = defaultdict(dict)
+    for d in dataset_dicts:
+        video_id = d.get("video_id")
+        if video_id is None:
+            continue
+        mapping[video_id].update({d["frame_id"]: d["file_name"]})
+    MetadataCatalog.get(dataset_name).set(video_frame_mapping=mapping)
+
+
+def load_coco_json(annotations_json_file: str, image_root: str, dataset_name: str):
+    """
+    Loads a JSON file with annotations in COCO instances format.
+    Replaces `detectron2.data.datasets.coco.load_coco_json` to handle metadata
+    in a more flexible way. Postpones category mapping to a later stage to be
+    able to combine several datasets with different (but coherent) sets of
+    categories.
+
+    Args:
+
+    annotations_json_file: str
+        Path to the JSON file with annotations in COCO instances format.
+    image_root: str
+        directory that contains all the images
+    dataset_name: str
+        the name that identifies a dataset, e.g. "densepose_coco_2014_train"
+    extra_annotation_keys: Optional[List[str]]
+        If provided, these keys are used to extract additional data from
+        the annotations.
+    """
+    coco_api = _load_coco_annotations(PathManager.get_local_path(annotations_json_file))
+    _add_categories_metadata(dataset_name, coco_api.loadCats(coco_api.getCatIds()))
+    # sort indices for reproducible results
+    img_ids = sorted(coco_api.imgs.keys())
+    # imgs is a list of dicts, each looks something like:
+    # {'license': 4,
+    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+    #  'file_name': 'COCO_val2014_000000001268.jpg',
+    #  'height': 427,
+    #  'width': 640,
+    #  'date_captured': '2013-11-17 05:57:24',
+    #  'id': 1268}
+    imgs = coco_api.loadImgs(img_ids)
+    logger = logging.getLogger(__name__)
+    logger.info("Loaded {} images in COCO format from {}".format(len(imgs), annotations_json_file))
+    # anns is a list[list[dict]], where each dict is an annotation
+    # record for an object. The inner list enumerates the objects in an image
+    # and the outer list enumerates over images.
+    anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
+    _verify_annotations_have_unique_ids(annotations_json_file, anns)
+    dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns)
+    return dataset_records
+
+
+def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[str] = None):
+    """
+    Registers provided COCO DensePose dataset
+
+    Args:
+    dataset_data: CocoDatasetInfo
+        Dataset data
+    datasets_root: Optional[str]
+        Datasets root folder (default: None)
+    """
+    annotations_fpath = maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath)
+    images_root = maybe_prepend_base_path(datasets_root, dataset_data.images_root)
+
+    def load_annotations():
+        return load_coco_json(
+            annotations_json_file=annotations_fpath,
+            image_root=images_root,
+            dataset_name=dataset_data.name,
+        )
+
+    DatasetCatalog.register(dataset_data.name, load_annotations)
+    MetadataCatalog.get(dataset_data.name).set(
+        json_file=annotations_fpath,
+        image_root=images_root,
+        **get_metadata(DENSEPOSE_METADATA_URL_PREFIX)
+    )
+
+
+def register_datasets(
+    datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[str] = None
+):
+    """
+    Registers provided COCO DensePose datasets
+
+    Args:
+    datasets_data: Iterable[CocoDatasetInfo]
+        An iterable of dataset datas
+    datasets_root: Optional[str]
+        Datasets root folder (default: None)
+    """
+    for dataset_data in datasets_data:
+        register_dataset(dataset_data, datasets_root)
diff --git a/densepose/data/datasets/dataset_type.py b/densepose/data/datasets/dataset_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed8f8f299af96847d9d16a77920429fe0195c526
--- /dev/null
+++ b/densepose/data/datasets/dataset_type.py
@@ -0,0 +1,11 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from enum import Enum
+
+
+class DatasetType(Enum):
+    """
+    Dataset type, mostly used for datasets that contain data to bootstrap models on
+    """
+
+    VIDEO_LIST = "video_list"
diff --git a/densepose/data/datasets/lvis.py b/densepose/data/datasets/lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4af9fa292f445c81dc840ab53d07c1af313dfc7
--- /dev/null
+++ b/densepose/data/datasets/lvis.py
@@ -0,0 +1,257 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import os
+from typing import Any, Dict, Iterable, List, Optional
+from fvcore.common.timer import Timer
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.data.datasets.lvis import get_lvis_instances_meta
+from detectron2.structures import BoxMode
+from detectron2.utils.file_io import PathManager
+
+from ..utils import maybe_prepend_base_path
+from .coco import (
+    DENSEPOSE_ALL_POSSIBLE_KEYS,
+    DENSEPOSE_METADATA_URL_PREFIX,
+    CocoDatasetInfo,
+    get_metadata,
+)
+
+DATASETS = [
+    CocoDatasetInfo(
+        name="densepose_lvis_v1_ds1_train_v1",
+        images_root="coco_",
+        annotations_fpath="lvis/densepose_lvis_v1_ds1_train_v1.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_lvis_v1_ds1_val_v1",
+        images_root="coco_",
+        annotations_fpath="lvis/densepose_lvis_v1_ds1_val_v1.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_lvis_v1_ds2_train_v1",
+        images_root="coco_",
+        annotations_fpath="lvis/densepose_lvis_v1_ds2_train_v1.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_lvis_v1_ds2_val_v1",
+        images_root="coco_",
+        annotations_fpath="lvis/densepose_lvis_v1_ds2_val_v1.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_lvis_v1_ds1_val_animals_100",
+        images_root="coco_",
+        annotations_fpath="lvis/densepose_lvis_v1_val_animals_100_v2.json",
+    ),
+]
+
+
+def _load_lvis_annotations(json_file: str):
+    """
+    Load COCO annotations from a JSON file
+
+    Args:
+        json_file: str
+            Path to the file to load annotations from
+    Returns:
+        Instance of `pycocotools.coco.COCO` that provides access to annotations
+        data
+    """
+    from lvis import LVIS
+
+    json_file = PathManager.get_local_path(json_file)
+    logger = logging.getLogger(__name__)
+    timer = Timer()
+    lvis_api = LVIS(json_file)
+    if timer.seconds() > 1:
+        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+    return lvis_api
+
+
+def _add_categories_metadata(dataset_name: str) -> None:
+    metadict = get_lvis_instances_meta(dataset_name)
+    categories = metadict["thing_classes"]
+    metadata = MetadataCatalog.get(dataset_name)
+    metadata.categories = {i + 1: categories[i] for i in range(len(categories))}
+    logger = logging.getLogger(__name__)
+    logger.info(f"Dataset {dataset_name} has {len(categories)} categories")
+
+
+def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]) -> None:
+    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+    assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
+        json_file
+    )
+
+
+def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None:
+    if "bbox" not in ann_dict:
+        return
+    obj["bbox"] = ann_dict["bbox"]
+    obj["bbox_mode"] = BoxMode.XYWH_ABS
+
+
+def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None:
+    if "segmentation" not in ann_dict:
+        return
+    segm = ann_dict["segmentation"]
+    if not isinstance(segm, dict):
+        # filter out invalid polygons (< 3 points)
+        segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+        if len(segm) == 0:
+            return
+    obj["segmentation"] = segm
+
+
+def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None:
+    if "keypoints" not in ann_dict:
+        return
+    keypts = ann_dict["keypoints"]  # list[int]
+    for idx, v in enumerate(keypts):
+        if idx % 3 != 2:
+            # COCO's segmentation coordinates are floating points in [0, H or W],
+            # but keypoint coordinates are integers in [0, H-1 or W-1]
+            # Therefore we assume the coordinates are "pixel indices" and
+            # add 0.5 to convert to floating point coordinates.
+            keypts[idx] = v + 0.5
+    obj["keypoints"] = keypts
+
+
+def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None:
+    for key in DENSEPOSE_ALL_POSSIBLE_KEYS:
+        if key in ann_dict:
+            obj[key] = ann_dict[key]
+
+
+def _combine_images_with_annotations(
+    dataset_name: str,
+    image_root: str,
+    img_datas: Iterable[Dict[str, Any]],
+    ann_datas: Iterable[Iterable[Dict[str, Any]]],
+):
+
+    dataset_dicts = []
+
+    def get_file_name(img_root, img_dict):
+        # Determine the path including the split folder ("train2017", "val2017", "test2017") from
+        # the coco_url field. Example:
+        #   'coco_url': 'http://images.cocodataset.org/train2017/000000155379.jpg'
+        split_folder, file_name = img_dict["coco_url"].split("/")[-2:]
+        return os.path.join(img_root + split_folder, file_name)
+
+    for img_dict, ann_dicts in zip(img_datas, ann_datas):
+        record = {}
+        record["file_name"] = get_file_name(image_root, img_dict)
+        record["height"] = img_dict["height"]
+        record["width"] = img_dict["width"]
+        record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", [])
+        record["neg_category_ids"] = img_dict.get("neg_category_ids", [])
+        record["image_id"] = img_dict["id"]
+        record["dataset"] = dataset_name
+
+        objs = []
+        for ann_dict in ann_dicts:
+            assert ann_dict["image_id"] == record["image_id"]
+            obj = {}
+            _maybe_add_bbox(obj, ann_dict)
+            obj["iscrowd"] = ann_dict.get("iscrowd", 0)
+            obj["category_id"] = ann_dict["category_id"]
+            _maybe_add_segm(obj, ann_dict)
+            _maybe_add_keypoints(obj, ann_dict)
+            _maybe_add_densepose(obj, ann_dict)
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+    return dataset_dicts
+
+
+def load_lvis_json(annotations_json_file: str, image_root: str, dataset_name: str):
+    """
+    Loads a JSON file with annotations in LVIS instances format.
+    Replaces `detectron2.data.datasets.coco.load_lvis_json` to handle metadata
+    in a more flexible way. Postpones category mapping to a later stage to be
+    able to combine several datasets with different (but coherent) sets of
+    categories.
+
+    Args:
+
+    annotations_json_file: str
+        Path to the JSON file with annotations in COCO instances format.
+    image_root: str
+        directory that contains all the images
+    dataset_name: str
+        the name that identifies a dataset, e.g. "densepose_coco_2014_train"
+    extra_annotation_keys: Optional[List[str]]
+        If provided, these keys are used to extract additional data from
+        the annotations.
+    """
+    lvis_api = _load_lvis_annotations(PathManager.get_local_path(annotations_json_file))
+
+    _add_categories_metadata(dataset_name)
+
+    # sort indices for reproducible results
+    img_ids = sorted(lvis_api.imgs.keys())
+    # imgs is a list of dicts, each looks something like:
+    # {'license': 4,
+    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+    #  'file_name': 'COCO_val2014_000000001268.jpg',
+    #  'height': 427,
+    #  'width': 640,
+    #  'date_captured': '2013-11-17 05:57:24',
+    #  'id': 1268}
+    imgs = lvis_api.load_imgs(img_ids)
+    logger = logging.getLogger(__name__)
+    logger.info("Loaded {} images in LVIS format from {}".format(len(imgs), annotations_json_file))
+    # anns is a list[list[dict]], where each dict is an annotation
+    # record for an object. The inner list enumerates the objects in an image
+    # and the outer list enumerates over images.
+    anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]
+
+    _verify_annotations_have_unique_ids(annotations_json_file, anns)
+    dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns)
+    return dataset_records
+
+
+def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[str] = None) -> None:
+    """
+    Registers provided LVIS DensePose dataset
+
+    Args:
+        dataset_data: CocoDatasetInfo
+            Dataset data
+        datasets_root: Optional[str]
+            Datasets root folder (default: None)
+    """
+    annotations_fpath = maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath)
+    images_root = maybe_prepend_base_path(datasets_root, dataset_data.images_root)
+
+    def load_annotations():
+        return load_lvis_json(
+            annotations_json_file=annotations_fpath,
+            image_root=images_root,
+            dataset_name=dataset_data.name,
+        )
+
+    DatasetCatalog.register(dataset_data.name, load_annotations)
+    MetadataCatalog.get(dataset_data.name).set(
+        json_file=annotations_fpath,
+        image_root=images_root,
+        evaluator_type="lvis",
+        **get_metadata(DENSEPOSE_METADATA_URL_PREFIX),
+    )
+
+
+def register_datasets(
+    datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[str] = None
+) -> None:
+    """
+    Registers provided LVIS DensePose datasets
+
+    Args:
+        datasets_data: Iterable[CocoDatasetInfo]
+            An iterable of dataset datas
+        datasets_root: Optional[str]
+            Datasets root folder (default: None)
+    """
+    for dataset_data in datasets_data:
+        register_dataset(dataset_data, datasets_root)
diff --git a/densepose/data/image_list_dataset.py b/densepose/data/image_list_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..92a95d3d5e7d4d7d6bf1d29d51295d32ae2104d2
--- /dev/null
+++ b/densepose/data/image_list_dataset.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import logging
+import numpy as np
+from typing import Any, Callable, Dict, List, Optional, Union
+import torch
+from torch.utils.data.dataset import Dataset
+
+from detectron2.data.detection_utils import read_image
+
+ImageTransform = Callable[[torch.Tensor], torch.Tensor]
+
+
+class ImageListDataset(Dataset):
+    """
+    Dataset that provides images from a list.
+    """
+
+    _EMPTY_IMAGE = torch.empty((0, 3, 1, 1))
+
+    def __init__(
+        self,
+        image_list: List[str],
+        category_list: Union[str, List[str], None] = None,
+        transform: Optional[ImageTransform] = None,
+    ):
+        """
+        Args:
+            image_list (List[str]): list of paths to image files
+            category_list (Union[str, List[str], None]): list of animal categories for
+                each image. If it is a string, or None, this applies to all images
+        """
+        if type(category_list) == list:
+            self.category_list = category_list
+        else:
+            self.category_list = [category_list] * len(image_list)
+        assert len(image_list) == len(
+            self.category_list
+        ), "length of image and category lists must be equal"
+        self.image_list = image_list
+        self.transform = transform
+
+    def __getitem__(self, idx: int) -> Dict[str, Any]:
+        """
+        Gets selected images from the list
+
+        Args:
+            idx (int): video index in the video list file
+        Returns:
+            A dictionary containing two keys:
+                images (torch.Tensor): tensor of size [N, 3, H, W] (N = 1, or 0 for _EMPTY_IMAGE)
+                categories (List[str]): categories of the frames
+        """
+        categories = [self.category_list[idx]]
+        fpath = self.image_list[idx]
+        transform = self.transform
+
+        try:
+            image = torch.from_numpy(np.ascontiguousarray(read_image(fpath, format="BGR")))
+            image = image.permute(2, 0, 1).unsqueeze(0).float()  # HWC -> NCHW
+            if transform is not None:
+                image = transform(image)
+            return {"images": image, "categories": categories}
+        except (OSError, RuntimeError) as e:
+            logger = logging.getLogger(__name__)
+            logger.warning(f"Error opening image file container {fpath}: {e}")
+
+        return {"images": self._EMPTY_IMAGE, "categories": []}
+
+    def __len__(self):
+        return len(self.image_list)
diff --git a/densepose/data/inference_based_loader.py b/densepose/data/inference_based_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb89544500c29c4055353060ebbc8b428bd0262a
--- /dev/null
+++ b/densepose/data/inference_based_loader.py
@@ -0,0 +1,172 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import random
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple
+import torch
+from torch import nn
+
+SampledData = Any
+ModelOutput = Any
+
+
+def _grouper(iterable: Iterable[Any], n: int, fillvalue=None) -> Iterator[Tuple[Any]]:
+    """
+    Group elements of an iterable by chunks of size `n`, e.g.
+    grouper(range(9), 4) ->
+        (0, 1, 2, 3), (4, 5, 6, 7), (8, None, None, None)
+    """
+    it = iter(iterable)
+    while True:
+        values = []
+        for _ in range(n):
+            try:
+                value = next(it)
+            except StopIteration:
+                if values:
+                    values.extend([fillvalue] * (n - len(values)))
+                    yield tuple(values)
+                return
+            values.append(value)
+        yield tuple(values)
+
+
+class ScoreBasedFilter:
+    """
+    Filters entries in model output based on their scores
+    Discards all entries with score less than the specified minimum
+    """
+
+    def __init__(self, min_score: float = 0.8):
+        self.min_score = min_score
+
+    def __call__(self, model_output: ModelOutput) -> ModelOutput:
+        for model_output_i in model_output:
+            instances = model_output_i["instances"]
+            if not instances.has("scores"):
+                continue
+            instances_filtered = instances[instances.scores >= self.min_score]
+            model_output_i["instances"] = instances_filtered
+        return model_output
+
+
+class InferenceBasedLoader:
+    """
+    Data loader based on results inferred by a model. Consists of:
+     - a data loader that provides batches of images
+     - a model that is used to infer the results
+     - a data sampler that converts inferred results to annotations
+    """
+
+    def __init__(
+        self,
+        model: nn.Module,
+        data_loader: Iterable[List[Dict[str, Any]]],
+        data_sampler: Optional[Callable[[ModelOutput], List[SampledData]]] = None,
+        data_filter: Optional[Callable[[ModelOutput], ModelOutput]] = None,
+        shuffle: bool = True,
+        batch_size: int = 4,
+        inference_batch_size: int = 4,
+        drop_last: bool = False,
+        category_to_class_mapping: Optional[dict] = None,
+    ):
+        """
+        Constructor
+
+        Args:
+          model (torch.nn.Module): model used to produce data
+          data_loader (Iterable[List[Dict[str, Any]]]): iterable that provides
+            dictionaries with "images" and "categories" fields to perform inference on
+          data_sampler (Callable: ModelOutput -> SampledData): functor
+              that produces annotation data from inference results;
+              (optional, default: None)
+          data_filter (Callable: ModelOutput -> ModelOutput): filter
+              that selects model outputs for further processing
+              (optional, default: None)
+          shuffle (bool): if True, the input images get shuffled
+          batch_size (int): batch size for the produced annotation data
+          inference_batch_size (int): batch size for input images
+          drop_last (bool): if True, drop the last batch if it is undersized
+          category_to_class_mapping (dict): category to class mapping
+        """
+        self.model = model
+        self.model.eval()
+        self.data_loader = data_loader
+        self.data_sampler = data_sampler
+        self.data_filter = data_filter
+        self.shuffle = shuffle
+        self.batch_size = batch_size
+        self.inference_batch_size = inference_batch_size
+        self.drop_last = drop_last
+        if category_to_class_mapping is not None:
+            self.category_to_class_mapping = category_to_class_mapping
+        else:
+            self.category_to_class_mapping = {}
+
+    def __iter__(self) -> Iterator[List[SampledData]]:
+        for batch in self.data_loader:
+            # batch : List[Dict[str: Tensor[N, C, H, W], str: Optional[str]]]
+            # images_batch : Tensor[N, C, H, W]
+            # image : Tensor[C, H, W]
+            images_and_categories = [
+                {"image": image, "category": category}
+                for element in batch
+                for image, category in zip(element["images"], element["categories"])
+            ]
+            if not images_and_categories:
+                continue
+            if self.shuffle:
+                random.shuffle(images_and_categories)
+            yield from self._produce_data(images_and_categories)  # pyre-ignore[6]
+
+    def _produce_data(
+        self, images_and_categories: List[Tuple[torch.Tensor, Optional[str]]]
+    ) -> Iterator[List[SampledData]]:
+        """
+        Produce batches of data from images
+
+        Args:
+          images_and_categories (List[Tuple[torch.Tensor, Optional[str]]]):
+            list of images and corresponding categories to process
+
+        Returns:
+          Iterator over batches of data sampled from model outputs
+        """
+        data_batches: List[SampledData] = []
+        category_to_class_mapping = self.category_to_class_mapping
+        batched_images_and_categories = _grouper(images_and_categories, self.inference_batch_size)
+        for batch in batched_images_and_categories:
+            batch = [
+                {
+                    "image": image_and_category["image"].to(self.model.device),
+                    "category": image_and_category["category"],
+                }
+                for image_and_category in batch
+                if image_and_category is not None
+            ]
+            if not batch:
+                continue
+            with torch.no_grad():
+                model_output = self.model(batch)
+            for model_output_i, batch_i in zip(model_output, batch):
+                assert len(batch_i["image"].shape) == 3
+                model_output_i["image"] = batch_i["image"]
+                instance_class = category_to_class_mapping.get(batch_i["category"], 0)
+                model_output_i["instances"].dataset_classes = torch.tensor(
+                    [instance_class] * len(model_output_i["instances"])
+                )
+            model_output_filtered = (
+                model_output if self.data_filter is None else self.data_filter(model_output)
+            )
+            data = (
+                model_output_filtered
+                if self.data_sampler is None
+                else self.data_sampler(model_output_filtered)
+            )
+            for data_i in data:
+                if len(data_i["instances"]):
+                    data_batches.append(data_i)
+            if len(data_batches) >= self.batch_size:
+                yield data_batches[: self.batch_size]
+                data_batches = data_batches[self.batch_size :]
+        if not self.drop_last and data_batches:
+            yield data_batches
diff --git a/densepose/data/meshes/__init__.py b/densepose/data/meshes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e1f0d5dc439dc58914238b23572f586dd1c693e
--- /dev/null
+++ b/densepose/data/meshes/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from . import builtin
+
+__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
diff --git a/densepose/data/meshes/__pycache__/__init__.cpython-310.pyc b/densepose/data/meshes/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9637c2ccdb528a2b4416fa7dd3c7269732cc2441
Binary files /dev/null and b/densepose/data/meshes/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/data/meshes/__pycache__/__init__.cpython-39.pyc b/densepose/data/meshes/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..245237007d0cbc3609b5cf2eb8ac8ae7a0c26181
Binary files /dev/null and b/densepose/data/meshes/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/meshes/__pycache__/builtin.cpython-310.pyc b/densepose/data/meshes/__pycache__/builtin.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..290cd1ab4320218265651ed7252b49f9cc59d389
Binary files /dev/null and b/densepose/data/meshes/__pycache__/builtin.cpython-310.pyc differ
diff --git a/densepose/data/meshes/__pycache__/builtin.cpython-39.pyc b/densepose/data/meshes/__pycache__/builtin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96aca3a3abafa39d07b9995e6f8eb1ebb3cf7c30
Binary files /dev/null and b/densepose/data/meshes/__pycache__/builtin.cpython-39.pyc differ
diff --git a/densepose/data/meshes/__pycache__/catalog.cpython-310.pyc b/densepose/data/meshes/__pycache__/catalog.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fd1eee5118e6196aa70ba639828b98454f7416a
Binary files /dev/null and b/densepose/data/meshes/__pycache__/catalog.cpython-310.pyc differ
diff --git a/densepose/data/meshes/__pycache__/catalog.cpython-39.pyc b/densepose/data/meshes/__pycache__/catalog.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fcbc71c3ea4ec00b67339fc680ba9f526b09f23a
Binary files /dev/null and b/densepose/data/meshes/__pycache__/catalog.cpython-39.pyc differ
diff --git a/densepose/data/meshes/builtin.py b/densepose/data/meshes/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0b23760e8268b068149931b173a4285ba451993
--- /dev/null
+++ b/densepose/data/meshes/builtin.py
@@ -0,0 +1,101 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from .catalog import MeshInfo, register_meshes
+
+DENSEPOSE_MESHES_DIR = "https://dl.fbaipublicfiles.com/densepose/meshes/"
+
+MESHES = [
+    MeshInfo(
+        name="smpl_27554",
+        data="smpl_27554.pkl",
+        geodists="geodists/geodists_smpl_27554.pkl",
+        symmetry="symmetry/symmetry_smpl_27554.pkl",
+        texcoords="texcoords/texcoords_smpl_27554.pkl",
+    ),
+    MeshInfo(
+        name="chimp_5029",
+        data="chimp_5029.pkl",
+        geodists="geodists/geodists_chimp_5029.pkl",
+        symmetry="symmetry/symmetry_chimp_5029.pkl",
+        texcoords="texcoords/texcoords_chimp_5029.pkl",
+    ),
+    MeshInfo(
+        name="cat_5001",
+        data="cat_5001.pkl",
+        geodists="geodists/geodists_cat_5001.pkl",
+        symmetry="symmetry/symmetry_cat_5001.pkl",
+        texcoords="texcoords/texcoords_cat_5001.pkl",
+    ),
+    MeshInfo(
+        name="cat_7466",
+        data="cat_7466.pkl",
+        geodists="geodists/geodists_cat_7466.pkl",
+        symmetry="symmetry/symmetry_cat_7466.pkl",
+        texcoords="texcoords/texcoords_cat_7466.pkl",
+    ),
+    MeshInfo(
+        name="sheep_5004",
+        data="sheep_5004.pkl",
+        geodists="geodists/geodists_sheep_5004.pkl",
+        symmetry="symmetry/symmetry_sheep_5004.pkl",
+        texcoords="texcoords/texcoords_sheep_5004.pkl",
+    ),
+    MeshInfo(
+        name="zebra_5002",
+        data="zebra_5002.pkl",
+        geodists="geodists/geodists_zebra_5002.pkl",
+        symmetry="symmetry/symmetry_zebra_5002.pkl",
+        texcoords="texcoords/texcoords_zebra_5002.pkl",
+    ),
+    MeshInfo(
+        name="horse_5004",
+        data="horse_5004.pkl",
+        geodists="geodists/geodists_horse_5004.pkl",
+        symmetry="symmetry/symmetry_horse_5004.pkl",
+        texcoords="texcoords/texcoords_zebra_5002.pkl",
+    ),
+    MeshInfo(
+        name="giraffe_5002",
+        data="giraffe_5002.pkl",
+        geodists="geodists/geodists_giraffe_5002.pkl",
+        symmetry="symmetry/symmetry_giraffe_5002.pkl",
+        texcoords="texcoords/texcoords_giraffe_5002.pkl",
+    ),
+    MeshInfo(
+        name="elephant_5002",
+        data="elephant_5002.pkl",
+        geodists="geodists/geodists_elephant_5002.pkl",
+        symmetry="symmetry/symmetry_elephant_5002.pkl",
+        texcoords="texcoords/texcoords_elephant_5002.pkl",
+    ),
+    MeshInfo(
+        name="dog_5002",
+        data="dog_5002.pkl",
+        geodists="geodists/geodists_dog_5002.pkl",
+        symmetry="symmetry/symmetry_dog_5002.pkl",
+        texcoords="texcoords/texcoords_dog_5002.pkl",
+    ),
+    MeshInfo(
+        name="dog_7466",
+        data="dog_7466.pkl",
+        geodists="geodists/geodists_dog_7466.pkl",
+        symmetry="symmetry/symmetry_dog_7466.pkl",
+        texcoords="texcoords/texcoords_dog_7466.pkl",
+    ),
+    MeshInfo(
+        name="cow_5002",
+        data="cow_5002.pkl",
+        geodists="geodists/geodists_cow_5002.pkl",
+        symmetry="symmetry/symmetry_cow_5002.pkl",
+        texcoords="texcoords/texcoords_cow_5002.pkl",
+    ),
+    MeshInfo(
+        name="bear_4936",
+        data="bear_4936.pkl",
+        geodists="geodists/geodists_bear_4936.pkl",
+        symmetry="symmetry/symmetry_bear_4936.pkl",
+        texcoords="texcoords/texcoords_bear_4936.pkl",
+    ),
+]
+
+register_meshes(MESHES, DENSEPOSE_MESHES_DIR)
diff --git a/densepose/data/meshes/catalog.py b/densepose/data/meshes/catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..b258f3ce11a90666b9c764541ce299384cfddf4e
--- /dev/null
+++ b/densepose/data/meshes/catalog.py
@@ -0,0 +1,71 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+from collections import UserDict
+from dataclasses import dataclass
+from typing import Iterable, Optional
+
+from ..utils import maybe_prepend_base_path
+
+
+@dataclass
+class MeshInfo:
+    name: str
+    data: str
+    geodists: Optional[str] = None
+    symmetry: Optional[str] = None
+    texcoords: Optional[str] = None
+
+
+class _MeshCatalog(UserDict):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.mesh_ids = {}
+        self.mesh_names = {}
+        self.max_mesh_id = -1
+
+    def __setitem__(self, key, value):
+        if key in self:
+            logger = logging.getLogger(__name__)
+            logger.warning(
+                f"Overwriting mesh catalog entry '{key}': old value {self[key]}"
+                f", new value {value}"
+            )
+            mesh_id = self.mesh_ids[key]
+        else:
+            self.max_mesh_id += 1
+            mesh_id = self.max_mesh_id
+        super().__setitem__(key, value)
+        self.mesh_ids[key] = mesh_id
+        self.mesh_names[mesh_id] = key
+
+    def get_mesh_id(self, shape_name: str) -> int:
+        return self.mesh_ids[shape_name]
+
+    def get_mesh_name(self, mesh_id: int) -> str:
+        return self.mesh_names[mesh_id]
+
+
+MeshCatalog = _MeshCatalog()
+
+
+def register_mesh(mesh_info: MeshInfo, base_path: Optional[str]) -> None:
+    geodists, symmetry, texcoords = mesh_info.geodists, mesh_info.symmetry, mesh_info.texcoords
+    if geodists:
+        geodists = maybe_prepend_base_path(base_path, geodists)
+    if symmetry:
+        symmetry = maybe_prepend_base_path(base_path, symmetry)
+    if texcoords:
+        texcoords = maybe_prepend_base_path(base_path, texcoords)
+    MeshCatalog[mesh_info.name] = MeshInfo(
+        name=mesh_info.name,
+        data=maybe_prepend_base_path(base_path, mesh_info.data),
+        geodists=geodists,
+        symmetry=symmetry,
+        texcoords=texcoords,
+    )
+
+
+def register_meshes(mesh_infos: Iterable[MeshInfo], base_path: Optional[str]) -> None:
+    for mesh_info in mesh_infos:
+        register_mesh(mesh_info, base_path)
diff --git a/densepose/data/samplers/__init__.py b/densepose/data/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dba87ea1c6f37ab56071d2f5d715bd78fe8816f
--- /dev/null
+++ b/densepose/data/samplers/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .densepose_uniform import DensePoseUniformSampler
+from .densepose_confidence_based import DensePoseConfidenceBasedSampler
+from .densepose_cse_uniform import DensePoseCSEUniformSampler
+from .densepose_cse_confidence_based import DensePoseCSEConfidenceBasedSampler
+from .mask_from_densepose import MaskFromDensePoseSampler
+from .prediction_to_gt import PredictionToGroundTruthSampler
diff --git a/densepose/data/samplers/__pycache__/__init__.cpython-310.pyc b/densepose/data/samplers/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48effd278455cb2154284386409486bee48ebbdc
Binary files /dev/null and b/densepose/data/samplers/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/data/samplers/__pycache__/__init__.cpython-39.pyc b/densepose/data/samplers/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f96b843d400e7148c7ddb9f4b555dc012a1bca3
Binary files /dev/null and b/densepose/data/samplers/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_base.cpython-310.pyc b/densepose/data/samplers/__pycache__/densepose_base.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab7456728ecae4b91b9d9313d7f11ce86d1ca470
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_base.cpython-310.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_base.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f043400eb4a01045a2126e0ebe47363c7648fac
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_base.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_confidence_based.cpython-310.pyc b/densepose/data/samplers/__pycache__/densepose_confidence_based.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3103ddcf5dbb737a41656cf427fbf2e895ea9575
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_confidence_based.cpython-310.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_confidence_based.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_confidence_based.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87b9b90fc69cf7231f225eeeb9763a5b42b951fa
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_confidence_based.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_cse_base.cpython-310.pyc b/densepose/data/samplers/__pycache__/densepose_cse_base.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dce0c910ab90112ce19f2196dea49137240d8680
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_cse_base.cpython-310.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_cse_base.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_cse_base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8a4f8a72947996029fa7d4b35533528ef8a5ae5
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_cse_base.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_cse_confidence_based.cpython-310.pyc b/densepose/data/samplers/__pycache__/densepose_cse_confidence_based.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..177bae9067bb407fc910c7551f0120b84819b53e
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_cse_confidence_based.cpython-310.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_cse_confidence_based.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_cse_confidence_based.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d7eb64dc7a1bf7e9c19589c4f255ca32ca9ee7ca
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_cse_confidence_based.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_cse_uniform.cpython-310.pyc b/densepose/data/samplers/__pycache__/densepose_cse_uniform.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e01845898301618c3cd2f8667331667a012f9c9
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_cse_uniform.cpython-310.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_cse_uniform.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_cse_uniform.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..041e8c9bc0331df19318d4785776bf7488eafe3a
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_cse_uniform.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_uniform.cpython-310.pyc b/densepose/data/samplers/__pycache__/densepose_uniform.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4bb5d8226630f061b763e76e60b3150a839f64e9
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_uniform.cpython-310.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_uniform.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_uniform.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..655dc8b186bbaaf816a554d9578f027e12a3402f
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_uniform.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/mask_from_densepose.cpython-310.pyc b/densepose/data/samplers/__pycache__/mask_from_densepose.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..462adaf9799e1e41e65740e508481ee8af580fde
Binary files /dev/null and b/densepose/data/samplers/__pycache__/mask_from_densepose.cpython-310.pyc differ
diff --git a/densepose/data/samplers/__pycache__/mask_from_densepose.cpython-39.pyc b/densepose/data/samplers/__pycache__/mask_from_densepose.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd6dd7c2ced6e896f5ef7f94ffb388a82dd32be5
Binary files /dev/null and b/densepose/data/samplers/__pycache__/mask_from_densepose.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/prediction_to_gt.cpython-310.pyc b/densepose/data/samplers/__pycache__/prediction_to_gt.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc6c8639bb7bea2aa47bf4c916b52bf1377f2704
Binary files /dev/null and b/densepose/data/samplers/__pycache__/prediction_to_gt.cpython-310.pyc differ
diff --git a/densepose/data/samplers/__pycache__/prediction_to_gt.cpython-39.pyc b/densepose/data/samplers/__pycache__/prediction_to_gt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f24c3ee1c5fefa51698a40413cad4f6411208196
Binary files /dev/null and b/densepose/data/samplers/__pycache__/prediction_to_gt.cpython-39.pyc differ
diff --git a/densepose/data/samplers/densepose_base.py b/densepose/data/samplers/densepose_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d499d8f20d811fb8197d7bdae358540bb5b0dfc
--- /dev/null
+++ b/densepose/data/samplers/densepose_base.py
@@ -0,0 +1,203 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any, Dict, List, Tuple
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures import BoxMode, Instances
+
+from densepose.converters import ToChartResultConverter
+from densepose.converters.base import IntTupleBox, make_int_box
+from densepose.structures import DensePoseDataRelative, DensePoseList
+
+
+class DensePoseBaseSampler:
+    """
+    Base DensePose sampler to produce DensePose data from DensePose predictions.
+    Samples for each class are drawn according to some distribution over all pixels estimated
+    to belong to that class.
+    """
+
+    def __init__(self, count_per_class: int = 8):
+        """
+        Constructor
+
+        Args:
+          count_per_class (int): the sampler produces at most `count_per_class`
+              samples for each category
+        """
+        self.count_per_class = count_per_class
+
+    def __call__(self, instances: Instances) -> DensePoseList:
+        """
+        Convert DensePose predictions (an instance of `DensePoseChartPredictorOutput`)
+        into DensePose annotations data (an instance of `DensePoseList`)
+        """
+        boxes_xyxy_abs = instances.pred_boxes.tensor.clone().cpu()
+        boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+        dp_datas = []
+        for i in range(len(boxes_xywh_abs)):
+            annotation_i = self._sample(instances[i], make_int_box(boxes_xywh_abs[i]))
+            annotation_i[DensePoseDataRelative.S_KEY] = self._resample_mask(  # pyre-ignore[6]
+                instances[i].pred_densepose
+            )
+            dp_datas.append(DensePoseDataRelative(annotation_i))
+        # create densepose annotations on CPU
+        dp_list = DensePoseList(dp_datas, boxes_xyxy_abs, instances.image_size)
+        return dp_list
+
+    def _sample(self, instance: Instances, bbox_xywh: IntTupleBox) -> Dict[str, List[Any]]:
+        """
+        Sample DensPoseDataRelative from estimation results
+        """
+        labels, dp_result = self._produce_labels_and_results(instance)
+        annotation = {
+            DensePoseDataRelative.X_KEY: [],
+            DensePoseDataRelative.Y_KEY: [],
+            DensePoseDataRelative.U_KEY: [],
+            DensePoseDataRelative.V_KEY: [],
+            DensePoseDataRelative.I_KEY: [],
+        }
+        n, h, w = dp_result.shape
+        for part_id in range(1, DensePoseDataRelative.N_PART_LABELS + 1):
+            # indices - tuple of 3 1D tensors of size k
+            # 0: index along the first dimension N
+            # 1: index along H dimension
+            # 2: index along W dimension
+            indices = torch.nonzero(labels.expand(n, h, w) == part_id, as_tuple=True)
+            # values - an array of size [n, k]
+            # n: number of channels (U, V, confidences)
+            # k: number of points labeled with part_id
+            values = dp_result[indices].view(n, -1)
+            k = values.shape[1]
+            count = min(self.count_per_class, k)
+            if count <= 0:
+                continue
+            index_sample = self._produce_index_sample(values, count)
+            sampled_values = values[:, index_sample]
+            sampled_y = indices[1][index_sample] + 0.5
+            sampled_x = indices[2][index_sample] + 0.5
+            # prepare / normalize data
+            x = (sampled_x / w * 256.0).cpu().tolist()
+            y = (sampled_y / h * 256.0).cpu().tolist()
+            u = sampled_values[0].clamp(0, 1).cpu().tolist()
+            v = sampled_values[1].clamp(0, 1).cpu().tolist()
+            fine_segm_labels = [part_id] * count
+            # extend annotations
+            annotation[DensePoseDataRelative.X_KEY].extend(x)
+            annotation[DensePoseDataRelative.Y_KEY].extend(y)
+            annotation[DensePoseDataRelative.U_KEY].extend(u)
+            annotation[DensePoseDataRelative.V_KEY].extend(v)
+            annotation[DensePoseDataRelative.I_KEY].extend(fine_segm_labels)
+        return annotation
+
+    def _produce_index_sample(self, values: torch.Tensor, count: int):
+        """
+        Abstract method to produce a sample of indices to select data
+        To be implemented in descendants
+
+        Args:
+            values (torch.Tensor): an array of size [n, k] that contains
+                estimated values (U, V, confidences);
+                n: number of channels (U, V, confidences)
+                k: number of points labeled with part_id
+            count (int): number of samples to produce, should be positive and <= k
+
+        Return:
+            list(int): indices of values (along axis 1) selected as a sample
+        """
+        raise NotImplementedError
+
+    def _produce_labels_and_results(self, instance: Instances) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Method to get labels and DensePose results from an instance
+
+        Args:
+            instance (Instances): an instance of `DensePoseChartPredictorOutput`
+
+        Return:
+            labels (torch.Tensor): shape [H, W], DensePose segmentation labels
+            dp_result (torch.Tensor): shape [2, H, W], stacked DensePose results u and v
+        """
+        converter = ToChartResultConverter
+        chart_result = converter.convert(instance.pred_densepose, instance.pred_boxes)
+        labels, dp_result = chart_result.labels.cpu(), chart_result.uv.cpu()
+        return labels, dp_result
+
+    def _resample_mask(self, output: Any) -> torch.Tensor:
+        """
+        Convert DensePose predictor output to segmentation annotation - tensors of size
+        (256, 256) and type `int64`.
+
+        Args:
+            output: DensePose predictor output with the following attributes:
+             - coarse_segm: tensor of size [N, D, H, W] with unnormalized coarse
+               segmentation scores
+             - fine_segm: tensor of size [N, C, H, W] with unnormalized fine
+               segmentation scores
+        Return:
+            Tensor of size (S, S) and type `int64` with coarse segmentation annotations,
+            where S = DensePoseDataRelative.MASK_SIZE
+        """
+        sz = DensePoseDataRelative.MASK_SIZE
+        S = (
+            F.interpolate(output.coarse_segm, (sz, sz), mode="bilinear", align_corners=False)
+            .argmax(dim=1)
+            .long()
+        )
+        I = (
+            (
+                F.interpolate(
+                    output.fine_segm,
+                    (sz, sz),
+                    mode="bilinear",
+                    align_corners=False,
+                ).argmax(dim=1)
+                * (S > 0).long()
+            )
+            .squeeze()
+            .cpu()
+        )
+        # Map fine segmentation results to coarse segmentation ground truth
+        # TODO: extract this into separate classes
+        # coarse segmentation: 1 = Torso, 2 = Right Hand, 3 = Left Hand,
+        # 4 = Left Foot, 5 = Right Foot, 6 = Upper Leg Right, 7 = Upper Leg Left,
+        # 8 = Lower Leg Right, 9 = Lower Leg Left, 10 = Upper Arm Left,
+        # 11 = Upper Arm Right, 12 = Lower Arm Left, 13 = Lower Arm Right,
+        # 14 = Head
+        # fine segmentation: 1, 2 = Torso, 3 = Right Hand, 4 = Left Hand,
+        # 5 = Left Foot, 6 = Right Foot, 7, 9 = Upper Leg Right,
+        # 8, 10 = Upper Leg Left, 11, 13 = Lower Leg Right,
+        # 12, 14 = Lower Leg Left, 15, 17 = Upper Arm Left,
+        # 16, 18 = Upper Arm Right, 19, 21 = Lower Arm Left,
+        # 20, 22 = Lower Arm Right, 23, 24 = Head
+        FINE_TO_COARSE_SEGMENTATION = {
+            1: 1,
+            2: 1,
+            3: 2,
+            4: 3,
+            5: 4,
+            6: 5,
+            7: 6,
+            8: 7,
+            9: 6,
+            10: 7,
+            11: 8,
+            12: 9,
+            13: 8,
+            14: 9,
+            15: 10,
+            16: 11,
+            17: 10,
+            18: 11,
+            19: 12,
+            20: 13,
+            21: 12,
+            22: 13,
+            23: 14,
+            24: 14,
+        }
+        mask = torch.zeros((sz, sz), dtype=torch.int64, device=torch.device("cpu"))
+        for i in range(DensePoseDataRelative.N_PART_LABELS):
+            mask[I == i + 1] = FINE_TO_COARSE_SEGMENTATION[i + 1]
+        return mask
diff --git a/densepose/data/samplers/densepose_confidence_based.py b/densepose/data/samplers/densepose_confidence_based.py
new file mode 100644
index 0000000000000000000000000000000000000000..48e325b06e46817dafc0da2d984a8626d754e119
--- /dev/null
+++ b/densepose/data/samplers/densepose_confidence_based.py
@@ -0,0 +1,108 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import random
+from typing import Optional, Tuple
+import torch
+
+from densepose.converters import ToChartResultConverterWithConfidences
+
+from .densepose_base import DensePoseBaseSampler
+
+
+class DensePoseConfidenceBasedSampler(DensePoseBaseSampler):
+    """
+    Samples DensePose data from DensePose predictions.
+    Samples for each class are drawn using confidence value estimates.
+    """
+
+    def __init__(
+        self,
+        confidence_channel: str,
+        count_per_class: int = 8,
+        search_count_multiplier: Optional[float] = None,
+        search_proportion: Optional[float] = None,
+    ):
+        """
+        Constructor
+
+        Args:
+          confidence_channel (str): confidence channel to use for sampling;
+            possible values:
+              "sigma_2": confidences for UV values
+              "fine_segm_confidence": confidences for fine segmentation
+              "coarse_segm_confidence": confidences for coarse segmentation
+            (default: "sigma_2")
+          count_per_class (int): the sampler produces at most `count_per_class`
+              samples for each category (default: 8)
+          search_count_multiplier (float or None): if not None, the total number
+              of the most confident estimates of a given class to consider is
+              defined as `min(search_count_multiplier * count_per_class, N)`,
+              where `N` is the total number of estimates of the class; cannot be
+              specified together with `search_proportion` (default: None)
+          search_proportion (float or None): if not None, the total number of the
+              of the most confident estimates of a given class to consider is
+              defined as `min(max(search_proportion * N, count_per_class), N)`,
+              where `N` is the total number of estimates of the class; cannot be
+              specified together with `search_count_multiplier` (default: None)
+        """
+        super().__init__(count_per_class)
+        self.confidence_channel = confidence_channel
+        self.search_count_multiplier = search_count_multiplier
+        self.search_proportion = search_proportion
+        assert (search_count_multiplier is None) or (search_proportion is None), (
+            f"Cannot specify both search_count_multiplier (={search_count_multiplier})"
+            f"and search_proportion (={search_proportion})"
+        )
+
+    def _produce_index_sample(self, values: torch.Tensor, count: int):
+        """
+        Produce a sample of indices to select data based on confidences
+
+        Args:
+            values (torch.Tensor): an array of size [n, k] that contains
+                estimated values (U, V, confidences);
+                n: number of channels (U, V, confidences)
+                k: number of points labeled with part_id
+            count (int): number of samples to produce, should be positive and <= k
+
+        Return:
+            list(int): indices of values (along axis 1) selected as a sample
+        """
+        k = values.shape[1]
+        if k == count:
+            index_sample = list(range(k))
+        else:
+            # take the best count * search_count_multiplier pixels,
+            # sample from them uniformly
+            # (here best = smallest variance)
+            _, sorted_confidence_indices = torch.sort(values[2])
+            if self.search_count_multiplier is not None:
+                search_count = min(int(count * self.search_count_multiplier), k)
+            elif self.search_proportion is not None:
+                search_count = min(max(int(k * self.search_proportion), count), k)
+            else:
+                search_count = min(count, k)
+            sample_from_top = random.sample(range(search_count), count)
+            index_sample = sorted_confidence_indices[:search_count][sample_from_top]
+        return index_sample
+
+    def _produce_labels_and_results(self, instance) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Method to get labels and DensePose results from an instance, with confidences
+
+        Args:
+            instance (Instances): an instance of `DensePoseChartPredictorOutputWithConfidences`
+
+        Return:
+            labels (torch.Tensor): shape [H, W], DensePose segmentation labels
+            dp_result (torch.Tensor): shape [3, H, W], DensePose results u and v
+                stacked with the confidence channel
+        """
+        converter = ToChartResultConverterWithConfidences
+        chart_result = converter.convert(instance.pred_densepose, instance.pred_boxes)
+        labels, dp_result = chart_result.labels.cpu(), chart_result.uv.cpu()
+        dp_result = torch.cat(
+            (dp_result, getattr(chart_result, self.confidence_channel)[None].cpu())
+        )
+
+        return labels, dp_result
diff --git a/densepose/data/samplers/densepose_cse_base.py b/densepose/data/samplers/densepose_cse_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..845545c1438b9d2a4fbb4c6dac0642461a7e539f
--- /dev/null
+++ b/densepose/data/samplers/densepose_cse_base.py
@@ -0,0 +1,139 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any, Dict, List, Tuple
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.converters.base import IntTupleBox
+from densepose.data.utils import get_class_to_mesh_name_mapping
+from densepose.modeling.cse.utils import squared_euclidean_distance_matrix
+from densepose.structures import DensePoseDataRelative
+
+from .densepose_base import DensePoseBaseSampler
+
+
+class DensePoseCSEBaseSampler(DensePoseBaseSampler):
+    """
+    Base DensePose sampler to produce DensePose data from DensePose predictions.
+    Samples for each class are drawn according to some distribution over all pixels estimated
+    to belong to that class.
+    """
+
+    def __init__(
+        self,
+        cfg: CfgNode,
+        use_gt_categories: bool,
+        embedder: torch.nn.Module,
+        count_per_class: int = 8,
+    ):
+        """
+        Constructor
+
+        Args:
+          cfg (CfgNode): the config of the model
+          embedder (torch.nn.Module): necessary to compute mesh vertex embeddings
+          count_per_class (int): the sampler produces at most `count_per_class`
+              samples for each category
+        """
+        super().__init__(count_per_class)
+        self.embedder = embedder
+        self.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg)
+        self.use_gt_categories = use_gt_categories
+
+    def _sample(self, instance: Instances, bbox_xywh: IntTupleBox) -> Dict[str, List[Any]]:
+        """
+        Sample DensPoseDataRelative from estimation results
+        """
+        if self.use_gt_categories:
+            instance_class = instance.dataset_classes.tolist()[0]
+        else:
+            instance_class = instance.pred_classes.tolist()[0]
+        mesh_name = self.class_to_mesh_name[instance_class]
+
+        annotation = {
+            DensePoseDataRelative.X_KEY: [],
+            DensePoseDataRelative.Y_KEY: [],
+            DensePoseDataRelative.VERTEX_IDS_KEY: [],
+            DensePoseDataRelative.MESH_NAME_KEY: mesh_name,
+        }
+
+        mask, embeddings, other_values = self._produce_mask_and_results(instance, bbox_xywh)
+        indices = torch.nonzero(mask, as_tuple=True)
+        selected_embeddings = embeddings.permute(1, 2, 0)[indices].cpu()
+        values = other_values[:, indices[0], indices[1]]
+        k = values.shape[1]
+
+        count = min(self.count_per_class, k)
+        if count <= 0:
+            return annotation
+
+        index_sample = self._produce_index_sample(values, count)
+        closest_vertices = squared_euclidean_distance_matrix(
+            selected_embeddings[index_sample], self.embedder(mesh_name)
+        )
+        closest_vertices = torch.argmin(closest_vertices, dim=1)
+
+        sampled_y = indices[0][index_sample] + 0.5
+        sampled_x = indices[1][index_sample] + 0.5
+        # prepare / normalize data
+        _, _, w, h = bbox_xywh
+        x = (sampled_x / w * 256.0).cpu().tolist()
+        y = (sampled_y / h * 256.0).cpu().tolist()
+        # extend annotations
+        annotation[DensePoseDataRelative.X_KEY].extend(x)
+        annotation[DensePoseDataRelative.Y_KEY].extend(y)
+        annotation[DensePoseDataRelative.VERTEX_IDS_KEY].extend(closest_vertices.cpu().tolist())
+        return annotation
+
+    def _produce_mask_and_results(
+        self, instance: Instances, bbox_xywh: IntTupleBox
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Method to get labels and DensePose results from an instance
+
+        Args:
+            instance (Instances): an instance of `DensePoseEmbeddingPredictorOutput`
+            bbox_xywh (IntTupleBox): the corresponding bounding box
+
+        Return:
+            mask (torch.Tensor): shape [H, W], DensePose segmentation mask
+            embeddings (Tuple[torch.Tensor]): a tensor of shape [D, H, W],
+                DensePose CSE Embeddings
+            other_values (Tuple[torch.Tensor]): a tensor of shape [0, H, W],
+                for potential other values
+        """
+        densepose_output = instance.pred_densepose
+        S = densepose_output.coarse_segm
+        E = densepose_output.embedding
+        _, _, w, h = bbox_xywh
+        embeddings = F.interpolate(E, size=(h, w), mode="bilinear")[0]
+        coarse_segm_resized = F.interpolate(S, size=(h, w), mode="bilinear")[0]
+        mask = coarse_segm_resized.argmax(0) > 0
+        other_values = torch.empty((0, h, w), device=E.device)
+        return mask, embeddings, other_values
+
+    def _resample_mask(self, output: Any) -> torch.Tensor:
+        """
+        Convert DensePose predictor output to segmentation annotation - tensors of size
+        (256, 256) and type `int64`.
+
+        Args:
+            output: DensePose predictor output with the following attributes:
+             - coarse_segm: tensor of size [N, D, H, W] with unnormalized coarse
+               segmentation scores
+        Return:
+            Tensor of size (S, S) and type `int64` with coarse segmentation annotations,
+            where S = DensePoseDataRelative.MASK_SIZE
+        """
+        sz = DensePoseDataRelative.MASK_SIZE
+        mask = (
+            F.interpolate(output.coarse_segm, (sz, sz), mode="bilinear", align_corners=False)
+            .argmax(dim=1)
+            .long()
+            .squeeze()
+            .cpu()
+        )
+        return mask
diff --git a/densepose/data/samplers/densepose_cse_confidence_based.py b/densepose/data/samplers/densepose_cse_confidence_based.py
new file mode 100644
index 0000000000000000000000000000000000000000..964b7f4ac41d2e1bb3da1cf6861af7f644b859fc
--- /dev/null
+++ b/densepose/data/samplers/densepose_cse_confidence_based.py
@@ -0,0 +1,119 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import random
+from typing import Optional, Tuple
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.converters.base import IntTupleBox
+
+from .densepose_cse_base import DensePoseCSEBaseSampler
+
+
+class DensePoseCSEConfidenceBasedSampler(DensePoseCSEBaseSampler):
+    """
+    Samples DensePose data from DensePose predictions.
+    Samples for each class are drawn using confidence value estimates.
+    """
+
+    def __init__(
+        self,
+        cfg: CfgNode,
+        use_gt_categories: bool,
+        embedder: torch.nn.Module,
+        confidence_channel: str,
+        count_per_class: int = 8,
+        search_count_multiplier: Optional[float] = None,
+        search_proportion: Optional[float] = None,
+    ):
+        """
+        Constructor
+
+        Args:
+          cfg (CfgNode): the config of the model
+          embedder (torch.nn.Module): necessary to compute mesh vertex embeddings
+          confidence_channel (str): confidence channel to use for sampling;
+            possible values:
+              "coarse_segm_confidence": confidences for coarse segmentation
+            (default: "coarse_segm_confidence")
+          count_per_class (int): the sampler produces at most `count_per_class`
+              samples for each category (default: 8)
+          search_count_multiplier (float or None): if not None, the total number
+              of the most confident estimates of a given class to consider is
+              defined as `min(search_count_multiplier * count_per_class, N)`,
+              where `N` is the total number of estimates of the class; cannot be
+              specified together with `search_proportion` (default: None)
+          search_proportion (float or None): if not None, the total number of the
+              of the most confident estimates of a given class to consider is
+              defined as `min(max(search_proportion * N, count_per_class), N)`,
+              where `N` is the total number of estimates of the class; cannot be
+              specified together with `search_count_multiplier` (default: None)
+        """
+        super().__init__(cfg, use_gt_categories, embedder, count_per_class)
+        self.confidence_channel = confidence_channel
+        self.search_count_multiplier = search_count_multiplier
+        self.search_proportion = search_proportion
+        assert (search_count_multiplier is None) or (search_proportion is None), (
+            f"Cannot specify both search_count_multiplier (={search_count_multiplier})"
+            f"and search_proportion (={search_proportion})"
+        )
+
+    def _produce_index_sample(self, values: torch.Tensor, count: int):
+        """
+        Produce a sample of indices to select data based on confidences
+
+        Args:
+            values (torch.Tensor): a tensor of length k that contains confidences
+                k: number of points labeled with part_id
+            count (int): number of samples to produce, should be positive and <= k
+
+        Return:
+            list(int): indices of values (along axis 1) selected as a sample
+        """
+        k = values.shape[1]
+        if k == count:
+            index_sample = list(range(k))
+        else:
+            # take the best count * search_count_multiplier pixels,
+            # sample from them uniformly
+            # (here best = smallest variance)
+            _, sorted_confidence_indices = torch.sort(values[0])
+            if self.search_count_multiplier is not None:
+                search_count = min(int(count * self.search_count_multiplier), k)
+            elif self.search_proportion is not None:
+                search_count = min(max(int(k * self.search_proportion), count), k)
+            else:
+                search_count = min(count, k)
+            sample_from_top = random.sample(range(search_count), count)
+            index_sample = sorted_confidence_indices[-search_count:][sample_from_top]
+        return index_sample
+
+    def _produce_mask_and_results(
+        self, instance: Instances, bbox_xywh: IntTupleBox
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Method to get labels and DensePose results from an instance
+
+        Args:
+            instance (Instances): an instance of
+                `DensePoseEmbeddingPredictorOutputWithConfidences`
+            bbox_xywh (IntTupleBox): the corresponding bounding box
+
+        Return:
+            mask (torch.Tensor): shape [H, W], DensePose segmentation mask
+            embeddings (Tuple[torch.Tensor]): a tensor of shape [D, H, W]
+                DensePose CSE Embeddings
+            other_values: a tensor of shape [1, H, W], DensePose CSE confidence
+        """
+        _, _, w, h = bbox_xywh
+        densepose_output = instance.pred_densepose
+        mask, embeddings, _ = super()._produce_mask_and_results(instance, bbox_xywh)
+        other_values = F.interpolate(
+            getattr(densepose_output, self.confidence_channel),
+            size=(h, w),
+            mode="bilinear",
+        )[0].cpu()
+        return mask, embeddings, other_values
diff --git a/densepose/data/samplers/densepose_cse_uniform.py b/densepose/data/samplers/densepose_cse_uniform.py
new file mode 100644
index 0000000000000000000000000000000000000000..567636cc7dfbcc9167dd7f4aa2b752c6e53d311f
--- /dev/null
+++ b/densepose/data/samplers/densepose_cse_uniform.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .densepose_cse_base import DensePoseCSEBaseSampler
+from .densepose_uniform import DensePoseUniformSampler
+
+
+class DensePoseCSEUniformSampler(DensePoseCSEBaseSampler, DensePoseUniformSampler):
+    """
+    Uniform Sampler for CSE
+    """
+
+    pass
diff --git a/densepose/data/samplers/densepose_uniform.py b/densepose/data/samplers/densepose_uniform.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d72cc30c9342b36efd6a7e80e55bf088b5c797c
--- /dev/null
+++ b/densepose/data/samplers/densepose_uniform.py
@@ -0,0 +1,41 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import random
+import torch
+
+from .densepose_base import DensePoseBaseSampler
+
+
+class DensePoseUniformSampler(DensePoseBaseSampler):
+    """
+    Samples DensePose data from DensePose predictions.
+    Samples for each class are drawn uniformly over all pixels estimated
+    to belong to that class.
+    """
+
+    def __init__(self, count_per_class: int = 8):
+        """
+        Constructor
+
+        Args:
+          count_per_class (int): the sampler produces at most `count_per_class`
+              samples for each category
+        """
+        super().__init__(count_per_class)
+
+    def _produce_index_sample(self, values: torch.Tensor, count: int):
+        """
+        Produce a uniform sample of indices to select data
+
+        Args:
+            values (torch.Tensor): an array of size [n, k] that contains
+                estimated values (U, V, confidences);
+                n: number of channels (U, V, confidences)
+                k: number of points labeled with part_id
+            count (int): number of samples to produce, should be positive and <= k
+
+        Return:
+            list(int): indices of values (along axis 1) selected as a sample
+        """
+        k = values.shape[1]
+        return random.sample(range(k), count)
diff --git a/densepose/data/samplers/mask_from_densepose.py b/densepose/data/samplers/mask_from_densepose.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e6e812ba5af4675a81aec3ef8fd9b96d53325cc
--- /dev/null
+++ b/densepose/data/samplers/mask_from_densepose.py
@@ -0,0 +1,28 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from detectron2.structures import BitMasks, Instances
+
+from densepose.converters import ToMaskConverter
+
+
+class MaskFromDensePoseSampler:
+    """
+    Produce mask GT from DensePose predictions
+    This sampler simply converts DensePose predictions to BitMasks
+    that a contain a bool tensor of the size of the input image
+    """
+
+    def __call__(self, instances: Instances) -> BitMasks:
+        """
+        Converts predicted data from `instances` into the GT mask data
+
+        Args:
+            instances (Instances): predicted results, expected to have `pred_densepose` field
+
+        Returns:
+            Boolean Tensor of the size of the input image that has non-zero
+            values at pixels that are estimated to belong to the detected object
+        """
+        return ToMaskConverter.convert(
+            instances.pred_densepose, instances.pred_boxes, instances.image_size
+        )
diff --git a/densepose/data/samplers/prediction_to_gt.py b/densepose/data/samplers/prediction_to_gt.py
new file mode 100644
index 0000000000000000000000000000000000000000..3881fa5503c32c9e2f0602971971995f1211e054
--- /dev/null
+++ b/densepose/data/samplers/prediction_to_gt.py
@@ -0,0 +1,98 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional
+
+from detectron2.structures import Instances
+
+ModelOutput = Dict[str, Any]
+SampledData = Dict[str, Any]
+
+
+@dataclass
+class _Sampler:
+    """
+    Sampler registry entry that contains:
+     - src (str): source field to sample from (deleted after sampling)
+     - dst (Optional[str]): destination field to sample to, if not None
+     - func (Optional[Callable: Any -> Any]): function that performs sampling,
+         if None, reference copy is performed
+    """
+
+    src: str
+    dst: Optional[str]
+    func: Optional[Callable[[Any], Any]]
+
+
+class PredictionToGroundTruthSampler:
+    """
+    Sampler implementation that converts predictions to GT using registered
+    samplers for different fields of `Instances`.
+    """
+
+    def __init__(self, dataset_name: str = ""):
+        self.dataset_name = dataset_name
+        self._samplers = {}
+        self.register_sampler("pred_boxes", "gt_boxes", None)
+        self.register_sampler("pred_classes", "gt_classes", None)
+        # delete scores
+        self.register_sampler("scores")
+
+    def __call__(self, model_output: List[ModelOutput]) -> List[SampledData]:
+        """
+        Transform model output into ground truth data through sampling
+
+        Args:
+          model_output (Dict[str, Any]): model output
+        Returns:
+          Dict[str, Any]: sampled data
+        """
+        for model_output_i in model_output:
+            instances: Instances = model_output_i["instances"]
+            # transform data in each field
+            for _, sampler in self._samplers.items():
+                if not instances.has(sampler.src) or sampler.dst is None:
+                    continue
+                if sampler.func is None:
+                    instances.set(sampler.dst, instances.get(sampler.src))
+                else:
+                    instances.set(sampler.dst, sampler.func(instances))
+            # delete model output data that was transformed
+            for _, sampler in self._samplers.items():
+                if sampler.src != sampler.dst and instances.has(sampler.src):
+                    instances.remove(sampler.src)
+            model_output_i["dataset"] = self.dataset_name
+        return model_output
+
+    def register_sampler(
+        self,
+        prediction_attr: str,
+        gt_attr: Optional[str] = None,
+        func: Optional[Callable[[Any], Any]] = None,
+    ):
+        """
+        Register sampler for a field
+
+        Args:
+          prediction_attr (str): field to replace with a sampled value
+          gt_attr (Optional[str]): field to store the sampled value to, if not None
+          func (Optional[Callable: Any -> Any]): sampler function
+        """
+        self._samplers[(prediction_attr, gt_attr)] = _Sampler(
+            src=prediction_attr, dst=gt_attr, func=func
+        )
+
+    def remove_sampler(
+        self,
+        prediction_attr: str,
+        gt_attr: Optional[str] = None,
+    ):
+        """
+        Remove sampler for a field
+
+        Args:
+          prediction_attr (str): field to replace with a sampled value
+          gt_attr (Optional[str]): field to store the sampled value to, if not None
+        """
+        assert (prediction_attr, gt_attr) in self._samplers
+        del self._samplers[(prediction_attr, gt_attr)]
diff --git a/densepose/data/transform/__init__.py b/densepose/data/transform/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..369e1b278899b225d55bfc729514873b4259c7b9
--- /dev/null
+++ b/densepose/data/transform/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .image import ImageResizeTransform
diff --git a/densepose/data/transform/__pycache__/__init__.cpython-310.pyc b/densepose/data/transform/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8103b79b934070d1f96b2b493434fd0c55d682c7
Binary files /dev/null and b/densepose/data/transform/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/data/transform/__pycache__/__init__.cpython-39.pyc b/densepose/data/transform/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70ef2e5e174cbe615ecf32e141873b247e3cbc07
Binary files /dev/null and b/densepose/data/transform/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/transform/__pycache__/image.cpython-310.pyc b/densepose/data/transform/__pycache__/image.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c27c3f86ed927f36c08ad4c4142d61914baaffb
Binary files /dev/null and b/densepose/data/transform/__pycache__/image.cpython-310.pyc differ
diff --git a/densepose/data/transform/__pycache__/image.cpython-39.pyc b/densepose/data/transform/__pycache__/image.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a9fad36268062cb8a72351094609708e9c1d73d
Binary files /dev/null and b/densepose/data/transform/__pycache__/image.cpython-39.pyc differ
diff --git a/densepose/data/transform/image.py b/densepose/data/transform/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..8139b67841633841199a1aae3b25e326afaaf5e2
--- /dev/null
+++ b/densepose/data/transform/image.py
@@ -0,0 +1,39 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import torch
+
+
+class ImageResizeTransform:
+    """
+    Transform that resizes images loaded from a dataset
+    (BGR data in NCHW channel order, typically uint8) to a format ready to be
+    consumed by DensePose training (BGR float32 data in NCHW channel order)
+    """
+
+    def __init__(self, min_size: int = 800, max_size: int = 1333):
+        self.min_size = min_size
+        self.max_size = max_size
+
+    def __call__(self, images: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            images (torch.Tensor): tensor of size [N, 3, H, W] that contains
+                BGR data (typically in uint8)
+        Returns:
+            images (torch.Tensor): tensor of size [N, 3, H1, W1] where
+                H1 and W1 are chosen to respect the specified min and max sizes
+                and preserve the original aspect ratio, the data channels
+                follow BGR order and the data type is `torch.float32`
+        """
+        # resize with min size
+        images = images.float()
+        min_size = min(images.shape[-2:])
+        max_size = max(images.shape[-2:])
+        scale = min(self.min_size / min_size, self.max_size / max_size)
+        images = torch.nn.functional.interpolate(
+            images,
+            scale_factor=scale,
+            mode="bilinear",
+            align_corners=False,
+        )
+        return images
diff --git a/densepose/data/utils.py b/densepose/data/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9878c31d03bd4114425f89dd1c6dda74337fe2e2
--- /dev/null
+++ b/densepose/data/utils.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import os
+from typing import Dict, Optional
+
+from detectron2.config import CfgNode
+
+
+def is_relative_local_path(path: str) -> bool:
+    path_str = os.fsdecode(path)
+    return ("://" not in path_str) and not os.path.isabs(path)
+
+
+def maybe_prepend_base_path(base_path: Optional[str], path: str):
+    """
+    Prepends the provided path with a base path prefix if:
+    1) base path is not None;
+    2) path is a local path
+    """
+    if base_path is None:
+        return path
+    if is_relative_local_path(path):
+        return os.path.join(base_path, path)
+    return path
+
+
+def get_class_to_mesh_name_mapping(cfg: CfgNode) -> Dict[int, str]:
+    return {
+        int(class_id): mesh_name
+        for class_id, mesh_name in cfg.DATASETS.CLASS_TO_MESH_NAME_MAPPING.items()
+    }
+
+
+def get_category_to_class_mapping(dataset_cfg: CfgNode) -> Dict[str, int]:
+    return {
+        category: int(class_id)
+        for category, class_id in dataset_cfg.CATEGORY_TO_CLASS_MAPPING.items()
+    }
diff --git a/densepose/data/video/__init__.py b/densepose/data/video/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..72406e153b688461bfcb0ef21e35020399239309
--- /dev/null
+++ b/densepose/data/video/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .frame_selector import (
+    FrameSelectionStrategy,
+    RandomKFramesSelector,
+    FirstKFramesSelector,
+    LastKFramesSelector,
+    FrameTsList,
+    FrameSelector,
+)
+
+from .video_keyframe_dataset import (
+    VideoKeyframeDataset,
+    video_list_from_file,
+    list_keyframes,
+    read_keyframes,
+)
diff --git a/densepose/data/video/__pycache__/__init__.cpython-310.pyc b/densepose/data/video/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dfd4603cfc85cee30831b47799ff676fbcf6d173
Binary files /dev/null and b/densepose/data/video/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/data/video/__pycache__/__init__.cpython-39.pyc b/densepose/data/video/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2dbbd022a9a3b3c40b823df16aa6e95e187db30f
Binary files /dev/null and b/densepose/data/video/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/video/__pycache__/frame_selector.cpython-310.pyc b/densepose/data/video/__pycache__/frame_selector.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5a07d807af8d065a482789b99cb718aed2842eb5
Binary files /dev/null and b/densepose/data/video/__pycache__/frame_selector.cpython-310.pyc differ
diff --git a/densepose/data/video/__pycache__/frame_selector.cpython-39.pyc b/densepose/data/video/__pycache__/frame_selector.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ffc71638501f2b15107d3f913d726eaa793afef
Binary files /dev/null and b/densepose/data/video/__pycache__/frame_selector.cpython-39.pyc differ
diff --git a/densepose/data/video/__pycache__/video_keyframe_dataset.cpython-310.pyc b/densepose/data/video/__pycache__/video_keyframe_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a978056b07d7ed5010bbec9bad2e30d18710a835
Binary files /dev/null and b/densepose/data/video/__pycache__/video_keyframe_dataset.cpython-310.pyc differ
diff --git a/densepose/data/video/__pycache__/video_keyframe_dataset.cpython-39.pyc b/densepose/data/video/__pycache__/video_keyframe_dataset.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ffcc9bb23c0d2f23bfa56c14dbfb93ce50824a1d
Binary files /dev/null and b/densepose/data/video/__pycache__/video_keyframe_dataset.cpython-39.pyc differ
diff --git a/densepose/data/video/frame_selector.py b/densepose/data/video/frame_selector.py
new file mode 100644
index 0000000000000000000000000000000000000000..c28f0e96475537319ff584f73fa422f838ae7b40
--- /dev/null
+++ b/densepose/data/video/frame_selector.py
@@ -0,0 +1,87 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import random
+from collections.abc import Callable
+from enum import Enum
+from typing import Callable as TCallable
+from typing import List
+
+FrameTsList = List[int]
+FrameSelector = TCallable[[FrameTsList], FrameTsList]
+
+
+class FrameSelectionStrategy(Enum):
+    """
+    Frame selection strategy used with videos:
+     - "random_k": select k random frames
+     - "first_k": select k first frames
+     - "last_k": select k last frames
+     - "all": select all frames
+    """
+
+    # fmt: off
+    RANDOM_K = "random_k"
+    FIRST_K  = "first_k"
+    LAST_K   = "last_k"
+    ALL      = "all"
+    # fmt: on
+
+
+class RandomKFramesSelector(Callable):  # pyre-ignore[39]
+    """
+    Selector that retains at most `k` random frames
+    """
+
+    def __init__(self, k: int):
+        self.k = k
+
+    def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
+        """
+        Select `k` random frames
+
+        Args:
+          frames_tss (List[int]): timestamps of input frames
+        Returns:
+          List[int]: timestamps of selected frames
+        """
+        return random.sample(frame_tss, min(self.k, len(frame_tss)))
+
+
+class FirstKFramesSelector(Callable):  # pyre-ignore[39]
+    """
+    Selector that retains at most `k` first frames
+    """
+
+    def __init__(self, k: int):
+        self.k = k
+
+    def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
+        """
+        Select `k` first frames
+
+        Args:
+          frames_tss (List[int]): timestamps of input frames
+        Returns:
+          List[int]: timestamps of selected frames
+        """
+        return frame_tss[: self.k]
+
+
+class LastKFramesSelector(Callable):  # pyre-ignore[39]
+    """
+    Selector that retains at most `k` last frames from video data
+    """
+
+    def __init__(self, k: int):
+        self.k = k
+
+    def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
+        """
+        Select `k` last frames
+
+        Args:
+          frames_tss (List[int]): timestamps of input frames
+        Returns:
+          List[int]: timestamps of selected frames
+        """
+        return frame_tss[-self.k :]
diff --git a/densepose/data/video/video_keyframe_dataset.py b/densepose/data/video/video_keyframe_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..be379d12bff5b6348087ba343d3c027b52524136
--- /dev/null
+++ b/densepose/data/video/video_keyframe_dataset.py
@@ -0,0 +1,300 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import csv
+import logging
+import numpy as np
+from typing import Any, Callable, Dict, List, Optional, Union
+import av
+import torch
+from torch.utils.data.dataset import Dataset
+
+from detectron2.utils.file_io import PathManager
+
+from ..utils import maybe_prepend_base_path
+from .frame_selector import FrameSelector, FrameTsList
+
+FrameList = List[av.frame.Frame]  # pyre-ignore[16]
+FrameTransform = Callable[[torch.Tensor], torch.Tensor]
+
+
+def list_keyframes(video_fpath: str, video_stream_idx: int = 0) -> FrameTsList:
+    """
+    Traverses all keyframes of a video file. Returns a list of keyframe
+    timestamps. Timestamps are counts in timebase units.
+
+    Args:
+       video_fpath (str): Video file path
+       video_stream_idx (int): Video stream index (default: 0)
+    Returns:
+       List[int]: list of keyframe timestaps (timestamp is a count in timebase
+           units)
+    """
+    try:
+        with PathManager.open(video_fpath, "rb") as io:
+            container = av.open(io, mode="r")
+            stream = container.streams.video[video_stream_idx]
+            keyframes = []
+            pts = -1
+            # Note: even though we request forward seeks for keyframes, sometimes
+            # a keyframe in backwards direction is returned. We introduce tolerance
+            # as a max count of ignored backward seeks
+            tolerance_backward_seeks = 2
+            while True:
+                try:
+                    container.seek(pts + 1, backward=False, any_frame=False, stream=stream)
+                except av.AVError as e:
+                    # the exception occurs when the video length is exceeded,
+                    # we then return whatever data we've already collected
+                    logger = logging.getLogger(__name__)
+                    logger.debug(
+                        f"List keyframes: Error seeking video file {video_fpath}, "
+                        f"video stream {video_stream_idx}, pts {pts + 1}, AV error: {e}"
+                    )
+                    return keyframes
+                except OSError as e:
+                    logger = logging.getLogger(__name__)
+                    logger.warning(
+                        f"List keyframes: Error seeking video file {video_fpath}, "
+                        f"video stream {video_stream_idx}, pts {pts + 1}, OS error: {e}"
+                    )
+                    return []
+                packet = next(container.demux(video=video_stream_idx))
+                if packet.pts is not None and packet.pts <= pts:
+                    logger = logging.getLogger(__name__)
+                    logger.warning(
+                        f"Video file {video_fpath}, stream {video_stream_idx}: "
+                        f"bad seek for packet {pts + 1} (got packet {packet.pts}), "
+                        f"tolerance {tolerance_backward_seeks}."
+                    )
+                    tolerance_backward_seeks -= 1
+                    if tolerance_backward_seeks == 0:
+                        return []
+                    pts += 1
+                    continue
+                tolerance_backward_seeks = 2
+                pts = packet.pts
+                if pts is None:
+                    return keyframes
+                if packet.is_keyframe:
+                    keyframes.append(pts)
+            return keyframes
+    except OSError as e:
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            f"List keyframes: Error opening video file container {video_fpath}, " f"OS error: {e}"
+        )
+    except RuntimeError as e:
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            f"List keyframes: Error opening video file container {video_fpath}, "
+            f"Runtime error: {e}"
+        )
+    return []
+
+
+def read_keyframes(
+    video_fpath: str, keyframes: FrameTsList, video_stream_idx: int = 0
+) -> FrameList:  # pyre-ignore[11]
+    """
+    Reads keyframe data from a video file.
+
+    Args:
+        video_fpath (str): Video file path
+        keyframes (List[int]): List of keyframe timestamps (as counts in
+            timebase units to be used in container seek operations)
+        video_stream_idx (int): Video stream index (default: 0)
+    Returns:
+        List[Frame]: list of frames that correspond to the specified timestamps
+    """
+    try:
+        with PathManager.open(video_fpath, "rb") as io:
+            container = av.open(io)
+            stream = container.streams.video[video_stream_idx]
+            frames = []
+            for pts in keyframes:
+                try:
+                    container.seek(pts, any_frame=False, stream=stream)
+                    frame = next(container.decode(video=0))
+                    frames.append(frame)
+                except av.AVError as e:
+                    logger = logging.getLogger(__name__)
+                    logger.warning(
+                        f"Read keyframes: Error seeking video file {video_fpath}, "
+                        f"video stream {video_stream_idx}, pts {pts}, AV error: {e}"
+                    )
+                    container.close()
+                    return frames
+                except OSError as e:
+                    logger = logging.getLogger(__name__)
+                    logger.warning(
+                        f"Read keyframes: Error seeking video file {video_fpath}, "
+                        f"video stream {video_stream_idx}, pts {pts}, OS error: {e}"
+                    )
+                    container.close()
+                    return frames
+                except StopIteration:
+                    logger = logging.getLogger(__name__)
+                    logger.warning(
+                        f"Read keyframes: Error decoding frame from {video_fpath}, "
+                        f"video stream {video_stream_idx}, pts {pts}"
+                    )
+                    container.close()
+                    return frames
+
+            container.close()
+            return frames
+    except OSError as e:
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            f"Read keyframes: Error opening video file container {video_fpath}, OS error: {e}"
+        )
+    except RuntimeError as e:
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            f"Read keyframes: Error opening video file container {video_fpath}, Runtime error: {e}"
+        )
+    return []
+
+
+def video_list_from_file(video_list_fpath: str, base_path: Optional[str] = None):
+    """
+    Create a list of paths to video files from a text file.
+
+    Args:
+        video_list_fpath (str): path to a plain text file with the list of videos
+        base_path (str): base path for entries from the video list (default: None)
+    """
+    video_list = []
+    with PathManager.open(video_list_fpath, "r") as io:
+        for line in io:
+            video_list.append(maybe_prepend_base_path(base_path, str(line.strip())))
+    return video_list
+
+
+def read_keyframe_helper_data(fpath: str):
+    """
+    Read keyframe data from a file in CSV format: the header should contain
+    "video_id" and "keyframes" fields. Value specifications are:
+      video_id: int
+      keyframes: list(int)
+    Example of contents:
+      video_id,keyframes
+      2,"[1,11,21,31,41,51,61,71,81]"
+
+    Args:
+        fpath (str): File containing keyframe data
+
+    Return:
+        video_id_to_keyframes (dict: int -> list(int)): for a given video ID it
+          contains a list of keyframes for that video
+    """
+    video_id_to_keyframes = {}
+    try:
+        with PathManager.open(fpath, "r") as io:
+            csv_reader = csv.reader(io)
+            header = next(csv_reader)
+            video_id_idx = header.index("video_id")
+            keyframes_idx = header.index("keyframes")
+            for row in csv_reader:
+                video_id = int(row[video_id_idx])
+                assert (
+                    video_id not in video_id_to_keyframes
+                ), f"Duplicate keyframes entry for video {fpath}"
+                video_id_to_keyframes[video_id] = (
+                    [int(v) for v in row[keyframes_idx][1:-1].split(",")]
+                    if len(row[keyframes_idx]) > 2
+                    else []
+                )
+    except Exception as e:
+        logger = logging.getLogger(__name__)
+        logger.warning(f"Error reading keyframe helper data from {fpath}: {e}")
+    return video_id_to_keyframes
+
+
+class VideoKeyframeDataset(Dataset):
+    """
+    Dataset that provides keyframes for a set of videos.
+    """
+
+    _EMPTY_FRAMES = torch.empty((0, 3, 1, 1))
+
+    def __init__(
+        self,
+        video_list: List[str],
+        category_list: Union[str, List[str], None] = None,
+        frame_selector: Optional[FrameSelector] = None,
+        transform: Optional[FrameTransform] = None,
+        keyframe_helper_fpath: Optional[str] = None,
+    ):
+        """
+        Dataset constructor
+
+        Args:
+            video_list (List[str]): list of paths to video files
+            category_list (Union[str, List[str], None]): list of animal categories for each
+                video file. If it is a string, or None, this applies to all videos
+            frame_selector (Callable: KeyFrameList -> KeyFrameList):
+                selects keyframes to process, keyframes are given by
+                packet timestamps in timebase counts. If None, all keyframes
+                are selected (default: None)
+            transform (Callable: torch.Tensor -> torch.Tensor):
+                transforms a batch of RGB images (tensors of size [B, 3, H, W]),
+                returns a tensor of the same size. If None, no transform is
+                applied (default: None)
+
+        """
+        if type(category_list) == list:
+            self.category_list = category_list
+        else:
+            self.category_list = [category_list] * len(video_list)
+        assert len(video_list) == len(
+            self.category_list
+        ), "length of video and category lists must be equal"
+        self.video_list = video_list
+        self.frame_selector = frame_selector
+        self.transform = transform
+        self.keyframe_helper_data = (
+            read_keyframe_helper_data(keyframe_helper_fpath)
+            if keyframe_helper_fpath is not None
+            else None
+        )
+
+    def __getitem__(self, idx: int) -> Dict[str, Any]:
+        """
+        Gets selected keyframes from a given video
+
+        Args:
+            idx (int): video index in the video list file
+        Returns:
+            A dictionary containing two keys:
+                images (torch.Tensor): tensor of size [N, H, W, 3] or of size
+                    defined by the transform that contains keyframes data
+                categories (List[str]): categories of the frames
+        """
+        categories = [self.category_list[idx]]
+        fpath = self.video_list[idx]
+        keyframes = (
+            list_keyframes(fpath)
+            if self.keyframe_helper_data is None or idx not in self.keyframe_helper_data
+            else self.keyframe_helper_data[idx]
+        )
+        transform = self.transform
+        frame_selector = self.frame_selector
+        if not keyframes:
+            return {"images": self._EMPTY_FRAMES, "categories": []}
+        if frame_selector is not None:
+            keyframes = frame_selector(keyframes)
+        frames = read_keyframes(fpath, keyframes)
+        if not frames:
+            return {"images": self._EMPTY_FRAMES, "categories": []}
+        frames = np.stack([frame.to_rgb().to_ndarray() for frame in frames])
+        frames = torch.as_tensor(frames, device=torch.device("cpu"))
+        frames = frames[..., [2, 1, 0]]  # RGB -> BGR
+        frames = frames.permute(0, 3, 1, 2).float()  # NHWC -> NCHW
+        if transform is not None:
+            frames = transform(frames)
+        return {"images": frames, "categories": categories}
+
+    def __len__(self):
+        return len(self.video_list)
diff --git a/densepose/engine/__init__.py b/densepose/engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..539b93a7beca07d229a6b6d387f885469242ad86
--- /dev/null
+++ b/densepose/engine/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .trainer import Trainer
diff --git a/densepose/engine/trainer.py b/densepose/engine/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8ffe82c3d64d01ae36bb3c07cc6d75950937389
--- /dev/null
+++ b/densepose/engine/trainer.py
@@ -0,0 +1,258 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import os
+from collections import OrderedDict
+from typing import List, Optional, Union
+import torch
+from torch import nn
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import CfgNode
+from detectron2.engine import DefaultTrainer
+from detectron2.evaluation import (
+    DatasetEvaluator,
+    DatasetEvaluators,
+    inference_on_dataset,
+    print_csv_format,
+)
+from detectron2.solver.build import get_default_optimizer_params, maybe_add_gradient_clipping
+from detectron2.utils import comm
+from detectron2.utils.events import EventWriter, get_event_storage
+
+from densepose import DensePoseDatasetMapperTTA, DensePoseGeneralizedRCNNWithTTA, load_from_cfg
+from densepose.data import (
+    DatasetMapper,
+    build_combined_loader,
+    build_detection_test_loader,
+    build_detection_train_loader,
+    build_inference_based_loaders,
+    has_inference_based_loaders,
+)
+from densepose.evaluation.d2_evaluator_adapter import Detectron2COCOEvaluatorAdapter
+from densepose.evaluation.evaluator import DensePoseCOCOEvaluator, build_densepose_evaluator_storage
+from densepose.modeling.cse import Embedder
+
+
+class SampleCountingLoader:
+    def __init__(self, loader):
+        self.loader = loader
+
+    def __iter__(self):
+        it = iter(self.loader)
+        storage = get_event_storage()
+        while True:
+            try:
+                batch = next(it)
+                num_inst_per_dataset = {}
+                for data in batch:
+                    dataset_name = data["dataset"]
+                    if dataset_name not in num_inst_per_dataset:
+                        num_inst_per_dataset[dataset_name] = 0
+                    num_inst = len(data["instances"])
+                    num_inst_per_dataset[dataset_name] += num_inst
+                for dataset_name in num_inst_per_dataset:
+                    storage.put_scalar(f"batch/{dataset_name}", num_inst_per_dataset[dataset_name])
+                yield batch
+            except StopIteration:
+                break
+
+
+class SampleCountMetricPrinter(EventWriter):
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+
+    def write(self):
+        storage = get_event_storage()
+        batch_stats_strs = []
+        for key, buf in storage.histories().items():
+            if key.startswith("batch/"):
+                batch_stats_strs.append(f"{key} {buf.avg(20)}")
+        self.logger.info(", ".join(batch_stats_strs))
+
+
+class Trainer(DefaultTrainer):
+    @classmethod
+    def extract_embedder_from_model(cls, model: nn.Module) -> Optional[Embedder]:
+        if isinstance(model, nn.parallel.DistributedDataParallel):
+            model = model.module
+        if hasattr(model, "roi_heads") and hasattr(model.roi_heads, "embedder"):
+            return model.roi_heads.embedder
+        return None
+
+    # TODO: the only reason to copy the base class code here is to pass the embedder from
+    # the model to the evaluator; that should be refactored to avoid unnecessary copy-pasting
+    @classmethod
+    def test(
+        cls,
+        cfg: CfgNode,
+        model: nn.Module,
+        evaluators: Optional[Union[DatasetEvaluator, List[DatasetEvaluator]]] = None,
+    ):
+        """
+        Args:
+            cfg (CfgNode):
+            model (nn.Module):
+            evaluators (DatasetEvaluator, list[DatasetEvaluator] or None): if None, will call
+                :meth:`build_evaluator`. Otherwise, must have the same length as
+                ``cfg.DATASETS.TEST``.
+
+        Returns:
+            dict: a dict of result metrics
+        """
+        logger = logging.getLogger(__name__)
+        if isinstance(evaluators, DatasetEvaluator):
+            evaluators = [evaluators]
+        if evaluators is not None:
+            assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
+                len(cfg.DATASETS.TEST), len(evaluators)
+            )
+
+        results = OrderedDict()
+        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
+            data_loader = cls.build_test_loader(cfg, dataset_name)
+            # When evaluators are passed in as arguments,
+            # implicitly assume that evaluators can be created before data_loader.
+            if evaluators is not None:
+                evaluator = evaluators[idx]
+            else:
+                try:
+                    embedder = cls.extract_embedder_from_model(model)
+                    evaluator = cls.build_evaluator(cfg, dataset_name, embedder=embedder)
+                except NotImplementedError:
+                    logger.warn(
+                        "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
+                        "or implement its `build_evaluator` method."
+                    )
+                    results[dataset_name] = {}
+                    continue
+            if cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE or comm.is_main_process():
+                results_i = inference_on_dataset(model, data_loader, evaluator)
+            else:
+                results_i = {}
+            results[dataset_name] = results_i
+            if comm.is_main_process():
+                assert isinstance(
+                    results_i, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results_i
+                )
+                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+                print_csv_format(results_i)
+
+        if len(results) == 1:
+            results = list(results.values())[0]
+        return results
+
+    @classmethod
+    def build_evaluator(
+        cls,
+        cfg: CfgNode,
+        dataset_name: str,
+        output_folder: Optional[str] = None,
+        embedder: Optional[Embedder] = None,
+    ) -> DatasetEvaluators:
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluators = []
+        distributed = cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE
+        # Note: we currently use COCO evaluator for both COCO and LVIS datasets
+        # to have compatible metrics. LVIS bbox evaluator could also be used
+        # with an adapter to properly handle filtered / mapped categories
+        # evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        # if evaluator_type == "coco":
+        #     evaluators.append(COCOEvaluator(dataset_name, output_dir=output_folder))
+        # elif evaluator_type == "lvis":
+        #     evaluators.append(LVISEvaluator(dataset_name, output_dir=output_folder))
+        evaluators.append(
+            Detectron2COCOEvaluatorAdapter(
+                dataset_name, output_dir=output_folder, distributed=distributed
+            )
+        )
+        if cfg.MODEL.DENSEPOSE_ON:
+            storage = build_densepose_evaluator_storage(cfg, output_folder)
+            evaluators.append(
+                DensePoseCOCOEvaluator(
+                    dataset_name,
+                    distributed,
+                    output_folder,
+                    evaluator_type=cfg.DENSEPOSE_EVALUATION.TYPE,
+                    min_iou_threshold=cfg.DENSEPOSE_EVALUATION.MIN_IOU_THRESHOLD,
+                    storage=storage,
+                    embedder=embedder,
+                    should_evaluate_mesh_alignment=cfg.DENSEPOSE_EVALUATION.EVALUATE_MESH_ALIGNMENT,
+                    mesh_alignment_mesh_names=cfg.DENSEPOSE_EVALUATION.MESH_ALIGNMENT_MESH_NAMES,
+                )
+            )
+        return DatasetEvaluators(evaluators)
+
+    @classmethod
+    def build_optimizer(cls, cfg: CfgNode, model: nn.Module):
+        params = get_default_optimizer_params(
+            model,
+            base_lr=cfg.SOLVER.BASE_LR,
+            weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
+            bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
+            weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
+            overrides={
+                "features": {
+                    "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.FEATURES_LR_FACTOR,
+                },
+                "embeddings": {
+                    "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_LR_FACTOR,
+                },
+            },
+        )
+        optimizer = torch.optim.SGD(
+            params,
+            cfg.SOLVER.BASE_LR,
+            momentum=cfg.SOLVER.MOMENTUM,
+            nesterov=cfg.SOLVER.NESTEROV,
+            weight_decay=cfg.SOLVER.WEIGHT_DECAY,
+        )
+        # pyre-fixme[6]: For 2nd param expected `Type[Optimizer]` but got `SGD`.
+        return maybe_add_gradient_clipping(cfg, optimizer)
+
+    @classmethod
+    def build_test_loader(cls, cfg: CfgNode, dataset_name):
+        return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))
+
+    @classmethod
+    def build_train_loader(cls, cfg: CfgNode):
+        data_loader = build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True))
+        if not has_inference_based_loaders(cfg):
+            return data_loader
+        model = cls.build_model(cfg)
+        model.to(cfg.BOOTSTRAP_MODEL.DEVICE)
+        DetectionCheckpointer(model).resume_or_load(cfg.BOOTSTRAP_MODEL.WEIGHTS, resume=False)
+        inference_based_loaders, ratios = build_inference_based_loaders(cfg, model)
+        loaders = [data_loader] + inference_based_loaders
+        ratios = [1.0] + ratios
+        combined_data_loader = build_combined_loader(cfg, loaders, ratios)
+        sample_counting_loader = SampleCountingLoader(combined_data_loader)
+        return sample_counting_loader
+
+    def build_writers(self):
+        writers = super().build_writers()
+        writers.append(SampleCountMetricPrinter())
+        return writers
+
+    @classmethod
+    def test_with_TTA(cls, cfg: CfgNode, model):
+        logger = logging.getLogger("detectron2.trainer")
+        # In the end of training, run an evaluation with TTA
+        # Only support some R-CNN models.
+        logger.info("Running inference with test-time augmentation ...")
+        transform_data = load_from_cfg(cfg)
+        model = DensePoseGeneralizedRCNNWithTTA(
+            cfg, model, transform_data, DensePoseDatasetMapperTTA(cfg)
+        )
+        evaluators = [
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)  # pyre-ignore[6]
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+        return res
diff --git a/densepose/evaluation/__init__.py b/densepose/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5ae1f20cdc822ebf3c870f1289a0ad210c57ae7
--- /dev/null
+++ b/densepose/evaluation/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .evaluator import DensePoseCOCOEvaluator
diff --git a/densepose/evaluation/__pycache__/__init__.cpython-310.pyc b/densepose/evaluation/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1433a9f5bea8135bbb96d8da4c537df77a20665f
Binary files /dev/null and b/densepose/evaluation/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/evaluation/__pycache__/__init__.cpython-39.pyc b/densepose/evaluation/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c1c8353be90a36de9c84914c1b9abafbfb62e0a
Binary files /dev/null and b/densepose/evaluation/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/evaluation/__pycache__/densepose_coco_evaluation.cpython-310.pyc b/densepose/evaluation/__pycache__/densepose_coco_evaluation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fc43f10a3a62b5680f0eadd4f4ef3d79f948d7d
Binary files /dev/null and b/densepose/evaluation/__pycache__/densepose_coco_evaluation.cpython-310.pyc differ
diff --git a/densepose/evaluation/__pycache__/densepose_coco_evaluation.cpython-39.pyc b/densepose/evaluation/__pycache__/densepose_coco_evaluation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..edf86f31601980f1a035d14d30765522b9d9c36b
Binary files /dev/null and b/densepose/evaluation/__pycache__/densepose_coco_evaluation.cpython-39.pyc differ
diff --git a/densepose/evaluation/__pycache__/evaluator.cpython-310.pyc b/densepose/evaluation/__pycache__/evaluator.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b28b54c5226ae99e46b78a8bcd2793602fb16d6
Binary files /dev/null and b/densepose/evaluation/__pycache__/evaluator.cpython-310.pyc differ
diff --git a/densepose/evaluation/__pycache__/evaluator.cpython-39.pyc b/densepose/evaluation/__pycache__/evaluator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6d0a4ce5b05843747749ddec8a39bdde934ca25
Binary files /dev/null and b/densepose/evaluation/__pycache__/evaluator.cpython-39.pyc differ
diff --git a/densepose/evaluation/__pycache__/mesh_alignment_evaluator.cpython-310.pyc b/densepose/evaluation/__pycache__/mesh_alignment_evaluator.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03bfca889c248d13244d4a38369de9633711acb0
Binary files /dev/null and b/densepose/evaluation/__pycache__/mesh_alignment_evaluator.cpython-310.pyc differ
diff --git a/densepose/evaluation/__pycache__/mesh_alignment_evaluator.cpython-39.pyc b/densepose/evaluation/__pycache__/mesh_alignment_evaluator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ba5315d0ec19ad4403b51530b1595149d84f176
Binary files /dev/null and b/densepose/evaluation/__pycache__/mesh_alignment_evaluator.cpython-39.pyc differ
diff --git a/densepose/evaluation/__pycache__/tensor_storage.cpython-310.pyc b/densepose/evaluation/__pycache__/tensor_storage.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe4dca68db52006cd62511eed0a06c06872a9fed
Binary files /dev/null and b/densepose/evaluation/__pycache__/tensor_storage.cpython-310.pyc differ
diff --git a/densepose/evaluation/__pycache__/tensor_storage.cpython-39.pyc b/densepose/evaluation/__pycache__/tensor_storage.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3fa25cce32b54e594e16ba8d23e97ec3e8a258f3
Binary files /dev/null and b/densepose/evaluation/__pycache__/tensor_storage.cpython-39.pyc differ
diff --git a/densepose/evaluation/d2_evaluator_adapter.py b/densepose/evaluation/d2_evaluator_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fbc526059a191f9414231c1b21ed3e8b7b58580
--- /dev/null
+++ b/densepose/evaluation/d2_evaluator_adapter.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from detectron2.data.catalog import Metadata
+from detectron2.evaluation import COCOEvaluator
+
+from densepose.data.datasets.coco import (
+    get_contiguous_id_to_category_id_map,
+    maybe_filter_categories_cocoapi,
+)
+
+
+def _maybe_add_iscrowd_annotations(cocoapi) -> None:
+    for ann in cocoapi.dataset["annotations"]:
+        if "iscrowd" not in ann:
+            ann["iscrowd"] = 0
+
+
+class Detectron2COCOEvaluatorAdapter(COCOEvaluator):
+    def __init__(
+        self,
+        dataset_name,
+        output_dir=None,
+        distributed=True,
+    ):
+        super().__init__(dataset_name, output_dir=output_dir, distributed=distributed)
+        maybe_filter_categories_cocoapi(dataset_name, self._coco_api)
+        _maybe_add_iscrowd_annotations(self._coco_api)
+        # substitute category metadata to account for categories
+        # that are mapped to the same contiguous id
+        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+            self._maybe_substitute_metadata()
+
+    def _maybe_substitute_metadata(self):
+        cont_id_2_cat_id = get_contiguous_id_to_category_id_map(self._metadata)
+        cat_id_2_cont_id = self._metadata.thing_dataset_id_to_contiguous_id
+        if len(cont_id_2_cat_id) == len(cat_id_2_cont_id):
+            return
+
+        cat_id_2_cont_id_injective = {}
+        for cat_id, cont_id in cat_id_2_cont_id.items():
+            if (cont_id in cont_id_2_cat_id) and (cont_id_2_cat_id[cont_id] == cat_id):
+                cat_id_2_cont_id_injective[cat_id] = cont_id
+
+        metadata_new = Metadata(name=self._metadata.name)
+        for key, value in self._metadata.__dict__.items():
+            if key == "thing_dataset_id_to_contiguous_id":
+                setattr(metadata_new, key, cat_id_2_cont_id_injective)
+            else:
+                setattr(metadata_new, key, value)
+        self._metadata = metadata_new
diff --git a/densepose/evaluation/densepose_coco_evaluation.py b/densepose/evaluation/densepose_coco_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..6370ba2a1bce45493e3d4bebd05b1b449334871d
--- /dev/null
+++ b/densepose/evaluation/densepose_coco_evaluation.py
@@ -0,0 +1,1303 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# This is a modified version of cocoeval.py where we also have the densepose evaluation.
+
+__author__ = "tsungyi"
+
+import copy
+import datetime
+import logging
+import numpy as np
+import pickle
+import time
+from collections import defaultdict
+from enum import Enum
+from typing import Any, Dict, Tuple
+import scipy.spatial.distance as ssd
+import torch
+import torch.nn.functional as F
+from pycocotools import mask as maskUtils
+from scipy.io import loadmat
+from scipy.ndimage import zoom as spzoom
+
+from detectron2.utils.file_io import PathManager
+
+from densepose.converters.chart_output_to_chart_result import resample_uv_tensors_to_bbox
+from densepose.converters.segm_to_mask import (
+    resample_coarse_segm_tensor_to_bbox,
+    resample_fine_and_coarse_segm_tensors_to_bbox,
+)
+from densepose.modeling.cse.utils import squared_euclidean_distance_matrix
+from densepose.structures import DensePoseDataRelative
+from densepose.structures.mesh import create_mesh
+
+logger = logging.getLogger(__name__)
+
+
+class DensePoseEvalMode(str, Enum):
+    # use both masks and geodesic distances (GPS * IOU) to compute scores
+    GPSM = "gpsm"
+    # use only geodesic distances (GPS)  to compute scores
+    GPS = "gps"
+    # use only masks (IOU) to compute scores
+    IOU = "iou"
+
+
+class DensePoseDataMode(str, Enum):
+    # use estimated IUV data (default mode)
+    IUV_DT = "iuvdt"
+    # use ground truth IUV data
+    IUV_GT = "iuvgt"
+    # use ground truth labels I and set UV to 0
+    I_GT_UV_0 = "igtuv0"
+    # use ground truth labels I and estimated UV coordinates
+    I_GT_UV_DT = "igtuvdt"
+    # use estimated labels I and set UV to 0
+    I_DT_UV_0 = "idtuv0"
+
+
+class DensePoseCocoEval:
+    # Interface for evaluating detection on the Microsoft COCO dataset.
+    #
+    # The usage for CocoEval is as follows:
+    #  cocoGt=..., cocoDt=...       # load dataset and results
+    #  E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object
+    #  E.params.recThrs = ...;      # set parameters as desired
+    #  E.evaluate();                # run per image evaluation
+    #  E.accumulate();              # accumulate per image results
+    #  E.summarize();               # display summary metrics of results
+    # For example usage see evalDemo.m and http://mscoco.org/.
+    #
+    # The evaluation parameters are as follows (defaults in brackets):
+    #  imgIds     - [all] N img ids to use for evaluation
+    #  catIds     - [all] K cat ids to use for evaluation
+    #  iouThrs    - [.5:.05:.95] T=10 IoU thresholds for evaluation
+    #  recThrs    - [0:.01:1] R=101 recall thresholds for evaluation
+    #  areaRng    - [...] A=4 object area ranges for evaluation
+    #  maxDets    - [1 10 100] M=3 thresholds on max detections per image
+    #  iouType    - ['segm'] set iouType to 'segm', 'bbox', 'keypoints' or 'densepose'
+    #  iouType replaced the now DEPRECATED useSegm parameter.
+    #  useCats    - [1] if true use category labels for evaluation
+    # Note: if useCats=0 category labels are ignored as in proposal scoring.
+    # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
+    #
+    # evaluate(): evaluates detections on every image and every category and
+    # concats the results into the "evalImgs" with fields:
+    #  dtIds      - [1xD] id for each of the D detections (dt)
+    #  gtIds      - [1xG] id for each of the G ground truths (gt)
+    #  dtMatches  - [TxD] matching gt id at each IoU or 0
+    #  gtMatches  - [TxG] matching dt id at each IoU or 0
+    #  dtScores   - [1xD] confidence of each dt
+    #  gtIgnore   - [1xG] ignore flag for each gt
+    #  dtIgnore   - [TxD] ignore flag for each dt at each IoU
+    #
+    # accumulate(): accumulates the per-image, per-category evaluation
+    # results in "evalImgs" into the dictionary "eval" with fields:
+    #  params     - parameters used for evaluation
+    #  date       - date evaluation was performed
+    #  counts     - [T,R,K,A,M] parameter dimensions (see above)
+    #  precision  - [TxRxKxAxM] precision for every evaluation setting
+    #  recall     - [TxKxAxM] max recall for every evaluation setting
+    # Note: precision and recall==-1 for settings with no gt objects.
+    #
+    # See also coco, mask, pycocoDemo, pycocoEvalDemo
+    #
+    # Microsoft COCO Toolbox.      version 2.0
+    # Data, paper, and tutorials available at:  http://mscoco.org/
+    # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
+    # Licensed under the Simplified BSD License [see coco/license.txt]
+    def __init__(
+        self,
+        cocoGt=None,
+        cocoDt=None,
+        iouType: str = "densepose",
+        multi_storage=None,
+        embedder=None,
+        dpEvalMode: DensePoseEvalMode = DensePoseEvalMode.GPS,
+        dpDataMode: DensePoseDataMode = DensePoseDataMode.IUV_DT,
+    ):
+        """
+        Initialize CocoEval using coco APIs for gt and dt
+        :param cocoGt: coco object with ground truth annotations
+        :param cocoDt: coco object with detection results
+        :return: None
+        """
+        self.cocoGt = cocoGt  # ground truth COCO API
+        self.cocoDt = cocoDt  # detections COCO API
+        self.multi_storage = multi_storage
+        self.embedder = embedder
+        self._dpEvalMode = dpEvalMode
+        self._dpDataMode = dpDataMode
+        self.evalImgs = defaultdict(list)  # per-image per-category eval results [KxAxI]
+        self.eval = {}  # accumulated evaluation results
+        self._gts = defaultdict(list)  # gt for evaluation
+        self._dts = defaultdict(list)  # dt for evaluation
+        self.params = Params(iouType=iouType)  # parameters
+        self._paramsEval = {}  # parameters for evaluation
+        self.stats = []  # result summarization
+        self.ious = {}  # ious between all gts and dts
+        if cocoGt is not None:
+            self.params.imgIds = sorted(cocoGt.getImgIds())
+            self.params.catIds = sorted(cocoGt.getCatIds())
+        self.ignoreThrBB = 0.7
+        self.ignoreThrUV = 0.9
+
+    def _loadGEval(self):
+        smpl_subdiv_fpath = PathManager.get_local_path(
+            "https://dl.fbaipublicfiles.com/densepose/data/SMPL_subdiv.mat"
+        )
+        pdist_transform_fpath = PathManager.get_local_path(
+            "https://dl.fbaipublicfiles.com/densepose/data/SMPL_SUBDIV_TRANSFORM.mat"
+        )
+        pdist_matrix_fpath = PathManager.get_local_path(
+            "https://dl.fbaipublicfiles.com/densepose/data/Pdist_matrix.pkl", timeout_sec=120
+        )
+        SMPL_subdiv = loadmat(smpl_subdiv_fpath)
+        self.PDIST_transform = loadmat(pdist_transform_fpath)
+        self.PDIST_transform = self.PDIST_transform["index"].squeeze()
+        UV = np.array([SMPL_subdiv["U_subdiv"], SMPL_subdiv["V_subdiv"]]).squeeze()
+        ClosestVertInds = np.arange(UV.shape[1]) + 1
+        self.Part_UVs = []
+        self.Part_ClosestVertInds = []
+        for i in np.arange(24):
+            self.Part_UVs.append(UV[:, SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)])
+            self.Part_ClosestVertInds.append(
+                ClosestVertInds[SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)]
+            )
+
+        with open(pdist_matrix_fpath, "rb") as hFile:
+            arrays = pickle.load(hFile, encoding="latin1")
+        self.Pdist_matrix = arrays["Pdist_matrix"]
+        self.Part_ids = np.array(SMPL_subdiv["Part_ID_subdiv"].squeeze())
+        # Mean geodesic distances for parts.
+        self.Mean_Distances = np.array([0, 0.351, 0.107, 0.126, 0.237, 0.173, 0.142, 0.128, 0.150])
+        # Coarse Part labels.
+        self.CoarseParts = np.array(
+            [0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8]
+        )
+
+    def _prepare(self):
+        """
+        Prepare ._gts and ._dts for evaluation based on params
+        :return: None
+        """
+
+        def _toMask(anns, coco):
+            # modify ann['segmentation'] by reference
+            for ann in anns:
+                # safeguard for invalid segmentation annotation;
+                # annotations containing empty lists exist in the posetrack
+                # dataset. This is not a correct segmentation annotation
+                # in terms of COCO format; we need to deal with it somehow
+                segm = ann["segmentation"]
+                if type(segm) == list and len(segm) == 0:
+                    ann["segmentation"] = None
+                    continue
+                rle = coco.annToRLE(ann)
+                ann["segmentation"] = rle
+
+        def _getIgnoreRegion(iid, coco):
+            img = coco.imgs[iid]
+
+            if "ignore_regions_x" not in img.keys():
+                return None
+
+            if len(img["ignore_regions_x"]) == 0:
+                return None
+
+            rgns_merged = [
+                [v for xy in zip(region_x, region_y) for v in xy]
+                for region_x, region_y in zip(img["ignore_regions_x"], img["ignore_regions_y"])
+            ]
+            rles = maskUtils.frPyObjects(rgns_merged, img["height"], img["width"])
+            rle = maskUtils.merge(rles)
+            return maskUtils.decode(rle)
+
+        def _checkIgnore(dt, iregion):
+            if iregion is None:
+                return True
+
+            bb = np.array(dt["bbox"]).astype(int)
+            x1, y1, x2, y2 = bb[0], bb[1], bb[0] + bb[2], bb[1] + bb[3]
+            x2 = min([x2, iregion.shape[1]])
+            y2 = min([y2, iregion.shape[0]])
+
+            if bb[2] * bb[3] == 0:
+                return False
+
+            crop_iregion = iregion[y1:y2, x1:x2]
+
+            if crop_iregion.sum() == 0:
+                return True
+
+            if "densepose" not in dt.keys():  # filtering boxes
+                return crop_iregion.sum() / bb[2] / bb[3] < self.ignoreThrBB
+
+            # filtering UVs
+            ignoremask = np.require(crop_iregion, requirements=["F"])
+            mask = self._extract_mask(dt)
+            uvmask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"])
+            uvmask_ = maskUtils.encode(uvmask)
+            ignoremask_ = maskUtils.encode(ignoremask)
+            uviou = maskUtils.iou([uvmask_], [ignoremask_], [1])[0]
+            return uviou < self.ignoreThrUV
+
+        p = self.params
+
+        if p.useCats:
+            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
+            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
+        else:
+            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
+            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
+
+        imns = self.cocoGt.loadImgs(p.imgIds)
+        self.size_mapping = {}
+        for im in imns:
+            self.size_mapping[im["id"]] = [im["height"], im["width"]]
+
+        # if iouType == 'uv', add point gt annotations
+        if p.iouType == "densepose":
+            self._loadGEval()
+
+        # convert ground truth to mask if iouType == 'segm'
+        if p.iouType == "segm":
+            _toMask(gts, self.cocoGt)
+            _toMask(dts, self.cocoDt)
+
+        # set ignore flag
+        for gt in gts:
+            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
+            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
+            if p.iouType == "keypoints":
+                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
+            if p.iouType == "densepose":
+                gt["ignore"] = ("dp_x" in gt) == 0
+            if p.iouType == "segm":
+                gt["ignore"] = gt["segmentation"] is None
+
+        self._gts = defaultdict(list)  # gt for evaluation
+        self._dts = defaultdict(list)  # dt for evaluation
+        self._igrgns = defaultdict(list)
+
+        for gt in gts:
+            iid = gt["image_id"]
+            if iid not in self._igrgns.keys():
+                self._igrgns[iid] = _getIgnoreRegion(iid, self.cocoGt)
+            if _checkIgnore(gt, self._igrgns[iid]):
+                self._gts[iid, gt["category_id"]].append(gt)
+        for dt in dts:
+            iid = dt["image_id"]
+            if (iid not in self._igrgns) or _checkIgnore(dt, self._igrgns[iid]):
+                self._dts[iid, dt["category_id"]].append(dt)
+
+        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
+        self.eval = {}  # accumulated evaluation results
+
+    def evaluate(self):
+        """
+        Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
+        :return: None
+        """
+        tic = time.time()
+        logger.info("Running per image DensePose evaluation... {}".format(self.params.iouType))
+        p = self.params
+        # add backward compatibility if useSegm is specified in params
+        if p.useSegm is not None:
+            p.iouType = "segm" if p.useSegm == 1 else "bbox"
+            logger.info("useSegm (deprecated) is not None. Running DensePose evaluation")
+        p.imgIds = list(np.unique(p.imgIds))
+        if p.useCats:
+            p.catIds = list(np.unique(p.catIds))
+        p.maxDets = sorted(p.maxDets)
+        self.params = p
+
+        self._prepare()
+        # loop through images, area range, max detection number
+        catIds = p.catIds if p.useCats else [-1]
+
+        if p.iouType in ["segm", "bbox"]:
+            computeIoU = self.computeIoU
+        elif p.iouType == "keypoints":
+            computeIoU = self.computeOks
+        elif p.iouType == "densepose":
+            computeIoU = self.computeOgps
+            if self._dpEvalMode in {DensePoseEvalMode.GPSM, DensePoseEvalMode.IOU}:
+                self.real_ious = {
+                    (imgId, catId): self.computeDPIoU(imgId, catId)
+                    for imgId in p.imgIds
+                    for catId in catIds
+                }
+
+        self.ious = {
+            (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds
+        }
+
+        evaluateImg = self.evaluateImg
+        maxDet = p.maxDets[-1]
+        self.evalImgs = [
+            evaluateImg(imgId, catId, areaRng, maxDet)
+            for catId in catIds
+            for areaRng in p.areaRng
+            for imgId in p.imgIds
+        ]
+        self._paramsEval = copy.deepcopy(self.params)
+        toc = time.time()
+        logger.info("DensePose evaluation DONE (t={:0.2f}s).".format(toc - tic))
+
+    def getDensePoseMask(self, polys):
+        maskGen = np.zeros([256, 256])
+        stop = min(len(polys) + 1, 15)
+        for i in range(1, stop):
+            if polys[i - 1]:
+                currentMask = maskUtils.decode(polys[i - 1])
+                maskGen[currentMask > 0] = i
+        return maskGen
+
+    def _generate_rlemask_on_image(self, mask, imgId, data):
+        bbox_xywh = np.array(data["bbox"])
+        x, y, w, h = bbox_xywh
+        im_h, im_w = self.size_mapping[imgId]
+        im_mask = np.zeros((im_h, im_w), dtype=np.uint8)
+        if mask is not None:
+            x0 = max(int(x), 0)
+            x1 = min(int(x + w), im_w, int(x) + mask.shape[1])
+            y0 = max(int(y), 0)
+            y1 = min(int(y + h), im_h, int(y) + mask.shape[0])
+            y = int(y)
+            x = int(x)
+            im_mask[y0:y1, x0:x1] = mask[y0 - y : y1 - y, x0 - x : x1 - x]
+        im_mask = np.require(np.asarray(im_mask > 0), dtype=np.uint8, requirements=["F"])
+        rle_mask = maskUtils.encode(np.array(im_mask[:, :, np.newaxis], order="F"))[0]
+        return rle_mask
+
+    def computeDPIoU(self, imgId, catId):
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return []
+        inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
+        dt = [dt[i] for i in inds]
+        if len(dt) > p.maxDets[-1]:
+            dt = dt[0 : p.maxDets[-1]]
+
+        gtmasks = []
+        for g in gt:
+            if DensePoseDataRelative.S_KEY in g:
+                # convert DensePose mask to a binary mask
+                mask = np.minimum(self.getDensePoseMask(g[DensePoseDataRelative.S_KEY]), 1.0)
+                _, _, w, h = g["bbox"]
+                scale_x = float(max(w, 1)) / mask.shape[1]
+                scale_y = float(max(h, 1)) / mask.shape[0]
+                mask = spzoom(mask, (scale_y, scale_x), order=1, prefilter=False)
+                mask = np.array(mask > 0.5, dtype=np.uint8)
+                rle_mask = self._generate_rlemask_on_image(mask, imgId, g)
+            elif "segmentation" in g:
+                segmentation = g["segmentation"]
+                if isinstance(segmentation, list) and segmentation:
+                    # polygons
+                    im_h, im_w = self.size_mapping[imgId]
+                    rles = maskUtils.frPyObjects(segmentation, im_h, im_w)
+                    rle_mask = maskUtils.merge(rles)
+                elif isinstance(segmentation, dict):
+                    if isinstance(segmentation["counts"], list):
+                        # uncompressed RLE
+                        im_h, im_w = self.size_mapping[imgId]
+                        rle_mask = maskUtils.frPyObjects(segmentation, im_h, im_w)
+                    else:
+                        # compressed RLE
+                        rle_mask = segmentation
+                else:
+                    rle_mask = self._generate_rlemask_on_image(None, imgId, g)
+            else:
+                rle_mask = self._generate_rlemask_on_image(None, imgId, g)
+            gtmasks.append(rle_mask)
+
+        dtmasks = []
+        for d in dt:
+            mask = self._extract_mask(d)
+            mask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"])
+            rle_mask = self._generate_rlemask_on_image(mask, imgId, d)
+            dtmasks.append(rle_mask)
+
+        # compute iou between each dt and gt region
+        iscrowd = [int(o.get("iscrowd", 0)) for o in gt]
+        iousDP = maskUtils.iou(dtmasks, gtmasks, iscrowd)
+        return iousDP
+
+    def computeIoU(self, imgId, catId):
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return []
+        inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
+        dt = [dt[i] for i in inds]
+        if len(dt) > p.maxDets[-1]:
+            dt = dt[0 : p.maxDets[-1]]
+
+        if p.iouType == "segm":
+            g = [g["segmentation"] for g in gt if g["segmentation"] is not None]
+            d = [d["segmentation"] for d in dt if d["segmentation"] is not None]
+        elif p.iouType == "bbox":
+            g = [g["bbox"] for g in gt]
+            d = [d["bbox"] for d in dt]
+        else:
+            raise Exception("unknown iouType for iou computation")
+
+        # compute iou between each dt and gt region
+        iscrowd = [int(o.get("iscrowd", 0)) for o in gt]
+        ious = maskUtils.iou(d, g, iscrowd)
+        return ious
+
+    def computeOks(self, imgId, catId):
+        p = self.params
+        # dimension here should be Nxm
+        gts = self._gts[imgId, catId]
+        dts = self._dts[imgId, catId]
+        inds = np.argsort([-d["score"] for d in dts], kind="mergesort")
+        dts = [dts[i] for i in inds]
+        if len(dts) > p.maxDets[-1]:
+            dts = dts[0 : p.maxDets[-1]]
+        # if len(gts) == 0 and len(dts) == 0:
+        if len(gts) == 0 or len(dts) == 0:
+            return []
+        ious = np.zeros((len(dts), len(gts)))
+        sigmas = (
+            np.array(
+                [
+                    0.26,
+                    0.25,
+                    0.25,
+                    0.35,
+                    0.35,
+                    0.79,
+                    0.79,
+                    0.72,
+                    0.72,
+                    0.62,
+                    0.62,
+                    1.07,
+                    1.07,
+                    0.87,
+                    0.87,
+                    0.89,
+                    0.89,
+                ]
+            )
+            / 10.0
+        )
+        vars = (sigmas * 2) ** 2
+        k = len(sigmas)
+        # compute oks between each detection and ground truth object
+        for j, gt in enumerate(gts):
+            # create bounds for ignore regions(double the gt bbox)
+            g = np.array(gt["keypoints"])
+            xg = g[0::3]
+            yg = g[1::3]
+            vg = g[2::3]
+            k1 = np.count_nonzero(vg > 0)
+            bb = gt["bbox"]
+            x0 = bb[0] - bb[2]
+            x1 = bb[0] + bb[2] * 2
+            y0 = bb[1] - bb[3]
+            y1 = bb[1] + bb[3] * 2
+            for i, dt in enumerate(dts):
+                d = np.array(dt["keypoints"])
+                xd = d[0::3]
+                yd = d[1::3]
+                if k1 > 0:
+                    # measure the per-keypoint distance if keypoints visible
+                    dx = xd - xg
+                    dy = yd - yg
+                else:
+                    # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
+                    z = np.zeros(k)
+                    dx = np.max((z, x0 - xd), axis=0) + np.max((z, xd - x1), axis=0)
+                    dy = np.max((z, y0 - yd), axis=0) + np.max((z, yd - y1), axis=0)
+                e = (dx**2 + dy**2) / vars / (gt["area"] + np.spacing(1)) / 2
+                if k1 > 0:
+                    e = e[vg > 0]
+                ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
+        return ious
+
+    def _extract_mask(self, dt: Dict[str, Any]) -> np.ndarray:
+        if "densepose" in dt:
+            densepose_results_quantized = dt["densepose"]
+            return densepose_results_quantized.labels_uv_uint8[0].numpy()
+        elif "cse_mask" in dt:
+            return dt["cse_mask"]
+        elif "coarse_segm" in dt:
+            dy = max(int(dt["bbox"][3]), 1)
+            dx = max(int(dt["bbox"][2]), 1)
+            return (
+                F.interpolate(
+                    dt["coarse_segm"].unsqueeze(0),
+                    (dy, dx),
+                    mode="bilinear",
+                    align_corners=False,
+                )
+                .squeeze(0)
+                .argmax(0)
+                .numpy()
+                .astype(np.uint8)
+            )
+        elif "record_id" in dt:
+            assert (
+                self.multi_storage is not None
+            ), f"Storage record id encountered in a detection {dt}, but no storage provided!"
+            record = self.multi_storage.get(dt["rank"], dt["record_id"])
+            coarse_segm = record["coarse_segm"]
+            dy = max(int(dt["bbox"][3]), 1)
+            dx = max(int(dt["bbox"][2]), 1)
+            return (
+                F.interpolate(
+                    coarse_segm.unsqueeze(0),
+                    (dy, dx),
+                    mode="bilinear",
+                    align_corners=False,
+                )
+                .squeeze(0)
+                .argmax(0)
+                .numpy()
+                .astype(np.uint8)
+            )
+        else:
+            raise Exception(f"No mask data in the detection: {dt}")
+        raise ValueError('The prediction dict needs to contain either "densepose" or "cse_mask"')
+
+    def _extract_iuv(
+        self, densepose_data: np.ndarray, py: np.ndarray, px: np.ndarray, gt: Dict[str, Any]
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Extract arrays of I, U and V values at given points as numpy arrays
+        given the data mode stored in self._dpDataMode
+        """
+        if self._dpDataMode == DensePoseDataMode.IUV_DT:
+            # estimated labels and UV (default)
+            ipoints = densepose_data[0, py, px]
+            upoints = densepose_data[1, py, px] / 255.0  # convert from uint8 by /255.
+            vpoints = densepose_data[2, py, px] / 255.0
+        elif self._dpDataMode == DensePoseDataMode.IUV_GT:
+            # ground truth
+            ipoints = np.array(gt["dp_I"])
+            upoints = np.array(gt["dp_U"])
+            vpoints = np.array(gt["dp_V"])
+        elif self._dpDataMode == DensePoseDataMode.I_GT_UV_0:
+            # ground truth labels, UV = 0
+            ipoints = np.array(gt["dp_I"])
+            upoints = upoints * 0.0
+            vpoints = vpoints * 0.0
+        elif self._dpDataMode == DensePoseDataMode.I_GT_UV_DT:
+            # ground truth labels, estimated UV
+            ipoints = np.array(gt["dp_I"])
+            upoints = densepose_data[1, py, px] / 255.0  # convert from uint8 by /255.
+            vpoints = densepose_data[2, py, px] / 255.0
+        elif self._dpDataMode == DensePoseDataMode.I_DT_UV_0:
+            # estimated labels, UV = 0
+            ipoints = densepose_data[0, py, px]
+            upoints = upoints * 0.0
+            vpoints = vpoints * 0.0
+        else:
+            raise ValueError(f"Unknown data mode: {self._dpDataMode}")
+        return ipoints, upoints, vpoints
+
+    def computeOgps_single_pair(self, dt, gt, py, px, pt_mask):
+        if "densepose" in dt:
+            ipoints, upoints, vpoints = self.extract_iuv_from_quantized(dt, gt, py, px, pt_mask)
+            return self.computeOgps_single_pair_iuv(dt, gt, ipoints, upoints, vpoints)
+        elif "u" in dt:
+            ipoints, upoints, vpoints = self.extract_iuv_from_raw(dt, gt, py, px, pt_mask)
+            return self.computeOgps_single_pair_iuv(dt, gt, ipoints, upoints, vpoints)
+        elif "record_id" in dt:
+            assert (
+                self.multi_storage is not None
+            ), f"Storage record id encountered in detection {dt}, but no storage provided!"
+            record = self.multi_storage.get(dt["rank"], dt["record_id"])
+            record["bbox"] = dt["bbox"]
+            if "u" in record:
+                ipoints, upoints, vpoints = self.extract_iuv_from_raw(record, gt, py, px, pt_mask)
+                return self.computeOgps_single_pair_iuv(dt, gt, ipoints, upoints, vpoints)
+            elif "embedding" in record:
+                return self.computeOgps_single_pair_cse(
+                    dt,
+                    gt,
+                    py,
+                    px,
+                    pt_mask,
+                    record["coarse_segm"],
+                    record["embedding"],
+                    record["bbox"],
+                )
+            else:
+                raise Exception(f"Unknown record format: {record}")
+        elif "embedding" in dt:
+            return self.computeOgps_single_pair_cse(
+                dt, gt, py, px, pt_mask, dt["coarse_segm"], dt["embedding"], dt["bbox"]
+            )
+        raise Exception(f"Unknown detection format: {dt}")
+
+    def extract_iuv_from_quantized(self, dt, gt, py, px, pt_mask):
+        densepose_results_quantized = dt["densepose"]
+        ipoints, upoints, vpoints = self._extract_iuv(
+            densepose_results_quantized.labels_uv_uint8.numpy(), py, px, gt
+        )
+        ipoints[pt_mask == -1] = 0
+        return ipoints, upoints, vpoints
+
+    def extract_iuv_from_raw(self, dt, gt, py, px, pt_mask):
+        labels_dt = resample_fine_and_coarse_segm_tensors_to_bbox(
+            dt["fine_segm"].unsqueeze(0),
+            dt["coarse_segm"].unsqueeze(0),
+            dt["bbox"],
+        )
+        uv = resample_uv_tensors_to_bbox(
+            dt["u"].unsqueeze(0), dt["v"].unsqueeze(0), labels_dt.squeeze(0), dt["bbox"]
+        )
+        labels_uv_uint8 = torch.cat((labels_dt.byte(), (uv * 255).clamp(0, 255).byte()))
+        ipoints, upoints, vpoints = self._extract_iuv(labels_uv_uint8.numpy(), py, px, gt)
+        ipoints[pt_mask == -1] = 0
+        return ipoints, upoints, vpoints
+
+    def computeOgps_single_pair_iuv(self, dt, gt, ipoints, upoints, vpoints):
+        cVertsGT, ClosestVertsGTTransformed = self.findAllClosestVertsGT(gt)
+        cVerts = self.findAllClosestVertsUV(upoints, vpoints, ipoints)
+        # Get pairwise geodesic distances between gt and estimated mesh points.
+        dist = self.getDistancesUV(ClosestVertsGTTransformed, cVerts)
+        # Compute the Ogps measure.
+        # Find the mean geodesic normalization distance for
+        # each GT point, based on which part it is on.
+        Current_Mean_Distances = self.Mean_Distances[
+            self.CoarseParts[self.Part_ids[cVertsGT[cVertsGT > 0].astype(int) - 1]]
+        ]
+        return dist, Current_Mean_Distances
+
+    def computeOgps_single_pair_cse(
+        self, dt, gt, py, px, pt_mask, coarse_segm, embedding, bbox_xywh_abs
+    ):
+        # 0-based mesh vertex indices
+        cVertsGT = torch.as_tensor(gt["dp_vertex"], dtype=torch.int64)
+        # label for each pixel of the bbox, [H, W] tensor of long
+        labels_dt = resample_coarse_segm_tensor_to_bbox(
+            coarse_segm.unsqueeze(0), bbox_xywh_abs
+        ).squeeze(0)
+        x, y, w, h = bbox_xywh_abs
+        # embedding for each pixel of the bbox, [D, H, W] tensor of float32
+        embedding = F.interpolate(
+            embedding.unsqueeze(0), (int(h), int(w)), mode="bilinear", align_corners=False
+        ).squeeze(0)
+        # valid locations py, px
+        py_pt = torch.from_numpy(py[pt_mask > -1])
+        px_pt = torch.from_numpy(px[pt_mask > -1])
+        cVerts = torch.ones_like(cVertsGT) * -1
+        cVerts[pt_mask > -1] = self.findClosestVertsCse(
+            embedding, py_pt, px_pt, labels_dt, gt["ref_model"]
+        )
+        # Get pairwise geodesic distances between gt and estimated mesh points.
+        dist = self.getDistancesCse(cVertsGT, cVerts, gt["ref_model"])
+        # normalize distances
+        if (gt["ref_model"] == "smpl_27554") and ("dp_I" in gt):
+            Current_Mean_Distances = self.Mean_Distances[
+                self.CoarseParts[np.array(gt["dp_I"], dtype=int)]
+            ]
+        else:
+            Current_Mean_Distances = 0.255
+        return dist, Current_Mean_Distances
+
+    def computeOgps(self, imgId, catId):
+        p = self.params
+        # dimension here should be Nxm
+        g = self._gts[imgId, catId]
+        d = self._dts[imgId, catId]
+        inds = np.argsort([-d_["score"] for d_ in d], kind="mergesort")
+        d = [d[i] for i in inds]
+        if len(d) > p.maxDets[-1]:
+            d = d[0 : p.maxDets[-1]]
+        # if len(gts) == 0 and len(dts) == 0:
+        if len(g) == 0 or len(d) == 0:
+            return []
+        ious = np.zeros((len(d), len(g)))
+        # compute opgs between each detection and ground truth object
+        # sigma = self.sigma #0.255 # dist = 0.3m corresponds to ogps = 0.5
+        # 1 # dist = 0.3m corresponds to ogps = 0.96
+        # 1.45 # dist = 1.7m (person height) corresponds to ogps = 0.5)
+        for j, gt in enumerate(g):
+            if not gt["ignore"]:
+                g_ = gt["bbox"]
+                for i, dt in enumerate(d):
+                    #
+                    dy = int(dt["bbox"][3])
+                    dx = int(dt["bbox"][2])
+                    dp_x = np.array(gt["dp_x"]) * g_[2] / 255.0
+                    dp_y = np.array(gt["dp_y"]) * g_[3] / 255.0
+                    py = (dp_y + g_[1] - dt["bbox"][1]).astype(int)
+                    px = (dp_x + g_[0] - dt["bbox"][0]).astype(int)
+                    #
+                    pts = np.zeros(len(px))
+                    pts[px >= dx] = -1
+                    pts[py >= dy] = -1
+                    pts[px < 0] = -1
+                    pts[py < 0] = -1
+                    if len(pts) < 1:
+                        ogps = 0.0
+                    elif np.max(pts) == -1:
+                        ogps = 0.0
+                    else:
+                        px[pts == -1] = 0
+                        py[pts == -1] = 0
+                        dists_between_matches, dist_norm_coeffs = self.computeOgps_single_pair(
+                            dt, gt, py, px, pts
+                        )
+                        # Compute gps
+                        ogps_values = np.exp(
+                            -(dists_between_matches**2) / (2 * (dist_norm_coeffs**2))
+                        )
+                        #
+                        ogps = np.mean(ogps_values) if len(ogps_values) > 0 else 0.0
+                    ious[i, j] = ogps
+
+        gbb = [gt["bbox"] for gt in g]
+        dbb = [dt["bbox"] for dt in d]
+
+        # compute iou between each dt and gt region
+        iscrowd = [int(o.get("iscrowd", 0)) for o in g]
+        ious_bb = maskUtils.iou(dbb, gbb, iscrowd)
+        return ious, ious_bb
+
+    def evaluateImg(self, imgId, catId, aRng, maxDet):
+        """
+        perform evaluation for single category and image
+        :return: dict (single image results)
+        """
+
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return None
+
+        for g in gt:
+            # g['_ignore'] = g['ignore']
+            if g["ignore"] or (g["area"] < aRng[0] or g["area"] > aRng[1]):
+                g["_ignore"] = True
+            else:
+                g["_ignore"] = False
+
+        # sort dt highest score first, sort gt ignore last
+        gtind = np.argsort([g["_ignore"] for g in gt], kind="mergesort")
+        gt = [gt[i] for i in gtind]
+        dtind = np.argsort([-d["score"] for d in dt], kind="mergesort")
+        dt = [dt[i] for i in dtind[0:maxDet]]
+        iscrowd = [int(o.get("iscrowd", 0)) for o in gt]
+        # load computed ious
+        if p.iouType == "densepose":
+            # print('Checking the length', len(self.ious[imgId, catId]))
+            # if len(self.ious[imgId, catId]) == 0:
+            #    print(self.ious[imgId, catId])
+            ious = (
+                self.ious[imgId, catId][0][:, gtind]
+                if len(self.ious[imgId, catId]) > 0
+                else self.ious[imgId, catId]
+            )
+            ioubs = (
+                self.ious[imgId, catId][1][:, gtind]
+                if len(self.ious[imgId, catId]) > 0
+                else self.ious[imgId, catId]
+            )
+            if self._dpEvalMode in {DensePoseEvalMode.GPSM, DensePoseEvalMode.IOU}:
+                iousM = (
+                    self.real_ious[imgId, catId][:, gtind]
+                    if len(self.real_ious[imgId, catId]) > 0
+                    else self.real_ious[imgId, catId]
+                )
+        else:
+            ious = (
+                self.ious[imgId, catId][:, gtind]
+                if len(self.ious[imgId, catId]) > 0
+                else self.ious[imgId, catId]
+            )
+
+        T = len(p.iouThrs)
+        G = len(gt)
+        D = len(dt)
+        gtm = np.zeros((T, G))
+        dtm = np.zeros((T, D))
+        gtIg = np.array([g["_ignore"] for g in gt])
+        dtIg = np.zeros((T, D))
+        if np.all(gtIg) and p.iouType == "densepose":
+            dtIg = np.logical_or(dtIg, True)
+
+        if len(ious) > 0:  # and not p.iouType == 'densepose':
+            for tind, t in enumerate(p.iouThrs):
+                for dind, d in enumerate(dt):
+                    # information about best match so far (m=-1 -> unmatched)
+                    iou = min([t, 1 - 1e-10])
+                    m = -1
+                    for gind, _g in enumerate(gt):
+                        # if this gt already matched, and not a crowd, continue
+                        if gtm[tind, gind] > 0 and not iscrowd[gind]:
+                            continue
+                        # if dt matched to reg gt, and on ignore gt, stop
+                        if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
+                            break
+                        if p.iouType == "densepose":
+                            if self._dpEvalMode == DensePoseEvalMode.GPSM:
+                                new_iou = np.sqrt(iousM[dind, gind] * ious[dind, gind])
+                            elif self._dpEvalMode == DensePoseEvalMode.IOU:
+                                new_iou = iousM[dind, gind]
+                            elif self._dpEvalMode == DensePoseEvalMode.GPS:
+                                new_iou = ious[dind, gind]
+                        else:
+                            new_iou = ious[dind, gind]
+                        if new_iou < iou:
+                            continue
+                        if new_iou == 0.0:
+                            continue
+                        # if match successful and best so far, store appropriately
+                        iou = new_iou
+                        m = gind
+                    # if match made store id of match for both dt and gt
+                    if m == -1:
+                        continue
+                    dtIg[tind, dind] = gtIg[m]
+                    dtm[tind, dind] = gt[m]["id"]
+                    gtm[tind, m] = d["id"]
+
+        if p.iouType == "densepose":
+            if not len(ioubs) == 0:
+                for dind, d in enumerate(dt):
+                    # information about best match so far (m=-1 -> unmatched)
+                    if dtm[tind, dind] == 0:
+                        ioub = 0.8
+                        m = -1
+                        for gind, _g in enumerate(gt):
+                            # if this gt already matched, and not a crowd, continue
+                            if gtm[tind, gind] > 0 and not iscrowd[gind]:
+                                continue
+                            # continue to next gt unless better match made
+                            if ioubs[dind, gind] < ioub:
+                                continue
+                            # if match successful and best so far, store appropriately
+                            ioub = ioubs[dind, gind]
+                            m = gind
+                            # if match made store id of match for both dt and gt
+                        if m > -1:
+                            dtIg[:, dind] = gtIg[m]
+                            if gtIg[m]:
+                                dtm[tind, dind] = gt[m]["id"]
+                                gtm[tind, m] = d["id"]
+        # set unmatched detections outside of area range to ignore
+        a = np.array([d["area"] < aRng[0] or d["area"] > aRng[1] for d in dt]).reshape((1, len(dt)))
+        dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0)))
+        # store results for given image and category
+        # print('Done with the function', len(self.ious[imgId, catId]))
+        return {
+            "image_id": imgId,
+            "category_id": catId,
+            "aRng": aRng,
+            "maxDet": maxDet,
+            "dtIds": [d["id"] for d in dt],
+            "gtIds": [g["id"] for g in gt],
+            "dtMatches": dtm,
+            "gtMatches": gtm,
+            "dtScores": [d["score"] for d in dt],
+            "gtIgnore": gtIg,
+            "dtIgnore": dtIg,
+        }
+
+    def accumulate(self, p=None):
+        """
+        Accumulate per image evaluation results and store the result in self.eval
+        :param p: input params for evaluation
+        :return: None
+        """
+        logger.info("Accumulating evaluation results...")
+        tic = time.time()
+        if not self.evalImgs:
+            logger.info("Please run evaluate() first")
+        # allows input customized parameters
+        if p is None:
+            p = self.params
+        p.catIds = p.catIds if p.useCats == 1 else [-1]
+        T = len(p.iouThrs)
+        R = len(p.recThrs)
+        K = len(p.catIds) if p.useCats else 1
+        A = len(p.areaRng)
+        M = len(p.maxDets)
+        precision = -(np.ones((T, R, K, A, M)))  # -1 for the precision of absent categories
+        recall = -(np.ones((T, K, A, M)))
+
+        # create dictionary for future indexing
+        logger.info("Categories: {}".format(p.catIds))
+        _pe = self._paramsEval
+        catIds = _pe.catIds if _pe.useCats else [-1]
+        setK = set(catIds)
+        setA = set(map(tuple, _pe.areaRng))
+        setM = set(_pe.maxDets)
+        setI = set(_pe.imgIds)
+        # get inds to evaluate
+        k_list = [n for n, k in enumerate(p.catIds) if k in setK]
+        m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
+        a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
+        i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
+        I0 = len(_pe.imgIds)
+        A0 = len(_pe.areaRng)
+        # retrieve E at each category, area range, and max number of detections
+        for k, k0 in enumerate(k_list):
+            Nk = k0 * A0 * I0
+            for a, a0 in enumerate(a_list):
+                Na = a0 * I0
+                for m, maxDet in enumerate(m_list):
+                    E = [self.evalImgs[Nk + Na + i] for i in i_list]
+                    E = [e for e in E if e is not None]
+                    if len(E) == 0:
+                        continue
+                    dtScores = np.concatenate([e["dtScores"][0:maxDet] for e in E])
+
+                    # different sorting method generates slightly different results.
+                    # mergesort is used to be consistent as Matlab implementation.
+                    inds = np.argsort(-dtScores, kind="mergesort")
+
+                    dtm = np.concatenate([e["dtMatches"][:, 0:maxDet] for e in E], axis=1)[:, inds]
+                    dtIg = np.concatenate([e["dtIgnore"][:, 0:maxDet] for e in E], axis=1)[:, inds]
+                    gtIg = np.concatenate([e["gtIgnore"] for e in E])
+                    npig = np.count_nonzero(gtIg == 0)
+                    if npig == 0:
+                        continue
+                    tps = np.logical_and(dtm, np.logical_not(dtIg))
+                    fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg))
+                    tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
+                    fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
+                    for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
+                        tp = np.array(tp)
+                        fp = np.array(fp)
+                        nd = len(tp)
+                        rc = tp / npig
+                        pr = tp / (fp + tp + np.spacing(1))
+                        q = np.zeros((R,))
+
+                        if nd:
+                            recall[t, k, a, m] = rc[-1]
+                        else:
+                            recall[t, k, a, m] = 0
+
+                        # numpy is slow without cython optimization for accessing elements
+                        # use python array gets significant speed improvement
+                        pr = pr.tolist()
+                        q = q.tolist()
+
+                        for i in range(nd - 1, 0, -1):
+                            if pr[i] > pr[i - 1]:
+                                pr[i - 1] = pr[i]
+
+                        inds = np.searchsorted(rc, p.recThrs, side="left")
+                        try:
+                            for ri, pi in enumerate(inds):
+                                q[ri] = pr[pi]
+                        except Exception:
+                            pass
+                        precision[t, :, k, a, m] = np.array(q)
+        logger.info(
+            "Final: max precision {}, min precision {}".format(np.max(precision), np.min(precision))
+        )
+        self.eval = {
+            "params": p,
+            "counts": [T, R, K, A, M],
+            "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "precision": precision,
+            "recall": recall,
+        }
+        toc = time.time()
+        logger.info("DONE (t={:0.2f}s).".format(toc - tic))
+
+    def summarize(self):
+        """
+        Compute and display summary metrics for evaluation results.
+        Note this function can *only* be applied on the default parameter setting
+        """
+
+        def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
+            p = self.params
+            iStr = " {:<18} {} @[ {}={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
+            titleStr = "Average Precision" if ap == 1 else "Average Recall"
+            typeStr = "(AP)" if ap == 1 else "(AR)"
+            measure = "IoU"
+            if self.params.iouType == "keypoints":
+                measure = "OKS"
+            elif self.params.iouType == "densepose":
+                measure = "OGPS"
+            iouStr = (
+                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
+                if iouThr is None
+                else "{:0.2f}".format(iouThr)
+            )
+
+            aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
+            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+            if ap == 1:
+                # dimension of precision: [TxRxKxAxM]
+                s = self.eval["precision"]
+                # IoU
+                if iouThr is not None:
+                    t = np.where(np.abs(iouThr - p.iouThrs) < 0.001)[0]
+                    s = s[t]
+                s = s[:, :, :, aind, mind]
+            else:
+                # dimension of recall: [TxKxAxM]
+                s = self.eval["recall"]
+                if iouThr is not None:
+                    t = np.where(np.abs(iouThr - p.iouThrs) < 0.001)[0]
+                    s = s[t]
+                s = s[:, :, aind, mind]
+            if len(s[s > -1]) == 0:
+                mean_s = -1
+            else:
+                mean_s = np.mean(s[s > -1])
+            logger.info(iStr.format(titleStr, typeStr, measure, iouStr, areaRng, maxDets, mean_s))
+            return mean_s
+
+        def _summarizeDets():
+            stats = np.zeros((12,))
+            stats[0] = _summarize(1)
+            stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
+            stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
+            stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2])
+            stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2])
+            stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2])
+            stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
+            stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
+            stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2])
+            stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2])
+            stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2])
+            return stats
+
+        def _summarizeKps():
+            stats = np.zeros((10,))
+            stats[0] = _summarize(1, maxDets=20)
+            stats[1] = _summarize(1, maxDets=20, iouThr=0.5)
+            stats[2] = _summarize(1, maxDets=20, iouThr=0.75)
+            stats[3] = _summarize(1, maxDets=20, areaRng="medium")
+            stats[4] = _summarize(1, maxDets=20, areaRng="large")
+            stats[5] = _summarize(0, maxDets=20)
+            stats[6] = _summarize(0, maxDets=20, iouThr=0.5)
+            stats[7] = _summarize(0, maxDets=20, iouThr=0.75)
+            stats[8] = _summarize(0, maxDets=20, areaRng="medium")
+            stats[9] = _summarize(0, maxDets=20, areaRng="large")
+            return stats
+
+        def _summarizeUvs():
+            stats = [_summarize(1, maxDets=self.params.maxDets[0])]
+            min_threshold = self.params.iouThrs.min()
+            if min_threshold <= 0.201:
+                stats += [_summarize(1, maxDets=self.params.maxDets[0], iouThr=0.2)]
+            if min_threshold <= 0.301:
+                stats += [_summarize(1, maxDets=self.params.maxDets[0], iouThr=0.3)]
+            if min_threshold <= 0.401:
+                stats += [_summarize(1, maxDets=self.params.maxDets[0], iouThr=0.4)]
+            stats += [
+                _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5),
+                _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75),
+                _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium"),
+                _summarize(1, maxDets=self.params.maxDets[0], areaRng="large"),
+                _summarize(0, maxDets=self.params.maxDets[0]),
+                _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5),
+                _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75),
+                _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium"),
+                _summarize(0, maxDets=self.params.maxDets[0], areaRng="large"),
+            ]
+            return np.array(stats)
+
+        def _summarizeUvsOld():
+            stats = np.zeros((18,))
+            stats[0] = _summarize(1, maxDets=self.params.maxDets[0])
+            stats[1] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[2] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.55)
+            stats[3] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.60)
+            stats[4] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.65)
+            stats[5] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.70)
+            stats[6] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[7] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.80)
+            stats[8] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.85)
+            stats[9] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.90)
+            stats[10] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.95)
+            stats[11] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[12] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="large")
+            stats[13] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[14] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[15] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[16] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[17] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="large")
+            return stats
+
+        if not self.eval:
+            raise Exception("Please run accumulate() first")
+        iouType = self.params.iouType
+        if iouType in ["segm", "bbox"]:
+            summarize = _summarizeDets
+        elif iouType in ["keypoints"]:
+            summarize = _summarizeKps
+        elif iouType in ["densepose"]:
+            summarize = _summarizeUvs
+        self.stats = summarize()
+
+    def __str__(self):
+        self.summarize()
+
+    # ================ functions for dense pose ==============================
+    def findAllClosestVertsUV(self, U_points, V_points, Index_points):
+        ClosestVerts = np.ones(Index_points.shape) * -1
+        for i in np.arange(24):
+            #
+            if (i + 1) in Index_points:
+                UVs = np.array(
+                    [U_points[Index_points == (i + 1)], V_points[Index_points == (i + 1)]]
+                )
+                Current_Part_UVs = self.Part_UVs[i]
+                Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i]
+                D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze()
+                ClosestVerts[Index_points == (i + 1)] = Current_Part_ClosestVertInds[
+                    np.argmin(D, axis=0)
+                ]
+        ClosestVertsTransformed = self.PDIST_transform[ClosestVerts.astype(int) - 1]
+        ClosestVertsTransformed[ClosestVerts < 0] = 0
+        return ClosestVertsTransformed
+
+    def findClosestVertsCse(self, embedding, py, px, mask, mesh_name):
+        mesh_vertex_embeddings = self.embedder(mesh_name)
+        pixel_embeddings = embedding[:, py, px].t().to(device="cuda")
+        mask_vals = mask[py, px]
+        edm = squared_euclidean_distance_matrix(pixel_embeddings, mesh_vertex_embeddings)
+        vertex_indices = edm.argmin(dim=1).cpu()
+        vertex_indices[mask_vals <= 0] = -1
+        return vertex_indices
+
+    def findAllClosestVertsGT(self, gt):
+        #
+        I_gt = np.array(gt["dp_I"])
+        U_gt = np.array(gt["dp_U"])
+        V_gt = np.array(gt["dp_V"])
+        #
+        # print(I_gt)
+        #
+        ClosestVertsGT = np.ones(I_gt.shape) * -1
+        for i in np.arange(24):
+            if (i + 1) in I_gt:
+                UVs = np.array([U_gt[I_gt == (i + 1)], V_gt[I_gt == (i + 1)]])
+                Current_Part_UVs = self.Part_UVs[i]
+                Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i]
+                D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze()
+                ClosestVertsGT[I_gt == (i + 1)] = Current_Part_ClosestVertInds[np.argmin(D, axis=0)]
+        #
+        ClosestVertsGTTransformed = self.PDIST_transform[ClosestVertsGT.astype(int) - 1]
+        ClosestVertsGTTransformed[ClosestVertsGT < 0] = 0
+        return ClosestVertsGT, ClosestVertsGTTransformed
+
+    def getDistancesCse(self, cVertsGT, cVerts, mesh_name):
+        geodists_vertices = torch.ones_like(cVertsGT) * float("inf")
+        selected = (cVertsGT >= 0) * (cVerts >= 0)
+        mesh = create_mesh(mesh_name, "cpu")
+        geodists_vertices[selected] = mesh.geodists[cVertsGT[selected], cVerts[selected]]
+        return geodists_vertices.numpy()
+
+    def getDistancesUV(self, cVertsGT, cVerts):
+        #
+        n = 27554
+        dists = []
+        for d in range(len(cVertsGT)):
+            if cVertsGT[d] > 0:
+                if cVerts[d] > 0:
+                    i = cVertsGT[d] - 1
+                    j = cVerts[d] - 1
+                    if j == i:
+                        dists.append(0)
+                    elif j > i:
+                        ccc = i
+                        i = j
+                        j = ccc
+                        i = n - i - 1
+                        j = n - j - 1
+                        k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1
+                        k = (n * n - n) / 2 - k - 1
+                        dists.append(self.Pdist_matrix[int(k)][0])
+                    else:
+                        i = n - i - 1
+                        j = n - j - 1
+                        k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1
+                        k = (n * n - n) / 2 - k - 1
+                        dists.append(self.Pdist_matrix[int(k)][0])
+                else:
+                    dists.append(np.inf)
+        return np.atleast_1d(np.array(dists).squeeze())
+
+
+class Params:
+    """
+    Params for coco evaluation api
+    """
+
+    def setDetParams(self):
+        self.imgIds = []
+        self.catIds = []
+        # np.arange causes trouble.  the data point on arange is slightly larger than the true value
+        self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
+        self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
+        self.maxDets = [1, 10, 100]
+        self.areaRng = [
+            [0**2, 1e5**2],
+            [0**2, 32**2],
+            [32**2, 96**2],
+            [96**2, 1e5**2],
+        ]
+        self.areaRngLbl = ["all", "small", "medium", "large"]
+        self.useCats = 1
+
+    def setKpParams(self):
+        self.imgIds = []
+        self.catIds = []
+        # np.arange causes trouble.  the data point on arange is slightly larger than the true value
+        self.iouThrs = np.linspace(0.5, 0.95, np.round((0.95 - 0.5) / 0.05) + 1, endpoint=True)
+        self.recThrs = np.linspace(0.0, 1.00, np.round((1.00 - 0.0) / 0.01) + 1, endpoint=True)
+        self.maxDets = [20]
+        self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]]
+        self.areaRngLbl = ["all", "medium", "large"]
+        self.useCats = 1
+
+    def setUvParams(self):
+        self.imgIds = []
+        self.catIds = []
+        self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
+        self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
+        self.maxDets = [20]
+        self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]]
+        self.areaRngLbl = ["all", "medium", "large"]
+        self.useCats = 1
+
+    def __init__(self, iouType="segm"):
+        if iouType == "segm" or iouType == "bbox":
+            self.setDetParams()
+        elif iouType == "keypoints":
+            self.setKpParams()
+        elif iouType == "densepose":
+            self.setUvParams()
+        else:
+            raise Exception("iouType not supported")
+        self.iouType = iouType
+        # useSegm is deprecated
+        self.useSegm = None
diff --git a/densepose/evaluation/evaluator.py b/densepose/evaluation/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5d1d789bbe4b8791aa8529518ba1b964d31daca
--- /dev/null
+++ b/densepose/evaluation/evaluator.py
@@ -0,0 +1,421 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import contextlib
+import copy
+import io
+import itertools
+import logging
+import numpy as np
+import os
+from collections import OrderedDict
+from typing import Dict, Iterable, List, Optional
+import pycocotools.mask as mask_utils
+import torch
+from pycocotools.coco import COCO
+from tabulate import tabulate
+
+from detectron2.config import CfgNode
+from detectron2.data import MetadataCatalog
+from detectron2.evaluation import DatasetEvaluator
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import gather, get_rank, is_main_process, synchronize
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import create_small_table
+
+from densepose.converters import ToChartResultConverter, ToMaskConverter
+from densepose.data.datasets.coco import maybe_filter_and_map_categories_cocoapi
+from densepose.structures import (
+    DensePoseChartPredictorOutput,
+    DensePoseEmbeddingPredictorOutput,
+    quantize_densepose_chart_result,
+)
+
+from .densepose_coco_evaluation import DensePoseCocoEval, DensePoseEvalMode
+from .mesh_alignment_evaluator import MeshAlignmentEvaluator
+from .tensor_storage import (
+    SingleProcessFileTensorStorage,
+    SingleProcessRamTensorStorage,
+    SingleProcessTensorStorage,
+    SizeData,
+    storage_gather,
+)
+
+
+class DensePoseCOCOEvaluator(DatasetEvaluator):
+    def __init__(
+        self,
+        dataset_name,
+        distributed,
+        output_dir=None,
+        evaluator_type: str = "iuv",
+        min_iou_threshold: float = 0.5,
+        storage: Optional[SingleProcessTensorStorage] = None,
+        embedder=None,
+        should_evaluate_mesh_alignment: bool = False,
+        mesh_alignment_mesh_names: Optional[List[str]] = None,
+    ):
+        self._embedder = embedder
+        self._distributed = distributed
+        self._output_dir = output_dir
+        self._evaluator_type = evaluator_type
+        self._storage = storage
+        self._should_evaluate_mesh_alignment = should_evaluate_mesh_alignment
+
+        assert not (
+            should_evaluate_mesh_alignment and embedder is None
+        ), "Mesh alignment evaluation is activated, but no vertex embedder provided!"
+        if should_evaluate_mesh_alignment:
+            self._mesh_alignment_evaluator = MeshAlignmentEvaluator(
+                embedder,
+                mesh_alignment_mesh_names,
+            )
+
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        self._min_threshold = min_iou_threshold
+        json_file = PathManager.get_local_path(self._metadata.json_file)
+        with contextlib.redirect_stdout(io.StringIO()):
+            self._coco_api = COCO(json_file)
+        maybe_filter_and_map_categories_cocoapi(dataset_name, self._coco_api)
+
+    def reset(self):
+        self._predictions = []
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a COCO model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+                The :class:`Instances` object needs to have `densepose` field.
+        """
+        for input, output in zip(inputs, outputs):
+            instances = output["instances"].to(self._cpu_device)
+            if not instances.has("pred_densepose"):
+                continue
+            prediction_list = prediction_to_dict(
+                instances,
+                input["image_id"],
+                self._embedder,
+                self._metadata.class_to_mesh_name,
+                self._storage is not None,
+            )
+            if self._storage is not None:
+                for prediction_dict in prediction_list:
+                    dict_to_store = {}
+                    for field_name in self._storage.data_schema:
+                        dict_to_store[field_name] = prediction_dict[field_name]
+                    record_id = self._storage.put(dict_to_store)
+                    prediction_dict["record_id"] = record_id
+                    prediction_dict["rank"] = get_rank()
+                    for field_name in self._storage.data_schema:
+                        del prediction_dict[field_name]
+            self._predictions.extend(prediction_list)
+
+    def evaluate(self, img_ids=None):
+        if self._distributed:
+            synchronize()
+            predictions = gather(self._predictions)
+            predictions = list(itertools.chain(*predictions))
+        else:
+            predictions = self._predictions
+
+        multi_storage = storage_gather(self._storage) if self._storage is not None else None
+
+        if not is_main_process():
+            return
+        return copy.deepcopy(self._eval_predictions(predictions, multi_storage, img_ids))
+
+    def _eval_predictions(self, predictions, multi_storage=None, img_ids=None):
+        """
+        Evaluate predictions on densepose.
+        Return results with the metrics of the tasks.
+        """
+        self._logger.info("Preparing results for COCO format ...")
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "coco_densepose_predictions.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(predictions, f)
+
+        self._logger.info("Evaluating predictions ...")
+        res = OrderedDict()
+        results_gps, results_gpsm, results_segm = _evaluate_predictions_on_coco(
+            self._coco_api,
+            predictions,
+            multi_storage,
+            self._embedder,
+            class_names=self._metadata.get("thing_classes"),
+            min_threshold=self._min_threshold,
+            img_ids=img_ids,
+        )
+        res["densepose_gps"] = results_gps
+        res["densepose_gpsm"] = results_gpsm
+        res["densepose_segm"] = results_segm
+        if self._should_evaluate_mesh_alignment:
+            res["densepose_mesh_alignment"] = self._evaluate_mesh_alignment()
+        return res
+
+    def _evaluate_mesh_alignment(self):
+        self._logger.info("Mesh alignment evaluation ...")
+        mean_ge, mean_gps, per_mesh_metrics = self._mesh_alignment_evaluator.evaluate()
+        results = {
+            "GE": mean_ge * 100,
+            "GPS": mean_gps * 100,
+        }
+        mesh_names = set()
+        for metric_name in per_mesh_metrics:
+            for mesh_name, value in per_mesh_metrics[metric_name].items():
+                results[f"{metric_name}-{mesh_name}"] = value * 100
+                mesh_names.add(mesh_name)
+        self._print_mesh_alignment_results(results, mesh_names)
+        return results
+
+    def _print_mesh_alignment_results(self, results: Dict[str, float], mesh_names: Iterable[str]):
+        self._logger.info("Evaluation results for densepose, mesh alignment:")
+        self._logger.info(f'| {"Mesh":13s} | {"GErr":7s} | {"GPS":7s} |')
+        self._logger.info("| :-----------: | :-----: | :-----: |")
+        for mesh_name in mesh_names:
+            ge_key = f"GE-{mesh_name}"
+            ge_str = f"{results[ge_key]:.4f}" if ge_key in results else " "
+            gps_key = f"GPS-{mesh_name}"
+            gps_str = f"{results[gps_key]:.4f}" if gps_key in results else " "
+            self._logger.info(f"| {mesh_name:13s} | {ge_str:7s} | {gps_str:7s} |")
+        self._logger.info("| :-------------------------------: |")
+        ge_key = "GE"
+        ge_str = f"{results[ge_key]:.4f}" if ge_key in results else " "
+        gps_key = "GPS"
+        gps_str = f"{results[gps_key]:.4f}" if gps_key in results else " "
+        self._logger.info(f'| {"MEAN":13s} | {ge_str:7s} | {gps_str:7s} |')
+
+
+def prediction_to_dict(instances, img_id, embedder, class_to_mesh_name, use_storage):
+    """
+    Args:
+        instances (Instances): the output of the model
+        img_id (str): the image id in COCO
+
+    Returns:
+        list[dict]: the results in densepose evaluation format
+    """
+    scores = instances.scores.tolist()
+    classes = instances.pred_classes.tolist()
+    raw_boxes_xywh = BoxMode.convert(
+        instances.pred_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+    )
+
+    if isinstance(instances.pred_densepose, DensePoseEmbeddingPredictorOutput):
+        results_densepose = densepose_cse_predictions_to_dict(
+            instances, embedder, class_to_mesh_name, use_storage
+        )
+    elif isinstance(instances.pred_densepose, DensePoseChartPredictorOutput):
+        if not use_storage:
+            results_densepose = densepose_chart_predictions_to_dict(instances)
+        else:
+            results_densepose = densepose_chart_predictions_to_storage_dict(instances)
+
+    results = []
+    for k in range(len(instances)):
+        result = {
+            "image_id": img_id,
+            "category_id": classes[k],
+            "bbox": raw_boxes_xywh[k].tolist(),
+            "score": scores[k],
+        }
+        results.append({**result, **results_densepose[k]})
+    return results
+
+
+def densepose_chart_predictions_to_dict(instances):
+    segmentations = ToMaskConverter.convert(
+        instances.pred_densepose, instances.pred_boxes, instances.image_size
+    )
+
+    results = []
+    for k in range(len(instances)):
+        densepose_results_quantized = quantize_densepose_chart_result(
+            ToChartResultConverter.convert(instances.pred_densepose[k], instances.pred_boxes[k])
+        )
+        densepose_results_quantized.labels_uv_uint8 = (
+            densepose_results_quantized.labels_uv_uint8.cpu()
+        )
+        segmentation = segmentations.tensor[k]
+        segmentation_encoded = mask_utils.encode(
+            np.require(segmentation.numpy(), dtype=np.uint8, requirements=["F"])
+        )
+        segmentation_encoded["counts"] = segmentation_encoded["counts"].decode("utf-8")
+        result = {
+            "densepose": densepose_results_quantized,
+            "segmentation": segmentation_encoded,
+        }
+        results.append(result)
+    return results
+
+
+def densepose_chart_predictions_to_storage_dict(instances):
+    results = []
+    for k in range(len(instances)):
+        densepose_predictor_output = instances.pred_densepose[k]
+        result = {
+            "coarse_segm": densepose_predictor_output.coarse_segm.squeeze(0).cpu(),
+            "fine_segm": densepose_predictor_output.fine_segm.squeeze(0).cpu(),
+            "u": densepose_predictor_output.u.squeeze(0).cpu(),
+            "v": densepose_predictor_output.v.squeeze(0).cpu(),
+        }
+        results.append(result)
+    return results
+
+
+def densepose_cse_predictions_to_dict(instances, embedder, class_to_mesh_name, use_storage):
+    results = []
+    for k in range(len(instances)):
+        cse = instances.pred_densepose[k]
+        results.append(
+            {
+                "coarse_segm": cse.coarse_segm[0].cpu(),
+                "embedding": cse.embedding[0].cpu(),
+            }
+        )
+    return results
+
+
+def _evaluate_predictions_on_coco(
+    coco_gt,
+    coco_results,
+    multi_storage=None,
+    embedder=None,
+    class_names=None,
+    min_threshold: float = 0.5,
+    img_ids=None,
+):
+    logger = logging.getLogger(__name__)
+
+    densepose_metrics = _get_densepose_metrics(min_threshold)
+    if len(coco_results) == 0:  # cocoapi does not handle empty results very well
+        logger.warn("No predictions from the model! Set scores to -1")
+        results_gps = {metric: -1 for metric in densepose_metrics}
+        results_gpsm = {metric: -1 for metric in densepose_metrics}
+        results_segm = {metric: -1 for metric in densepose_metrics}
+        return results_gps, results_gpsm, results_segm
+
+    coco_dt = coco_gt.loadRes(coco_results)
+
+    results = []
+    for eval_mode_name in ["GPS", "GPSM", "IOU"]:
+        eval_mode = getattr(DensePoseEvalMode, eval_mode_name)
+        coco_eval = DensePoseCocoEval(
+            coco_gt, coco_dt, "densepose", multi_storage, embedder, dpEvalMode=eval_mode
+        )
+        result = _derive_results_from_coco_eval(
+            coco_eval, eval_mode_name, densepose_metrics, class_names, min_threshold, img_ids
+        )
+        results.append(result)
+    return results
+
+
+def _get_densepose_metrics(min_threshold: float = 0.5):
+    metrics = ["AP"]
+    if min_threshold <= 0.201:
+        metrics += ["AP20"]
+    if min_threshold <= 0.301:
+        metrics += ["AP30"]
+    if min_threshold <= 0.401:
+        metrics += ["AP40"]
+    metrics.extend(["AP50", "AP75", "APm", "APl", "AR", "AR50", "AR75", "ARm", "ARl"])
+    return metrics
+
+
+def _derive_results_from_coco_eval(
+    coco_eval, eval_mode_name, metrics, class_names, min_threshold: float, img_ids
+):
+    if img_ids is not None:
+        coco_eval.params.imgIds = img_ids
+    coco_eval.params.iouThrs = np.linspace(
+        min_threshold, 0.95, int(np.round((0.95 - min_threshold) / 0.05)) + 1, endpoint=True
+    )
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
+    logger = logging.getLogger(__name__)
+    logger.info(
+        f"Evaluation results for densepose, {eval_mode_name} metric: \n"
+        + create_small_table(results)
+    )
+    if class_names is None or len(class_names) <= 1:
+        return results
+
+    # Compute per-category AP, the same way as it is done in D2
+    # (see detectron2/evaluation/coco_evaluation.py):
+    precisions = coco_eval.eval["precision"]
+    # precision has dims (iou, recall, cls, area range, max dets)
+    assert len(class_names) == precisions.shape[2]
+
+    results_per_category = []
+    for idx, name in enumerate(class_names):
+        # area range index 0: all area ranges
+        # max dets index -1: typically 100 per image
+        precision = precisions[:, :, idx, 0, -1]
+        precision = precision[precision > -1]
+        ap = np.mean(precision) if precision.size else float("nan")
+        results_per_category.append((f"{name}", float(ap * 100)))
+
+    # tabulate it
+    n_cols = min(6, len(results_per_category) * 2)
+    results_flatten = list(itertools.chain(*results_per_category))
+    results_2d = itertools.zip_longest(*[results_flatten[i::n_cols] for i in range(n_cols)])
+    table = tabulate(
+        results_2d,
+        tablefmt="pipe",
+        floatfmt=".3f",
+        headers=["category", "AP"] * (n_cols // 2),
+        numalign="left",
+    )
+    logger.info(f"Per-category {eval_mode_name} AP: \n" + table)
+
+    results.update({"AP-" + name: ap for name, ap in results_per_category})
+    return results
+
+
+def build_densepose_evaluator_storage(cfg: CfgNode, output_folder: str):
+    storage_spec = cfg.DENSEPOSE_EVALUATION.STORAGE
+    if storage_spec == "none":
+        return None
+    evaluator_type = cfg.DENSEPOSE_EVALUATION.TYPE
+    # common output tensor sizes
+    hout = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
+    wout = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
+    n_csc = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+    # specific output tensors
+    if evaluator_type == "iuv":
+        n_fsc = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
+        schema = {
+            "coarse_segm": SizeData(dtype="float32", shape=(n_csc, hout, wout)),
+            "fine_segm": SizeData(dtype="float32", shape=(n_fsc, hout, wout)),
+            "u": SizeData(dtype="float32", shape=(n_fsc, hout, wout)),
+            "v": SizeData(dtype="float32", shape=(n_fsc, hout, wout)),
+        }
+    elif evaluator_type == "cse":
+        embed_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+        schema = {
+            "coarse_segm": SizeData(dtype="float32", shape=(n_csc, hout, wout)),
+            "embedding": SizeData(dtype="float32", shape=(embed_size, hout, wout)),
+        }
+    else:
+        raise ValueError(f"Unknown evaluator type: {evaluator_type}")
+    # storage types
+    if storage_spec == "ram":
+        storage = SingleProcessRamTensorStorage(schema, io.BytesIO())
+    elif storage_spec == "file":
+        fpath = os.path.join(output_folder, f"DensePoseEvaluatorStorage.{get_rank()}.bin")
+        PathManager.mkdirs(output_folder)
+        storage = SingleProcessFileTensorStorage(schema, fpath, "wb")
+    else:
+        raise ValueError(f"Unknown storage specification: {storage_spec}")
+    return storage
diff --git a/densepose/evaluation/mesh_alignment_evaluator.py b/densepose/evaluation/mesh_alignment_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d67c1a88a56332fb708c4618a34e96900926083
--- /dev/null
+++ b/densepose/evaluation/mesh_alignment_evaluator.py
@@ -0,0 +1,66 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import json
+import logging
+from typing import List, Optional
+import torch
+from torch import nn
+
+from detectron2.utils.file_io import PathManager
+
+from densepose.structures.mesh import create_mesh
+
+
+class MeshAlignmentEvaluator:
+    """
+    Class for evaluation of 3D mesh alignment based on the learned vertex embeddings
+    """
+
+    def __init__(self, embedder: nn.Module, mesh_names: Optional[List[str]]):
+        self.embedder = embedder
+        # use the provided mesh names if not None and not an empty list
+        self.mesh_names = mesh_names if mesh_names else embedder.mesh_names
+        self.logger = logging.getLogger(__name__)
+        with PathManager.open(
+            "https://dl.fbaipublicfiles.com/densepose/data/cse/mesh_keyvertices_v0.json", "r"
+        ) as f:
+            self.mesh_keyvertices = json.load(f)
+
+    def evaluate(self):
+        ge_per_mesh = {}
+        gps_per_mesh = {}
+        for mesh_name_1 in self.mesh_names:
+            avg_errors = []
+            avg_gps = []
+            embeddings_1 = self.embedder(mesh_name_1)
+            keyvertices_1 = self.mesh_keyvertices[mesh_name_1]
+            keyvertex_names_1 = list(keyvertices_1.keys())
+            keyvertex_indices_1 = [keyvertices_1[name] for name in keyvertex_names_1]
+            for mesh_name_2 in self.mesh_names:
+                if mesh_name_1 == mesh_name_2:
+                    continue
+                embeddings_2 = self.embedder(mesh_name_2)
+                keyvertices_2 = self.mesh_keyvertices[mesh_name_2]
+                sim_matrix_12 = embeddings_1[keyvertex_indices_1].mm(embeddings_2.T)
+                vertices_2_matching_keyvertices_1 = sim_matrix_12.argmax(axis=1)
+                mesh_2 = create_mesh(mesh_name_2, embeddings_2.device)
+                geodists = mesh_2.geodists[
+                    vertices_2_matching_keyvertices_1,
+                    [keyvertices_2[name] for name in keyvertex_names_1],
+                ]
+                Current_Mean_Distances = 0.255
+                gps = (-(geodists**2) / (2 * (Current_Mean_Distances**2))).exp()
+                avg_errors.append(geodists.mean().item())
+                avg_gps.append(gps.mean().item())
+
+            ge_mean = torch.as_tensor(avg_errors).mean().item()
+            gps_mean = torch.as_tensor(avg_gps).mean().item()
+            ge_per_mesh[mesh_name_1] = ge_mean
+            gps_per_mesh[mesh_name_1] = gps_mean
+        ge_mean_global = torch.as_tensor(list(ge_per_mesh.values())).mean().item()
+        gps_mean_global = torch.as_tensor(list(gps_per_mesh.values())).mean().item()
+        per_mesh_metrics = {
+            "GE": ge_per_mesh,
+            "GPS": gps_per_mesh,
+        }
+        return ge_mean_global, gps_mean_global, per_mesh_metrics
diff --git a/densepose/evaluation/tensor_storage.py b/densepose/evaluation/tensor_storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..db57c6ac73a423e39b1ed2e5a4a1f824aa233737
--- /dev/null
+++ b/densepose/evaluation/tensor_storage.py
@@ -0,0 +1,239 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import io
+import numpy as np
+import os
+from dataclasses import dataclass
+from functools import reduce
+from operator import mul
+from typing import BinaryIO, Dict, Optional, Tuple
+import torch
+
+from detectron2.utils.comm import gather, get_rank
+from detectron2.utils.file_io import PathManager
+
+
+@dataclass
+class SizeData:
+    dtype: str
+    shape: Tuple[int]
+
+
+def _calculate_record_field_size_b(data_schema: Dict[str, SizeData], field_name: str) -> int:
+    schema = data_schema[field_name]
+    element_size_b = np.dtype(schema.dtype).itemsize
+    record_field_size_b = reduce(mul, schema.shape) * element_size_b
+    return record_field_size_b
+
+
+def _calculate_record_size_b(data_schema: Dict[str, SizeData]) -> int:
+    record_size_b = 0
+    for field_name in data_schema:
+        record_field_size_b = _calculate_record_field_size_b(data_schema, field_name)
+        record_size_b += record_field_size_b
+    return record_size_b
+
+
+def _calculate_record_field_sizes_b(data_schema: Dict[str, SizeData]) -> Dict[str, int]:
+    field_sizes_b = {}
+    for field_name in data_schema:
+        field_sizes_b[field_name] = _calculate_record_field_size_b(data_schema, field_name)
+    return field_sizes_b
+
+
+class SingleProcessTensorStorage:
+    """
+    Compact tensor storage to keep tensor data of predefined size and type.
+    """
+
+    def __init__(self, data_schema: Dict[str, SizeData], storage_impl: BinaryIO):
+        """
+        Construct tensor storage based on information on data shape and size.
+        Internally uses numpy to interpret the type specification.
+        The storage must support operations `seek(offset, whence=os.SEEK_SET)` and
+        `read(size)` to be able to perform the `get` operation.
+        The storage must support operation `write(bytes)` to be able to perform
+        the `put` operation.
+
+        Args:
+            data_schema (dict: str -> SizeData): dictionary which maps tensor name
+                to its size data (shape and data type), e.g.
+                ```
+                {
+                  "coarse_segm": SizeData(dtype="float32", shape=(112, 112)),
+                  "embedding": SizeData(dtype="float32", shape=(16, 112, 112)),
+                }
+                ```
+            storage_impl (BinaryIO): io instance that handles file-like seek, read
+                and write operations, e.g. a file handle or a memory buffer like io.BytesIO
+        """
+        self.data_schema = data_schema
+        self.record_size_b = _calculate_record_size_b(data_schema)
+        self.record_field_sizes_b = _calculate_record_field_sizes_b(data_schema)
+        self.storage_impl = storage_impl
+        self.next_record_id = 0
+
+    def get(self, record_id: int) -> Dict[str, torch.Tensor]:
+        """
+        Load tensors from the storage by record ID
+
+        Args:
+            record_id (int): Record ID, for which to load the data
+
+        Return:
+            dict: str -> tensor: tensor name mapped to tensor data, recorded under the provided ID
+        """
+        self.storage_impl.seek(record_id * self.record_size_b, os.SEEK_SET)
+        data_bytes = self.storage_impl.read(self.record_size_b)
+        assert len(data_bytes) == self.record_size_b, (
+            f"Expected data size {self.record_size_b} B could not be read: "
+            f"got {len(data_bytes)} B"
+        )
+        record = {}
+        cur_idx = 0
+        # it's important to read and write in the same order
+        for field_name in sorted(self.data_schema):
+            schema = self.data_schema[field_name]
+            field_size_b = self.record_field_sizes_b[field_name]
+            chunk = data_bytes[cur_idx : cur_idx + field_size_b]
+            data_np = np.frombuffer(
+                chunk, dtype=schema.dtype, count=reduce(mul, schema.shape)
+            ).reshape(schema.shape)
+            record[field_name] = torch.from_numpy(data_np)
+            cur_idx += field_size_b
+        return record
+
+    def put(self, data: Dict[str, torch.Tensor]) -> int:
+        """
+        Store tensors in the storage
+
+        Args:
+            data (dict: str -> tensor): data to store, a dictionary which maps
+                tensor names into tensors; tensor shapes must match those specified
+                in data schema.
+        Return:
+            int: record ID, under which the data is stored
+        """
+        # it's important to read and write in the same order
+        for field_name in sorted(self.data_schema):
+            assert (
+                field_name in data
+            ), f"Field '{field_name}' not present in data: data keys are {data.keys()}"
+            value = data[field_name]
+            assert value.shape == self.data_schema[field_name].shape, (
+                f"Mismatched tensor shapes for field '{field_name}': "
+                f"expected {self.data_schema[field_name].shape}, got {value.shape}"
+            )
+            data_bytes = value.cpu().numpy().tobytes()
+            assert len(data_bytes) == self.record_field_sizes_b[field_name], (
+                f"Expected field {field_name} to be of size "
+                f"{self.record_field_sizes_b[field_name]} B, got {len(data_bytes)} B"
+            )
+            self.storage_impl.write(data_bytes)
+        record_id = self.next_record_id
+        self.next_record_id += 1
+        return record_id
+
+
+class SingleProcessFileTensorStorage(SingleProcessTensorStorage):
+    """
+    Implementation of a single process tensor storage which stores data in a file
+    """
+
+    def __init__(self, data_schema: Dict[str, SizeData], fpath: str, mode: str):
+        self.fpath = fpath
+        assert "b" in mode, f"Tensor storage should be opened in binary mode, got '{mode}'"
+        if "w" in mode:
+            # pyre-fixme[6]: For 2nd argument expected `Union[typing_extensions.Liter...
+            file_h = PathManager.open(fpath, mode)
+        elif "r" in mode:
+            local_fpath = PathManager.get_local_path(fpath)
+            file_h = open(local_fpath, mode)
+        else:
+            raise ValueError(f"Unsupported file mode {mode}, supported modes: rb, wb")
+        super().__init__(data_schema, file_h)  # pyre-ignore[6]
+
+
+class SingleProcessRamTensorStorage(SingleProcessTensorStorage):
+    """
+    Implementation of a single process tensor storage which stores data in RAM
+    """
+
+    def __init__(self, data_schema: Dict[str, SizeData], buf: io.BytesIO):
+        super().__init__(data_schema, buf)
+
+
+class MultiProcessTensorStorage:
+    """
+    Representation of a set of tensor storages created by individual processes,
+    allows to access those storages from a single owner process. The storages
+    should either be shared or broadcasted to the owner process.
+    The processes are identified by their rank, data is uniquely defined by
+    the rank of the process and the record ID.
+    """
+
+    def __init__(self, rank_to_storage: Dict[int, SingleProcessTensorStorage]):
+        self.rank_to_storage = rank_to_storage
+
+    def get(self, rank: int, record_id: int) -> Dict[str, torch.Tensor]:
+        storage = self.rank_to_storage[rank]
+        return storage.get(record_id)
+
+    def put(self, rank: int, data: Dict[str, torch.Tensor]) -> int:
+        storage = self.rank_to_storage[rank]
+        return storage.put(data)
+
+
+class MultiProcessFileTensorStorage(MultiProcessTensorStorage):
+    def __init__(self, data_schema: Dict[str, SizeData], rank_to_fpath: Dict[int, str], mode: str):
+        rank_to_storage = {
+            rank: SingleProcessFileTensorStorage(data_schema, fpath, mode)
+            for rank, fpath in rank_to_fpath.items()
+        }
+        super().__init__(rank_to_storage)  # pyre-ignore[6]
+
+
+class MultiProcessRamTensorStorage(MultiProcessTensorStorage):
+    def __init__(self, data_schema: Dict[str, SizeData], rank_to_buffer: Dict[int, io.BytesIO]):
+        rank_to_storage = {
+            rank: SingleProcessRamTensorStorage(data_schema, buf)
+            for rank, buf in rank_to_buffer.items()
+        }
+        super().__init__(rank_to_storage)  # pyre-ignore[6]
+
+
+def _ram_storage_gather(
+    storage: SingleProcessRamTensorStorage, dst_rank: int = 0
+) -> Optional[MultiProcessRamTensorStorage]:
+    storage.storage_impl.seek(0, os.SEEK_SET)
+    # TODO: overhead, pickling a bytes object, can just pass bytes in a tensor directly
+    # see detectron2/utils.comm.py
+    data_list = gather(storage.storage_impl.read(), dst=dst_rank)
+    if get_rank() != dst_rank:
+        return None
+    rank_to_buffer = {i: io.BytesIO(data_list[i]) for i in range(len(data_list))}
+    multiprocess_storage = MultiProcessRamTensorStorage(storage.data_schema, rank_to_buffer)
+    return multiprocess_storage
+
+
+def _file_storage_gather(
+    storage: SingleProcessFileTensorStorage,
+    dst_rank: int = 0,
+    mode: str = "rb",
+) -> Optional[MultiProcessFileTensorStorage]:
+    storage.storage_impl.close()
+    fpath_list = gather(storage.fpath, dst=dst_rank)
+    if get_rank() != dst_rank:
+        return None
+    rank_to_fpath = {i: fpath_list[i] for i in range(len(fpath_list))}
+    return MultiProcessFileTensorStorage(storage.data_schema, rank_to_fpath, mode)
+
+
+def storage_gather(
+    storage: SingleProcessTensorStorage, dst_rank: int = 0
+) -> Optional[MultiProcessTensorStorage]:
+    if isinstance(storage, SingleProcessRamTensorStorage):
+        return _ram_storage_gather(storage, dst_rank)
+    elif isinstance(storage, SingleProcessFileTensorStorage):
+        return _file_storage_gather(storage, dst_rank)
+    raise Exception(f"Unsupported storage for gather operation: {storage}")
diff --git a/densepose/modeling/__init__.py b/densepose/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c49f6da0d182cc97f5fe6b21d77c8f8330d3c3d
--- /dev/null
+++ b/densepose/modeling/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .confidence import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType
+from .filter import DensePoseDataFilter
+from .inference import densepose_inference
+from .utils import initialize_module_params
+from .build import (
+    build_densepose_data_filter,
+    build_densepose_embedder,
+    build_densepose_head,
+    build_densepose_losses,
+    build_densepose_predictor,
+)
diff --git a/densepose/modeling/__pycache__/__init__.cpython-310.pyc b/densepose/modeling/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5fec4bfd69de124ac07e5ef3d45e8de49f1140b
Binary files /dev/null and b/densepose/modeling/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/modeling/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0fa1770157b0d1ee73f0db7eddd5c3a7f2bb7e4
Binary files /dev/null and b/densepose/modeling/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/build.cpython-310.pyc b/densepose/modeling/__pycache__/build.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45a0bd3b63aba13a407c2ef9216d33070d7c0227
Binary files /dev/null and b/densepose/modeling/__pycache__/build.cpython-310.pyc differ
diff --git a/densepose/modeling/__pycache__/build.cpython-39.pyc b/densepose/modeling/__pycache__/build.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db12c89ced6bfc6de83866b5a809fd6595ba17db
Binary files /dev/null and b/densepose/modeling/__pycache__/build.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/confidence.cpython-310.pyc b/densepose/modeling/__pycache__/confidence.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54c921ab5be5930fc65513f7fb18050837e304de
Binary files /dev/null and b/densepose/modeling/__pycache__/confidence.cpython-310.pyc differ
diff --git a/densepose/modeling/__pycache__/confidence.cpython-39.pyc b/densepose/modeling/__pycache__/confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb62817f004cbfde75f7e46e482f437c49ade5ab
Binary files /dev/null and b/densepose/modeling/__pycache__/confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/filter.cpython-310.pyc b/densepose/modeling/__pycache__/filter.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f6d86105f52f390c8d5336ee3d5aacd1d8532b6
Binary files /dev/null and b/densepose/modeling/__pycache__/filter.cpython-310.pyc differ
diff --git a/densepose/modeling/__pycache__/filter.cpython-39.pyc b/densepose/modeling/__pycache__/filter.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab2a121d1af5a69ba28bdc4cfbbcbab87f391902
Binary files /dev/null and b/densepose/modeling/__pycache__/filter.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/hrfpn.cpython-310.pyc b/densepose/modeling/__pycache__/hrfpn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dfd4c23fb753f11e54e539ed6f02b588308cf81c
Binary files /dev/null and b/densepose/modeling/__pycache__/hrfpn.cpython-310.pyc differ
diff --git a/densepose/modeling/__pycache__/hrfpn.cpython-39.pyc b/densepose/modeling/__pycache__/hrfpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f31bfda577fb5cff0a3f130b26959cdb9a52dd81
Binary files /dev/null and b/densepose/modeling/__pycache__/hrfpn.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/hrnet.cpython-310.pyc b/densepose/modeling/__pycache__/hrnet.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c304b7f32f286178e2d3f4593d71c577a7198481
Binary files /dev/null and b/densepose/modeling/__pycache__/hrnet.cpython-310.pyc differ
diff --git a/densepose/modeling/__pycache__/hrnet.cpython-39.pyc b/densepose/modeling/__pycache__/hrnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95727d8730d092ef4a18d23b7eedcdc448c789c2
Binary files /dev/null and b/densepose/modeling/__pycache__/hrnet.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/inference.cpython-310.pyc b/densepose/modeling/__pycache__/inference.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d608965fd445bdf9590ba3de0d2ccad67d991c77
Binary files /dev/null and b/densepose/modeling/__pycache__/inference.cpython-310.pyc differ
diff --git a/densepose/modeling/__pycache__/inference.cpython-39.pyc b/densepose/modeling/__pycache__/inference.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..98953d629d1559b1b014171e46ddb6ae4a3f76b1
Binary files /dev/null and b/densepose/modeling/__pycache__/inference.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/test_time_augmentation.cpython-310.pyc b/densepose/modeling/__pycache__/test_time_augmentation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2dc34f9aa3f4fa2dd54ce9f627977048c5e90921
Binary files /dev/null and b/densepose/modeling/__pycache__/test_time_augmentation.cpython-310.pyc differ
diff --git a/densepose/modeling/__pycache__/test_time_augmentation.cpython-39.pyc b/densepose/modeling/__pycache__/test_time_augmentation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c625359ed29ce61a789a3cfa8612136caded1f4
Binary files /dev/null and b/densepose/modeling/__pycache__/test_time_augmentation.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/utils.cpython-310.pyc b/densepose/modeling/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2442032c5bee705e773f228e618054ea5825a42
Binary files /dev/null and b/densepose/modeling/__pycache__/utils.cpython-310.pyc differ
diff --git a/densepose/modeling/__pycache__/utils.cpython-39.pyc b/densepose/modeling/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65e26adbcc671932d87372905e57fb0d298c9df7
Binary files /dev/null and b/densepose/modeling/__pycache__/utils.cpython-39.pyc differ
diff --git a/densepose/modeling/build.py b/densepose/modeling/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb7f54b4a1044bc518d66d89432dd52c79fdf293
--- /dev/null
+++ b/densepose/modeling/build.py
@@ -0,0 +1,87 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Optional
+from torch import nn
+
+from detectron2.config import CfgNode
+
+from .cse.embedder import Embedder
+from .filter import DensePoseDataFilter
+
+
+def build_densepose_predictor(cfg: CfgNode, input_channels: int):
+    """
+    Create an instance of DensePose predictor based on configuration options.
+
+    Args:
+        cfg (CfgNode): configuration options
+        input_channels (int): input tensor size along the channel dimension
+    Return:
+        An instance of DensePose predictor
+    """
+    from .predictors import DENSEPOSE_PREDICTOR_REGISTRY
+
+    predictor_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.PREDICTOR_NAME
+    return DENSEPOSE_PREDICTOR_REGISTRY.get(predictor_name)(cfg, input_channels)
+
+
+def build_densepose_data_filter(cfg: CfgNode):
+    """
+    Build DensePose data filter which selects data for training
+
+    Args:
+        cfg (CfgNode): configuration options
+
+    Return:
+        Callable: list(Tensor), list(Instances) -> list(Tensor), list(Instances)
+        An instance of DensePose filter, which takes feature tensors and proposals
+        as an input and returns filtered features and proposals
+    """
+    dp_filter = DensePoseDataFilter(cfg)
+    return dp_filter
+
+
+def build_densepose_head(cfg: CfgNode, input_channels: int):
+    """
+    Build DensePose head based on configurations options
+
+    Args:
+        cfg (CfgNode): configuration options
+        input_channels (int): input tensor size along the channel dimension
+    Return:
+        An instance of DensePose head
+    """
+    from .roi_heads.registry import ROI_DENSEPOSE_HEAD_REGISTRY
+
+    head_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.NAME
+    return ROI_DENSEPOSE_HEAD_REGISTRY.get(head_name)(cfg, input_channels)
+
+
+def build_densepose_losses(cfg: CfgNode):
+    """
+    Build DensePose loss based on configurations options
+
+    Args:
+        cfg (CfgNode): configuration options
+    Return:
+        An instance of DensePose loss
+    """
+    from .losses import DENSEPOSE_LOSS_REGISTRY
+
+    loss_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.LOSS_NAME
+    return DENSEPOSE_LOSS_REGISTRY.get(loss_name)(cfg)
+
+
+def build_densepose_embedder(cfg: CfgNode) -> Optional[nn.Module]:
+    """
+    Build embedder used to embed mesh vertices into an embedding space.
+    Embedder contains sub-embedders, one for each mesh ID.
+
+    Args:
+        cfg (cfgNode): configuration options
+    Return:
+        Embedding module
+    """
+    if cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS:
+        return Embedder(cfg)
+    return None
diff --git a/densepose/modeling/confidence.py b/densepose/modeling/confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f4a72efec06e055036ba70bc75b2624d20e1e0e
--- /dev/null
+++ b/densepose/modeling/confidence.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from dataclasses import dataclass
+from enum import Enum
+
+from detectron2.config import CfgNode
+
+
+class DensePoseUVConfidenceType(Enum):
+    """
+    Statistical model type for confidence learning, possible values:
+     - "iid_iso": statistically independent identically distributed residuals
+         with anisotropic covariance
+     - "indep_aniso": statistically independent residuals with anisotropic
+         covariances
+    For details, see:
+    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+    """
+
+    # fmt: off
+    IID_ISO     = "iid_iso"
+    INDEP_ANISO = "indep_aniso"
+    # fmt: on
+
+
+@dataclass
+class DensePoseUVConfidenceConfig:
+    """
+    Configuration options for confidence on UV data
+    """
+
+    enabled: bool = False
+    # lower bound on UV confidences
+    epsilon: float = 0.01
+    type: DensePoseUVConfidenceType = DensePoseUVConfidenceType.IID_ISO
+
+
+@dataclass
+class DensePoseSegmConfidenceConfig:
+    """
+    Configuration options for confidence on segmentation
+    """
+
+    enabled: bool = False
+    # lower bound on confidence values
+    epsilon: float = 0.01
+
+
+@dataclass
+class DensePoseConfidenceModelConfig:
+    """
+    Configuration options for confidence models
+    """
+
+    # confidence for U and V values
+    uv_confidence: DensePoseUVConfidenceConfig
+    # segmentation confidence
+    segm_confidence: DensePoseSegmConfidenceConfig
+
+    @staticmethod
+    def from_cfg(cfg: CfgNode) -> "DensePoseConfidenceModelConfig":
+        return DensePoseConfidenceModelConfig(
+            uv_confidence=DensePoseUVConfidenceConfig(
+                enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.ENABLED,
+                epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON,
+                type=DensePoseUVConfidenceType(cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE),
+            ),
+            segm_confidence=DensePoseSegmConfidenceConfig(
+                enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.ENABLED,
+                epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.EPSILON,
+            ),
+        )
diff --git a/densepose/modeling/cse/__init__.py b/densepose/modeling/cse/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2273609cc54fb96d002a49dcd58788060945059
--- /dev/null
+++ b/densepose/modeling/cse/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from .vertex_direct_embedder import VertexDirectEmbedder
+from .vertex_feature_embedder import VertexFeatureEmbedder
+from .embedder import Embedder
diff --git a/densepose/modeling/cse/__pycache__/__init__.cpython-310.pyc b/densepose/modeling/cse/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8b2602e902f4f38fbe8d93fc54bda5516c1ae22
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/cse/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e09a0f5528b0856913c50ede767f6551aaaac90
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/embedder.cpython-310.pyc b/densepose/modeling/cse/__pycache__/embedder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44c3b87d7e04282705df19babf030504992602a8
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/embedder.cpython-310.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/embedder.cpython-39.pyc b/densepose/modeling/cse/__pycache__/embedder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6e549cf8f8056fa44342cc09fd18243981fa80f
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/embedder.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/utils.cpython-310.pyc b/densepose/modeling/cse/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51a83d101500a330f684dcd5f3f6fff14584d70a
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/utils.cpython-310.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/utils.cpython-39.pyc b/densepose/modeling/cse/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..745c50f8cfa593b7962393ac9ca0c2f3c6028b74
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/utils.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/vertex_direct_embedder.cpython-310.pyc b/densepose/modeling/cse/__pycache__/vertex_direct_embedder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cc6a97f189e6025ac722e399bb10192d424fe2d
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/vertex_direct_embedder.cpython-310.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/vertex_direct_embedder.cpython-39.pyc b/densepose/modeling/cse/__pycache__/vertex_direct_embedder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10a6de036159cbc45f2cf95583b099d5b8a66066
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/vertex_direct_embedder.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/vertex_feature_embedder.cpython-310.pyc b/densepose/modeling/cse/__pycache__/vertex_feature_embedder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aaef5ef7fdad732fea5cd3659c3bfa80a5e264d6
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/vertex_feature_embedder.cpython-310.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/vertex_feature_embedder.cpython-39.pyc b/densepose/modeling/cse/__pycache__/vertex_feature_embedder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eac2506ff55193183d6ea3a7bc76e9301d65ff01
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/vertex_feature_embedder.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/embedder.py b/densepose/modeling/cse/embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..56f5cb9860b13aa38b2069e6b25c3f5f71ab1ecc
--- /dev/null
+++ b/densepose/modeling/cse/embedder.py
@@ -0,0 +1,128 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import numpy as np
+import pickle
+from enum import Enum
+from typing import Optional
+import torch
+from torch import nn
+
+from detectron2.config import CfgNode
+from detectron2.utils.file_io import PathManager
+
+from .vertex_direct_embedder import VertexDirectEmbedder
+from .vertex_feature_embedder import VertexFeatureEmbedder
+
+
+class EmbedderType(Enum):
+    """
+    Embedder type which defines how vertices are mapped into the embedding space:
+     - "vertex_direct": direct vertex embedding
+     - "vertex_feature": embedding vertex features
+    """
+
+    VERTEX_DIRECT = "vertex_direct"
+    VERTEX_FEATURE = "vertex_feature"
+
+
+def create_embedder(embedder_spec: CfgNode, embedder_dim: int) -> nn.Module:
+    """
+    Create an embedder based on the provided configuration
+
+    Args:
+        embedder_spec (CfgNode): embedder configuration
+        embedder_dim (int): embedding space dimensionality
+    Return:
+        An embedder instance for the specified configuration
+        Raises ValueError, in case of unexpected  embedder type
+    """
+    embedder_type = EmbedderType(embedder_spec.TYPE)
+    if embedder_type == EmbedderType.VERTEX_DIRECT:
+        embedder = VertexDirectEmbedder(
+            num_vertices=embedder_spec.NUM_VERTICES,
+            embed_dim=embedder_dim,
+        )
+        if embedder_spec.INIT_FILE != "":
+            embedder.load(embedder_spec.INIT_FILE)
+    elif embedder_type == EmbedderType.VERTEX_FEATURE:
+        embedder = VertexFeatureEmbedder(
+            num_vertices=embedder_spec.NUM_VERTICES,
+            feature_dim=embedder_spec.FEATURE_DIM,
+            embed_dim=embedder_dim,
+            train_features=embedder_spec.FEATURES_TRAINABLE,
+        )
+        if embedder_spec.INIT_FILE != "":
+            embedder.load(embedder_spec.INIT_FILE)
+    else:
+        raise ValueError(f"Unexpected embedder type {embedder_type}")
+
+    if not embedder_spec.IS_TRAINABLE:
+        embedder.requires_grad_(False)
+
+    return embedder
+
+
+class Embedder(nn.Module):
+    """
+    Embedder module that serves as a container for embedders to use with different
+    meshes. Extends Module to automatically save / load state dict.
+    """
+
+    DEFAULT_MODEL_CHECKPOINT_PREFIX = "roi_heads.embedder."
+
+    def __init__(self, cfg: CfgNode):
+        """
+        Initialize mesh embedders. An embedder for mesh `i` is stored in a submodule
+        "embedder_{i}".
+
+        Args:
+            cfg (CfgNode): configuration options
+        """
+        super(Embedder, self).__init__()
+        self.mesh_names = set()
+        embedder_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+        logger = logging.getLogger(__name__)
+        for mesh_name, embedder_spec in cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS.items():
+            logger.info(f"Adding embedder embedder_{mesh_name} with spec {embedder_spec}")
+            self.add_module(f"embedder_{mesh_name}", create_embedder(embedder_spec, embedder_dim))
+            self.mesh_names.add(mesh_name)
+        if cfg.MODEL.WEIGHTS != "":
+            self.load_from_model_checkpoint(cfg.MODEL.WEIGHTS)
+
+    def load_from_model_checkpoint(self, fpath: str, prefix: Optional[str] = None):
+        if prefix is None:
+            prefix = Embedder.DEFAULT_MODEL_CHECKPOINT_PREFIX
+        state_dict = None
+        if fpath.endswith(".pkl"):
+            with PathManager.open(fpath, "rb") as hFile:
+                state_dict = pickle.load(hFile, encoding="latin1")
+        else:
+            with PathManager.open(fpath, "rb") as hFile:
+                state_dict = torch.load(hFile, map_location=torch.device("cpu"))
+        if state_dict is not None and "model" in state_dict:
+            state_dict_local = {}
+            for key in state_dict["model"]:
+                if key.startswith(prefix):
+                    v_key = state_dict["model"][key]
+                    if isinstance(v_key, np.ndarray):
+                        v_key = torch.from_numpy(v_key)
+                    state_dict_local[key[len(prefix) :]] = v_key
+            # non-strict loading to finetune on different meshes
+            self.load_state_dict(state_dict_local, strict=False)
+
+    def forward(self, mesh_name: str) -> torch.Tensor:
+        """
+        Produce vertex embeddings for the specific mesh; vertex embeddings are
+        a tensor of shape [N, D] where:
+            N = number of vertices
+            D = number of dimensions in the embedding space
+        Args:
+            mesh_name (str): name of a mesh for which to obtain vertex embeddings
+        Return:
+            Vertex embeddings, a tensor of shape [N, D]
+        """
+        return getattr(self, f"embedder_{mesh_name}")()
+
+    def has_embeddings(self, mesh_name: str) -> bool:
+        return hasattr(self, f"embedder_{mesh_name}")
diff --git a/densepose/modeling/cse/utils.py b/densepose/modeling/cse/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e70d25df7c8e2c1c408866cf7a6f0156b64114a
--- /dev/null
+++ b/densepose/modeling/cse/utils.py
@@ -0,0 +1,81 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import torch
+from torch.nn import functional as F
+
+
+def squared_euclidean_distance_matrix(pts1: torch.Tensor, pts2: torch.Tensor) -> torch.Tensor:
+    """
+    Get squared Euclidean Distance Matrix
+    Computes pairwise squared Euclidean distances between points
+
+    Args:
+        pts1: Tensor [M x D], M is the number of points, D is feature dimensionality
+        pts2: Tensor [N x D], N is the number of points, D is feature dimensionality
+
+    Return:
+        Tensor [M, N]: matrix of squared Euclidean distances; at index (m, n)
+            it contains || pts1[m] - pts2[n] ||^2
+    """
+    edm = torch.mm(-2 * pts1, pts2.t())
+    edm += (pts1 * pts1).sum(1, keepdim=True) + (pts2 * pts2).sum(1, keepdim=True).t()
+    return edm.contiguous()
+
+
+def normalize_embeddings(embeddings: torch.Tensor, epsilon: float = 1e-6) -> torch.Tensor:
+    """
+    Normalize N D-dimensional embedding vectors arranged in a tensor [N, D]
+
+    Args:
+        embeddings (tensor [N, D]): N D-dimensional embedding vectors
+        epsilon (float): minimum value for a vector norm
+    Return:
+        Normalized embeddings (tensor [N, D]), such that L2 vector norms are all equal to 1.
+    """
+    return embeddings / torch.clamp(embeddings.norm(p=None, dim=1, keepdim=True), min=epsilon)
+
+
+def get_closest_vertices_mask_from_ES(
+    E: torch.Tensor,
+    S: torch.Tensor,
+    h: int,
+    w: int,
+    mesh_vertex_embeddings: torch.Tensor,
+    device: torch.device,
+):
+    """
+    Interpolate Embeddings and Segmentations to the size of a given bounding box,
+    and compute closest vertices and the segmentation mask
+
+    Args:
+        E (tensor [1, D, H, W]): D-dimensional embedding vectors for every point of the
+            default-sized box
+        S (tensor [1, 2, H, W]): 2-dimensional segmentation mask for every point of the
+            default-sized box
+        h (int): height of the target bounding box
+        w (int): width of the target bounding box
+        mesh_vertex_embeddings (tensor [N, D]): vertex embeddings for a chosen mesh
+            N is the number of vertices in the mesh, D is feature dimensionality
+        device (torch.device): device to move the tensors to
+    Return:
+        Closest Vertices (tensor [h, w]), int, for every point of the resulting box
+        Segmentation mask (tensor [h, w]), boolean, for every point of the resulting box
+    """
+    embedding_resized = F.interpolate(E, size=(h, w), mode="bilinear")[0].to(device)
+    coarse_segm_resized = F.interpolate(S, size=(h, w), mode="bilinear")[0].to(device)
+    mask = coarse_segm_resized.argmax(0) > 0
+    closest_vertices = torch.zeros(mask.shape, dtype=torch.long, device=device)
+    all_embeddings = embedding_resized[:, mask].t()
+    size_chunk = 10_000  # Chunking to avoid possible OOM
+    edm = []
+    if len(all_embeddings) == 0:
+        return closest_vertices, mask
+    for chunk in range((len(all_embeddings) - 1) // size_chunk + 1):
+        chunk_embeddings = all_embeddings[size_chunk * chunk : size_chunk * (chunk + 1)]
+        edm.append(
+            torch.argmin(
+                squared_euclidean_distance_matrix(chunk_embeddings, mesh_vertex_embeddings), dim=1
+            )
+        )
+    closest_vertices[mask] = torch.cat(edm)
+    return closest_vertices, mask
diff --git a/densepose/modeling/cse/vertex_direct_embedder.py b/densepose/modeling/cse/vertex_direct_embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d802adf10c18beaedb3bd56963366662ba753f7
--- /dev/null
+++ b/densepose/modeling/cse/vertex_direct_embedder.py
@@ -0,0 +1,64 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import pickle
+import torch
+from torch import nn
+
+from detectron2.utils.file_io import PathManager
+
+from .utils import normalize_embeddings
+
+
+class VertexDirectEmbedder(nn.Module):
+    """
+    Class responsible for embedding vertices. Vertex embeddings take
+    the form of a tensor of size [N, D], where
+        N = number of vertices
+        D = number of dimensions in the embedding space
+    """
+
+    def __init__(self, num_vertices: int, embed_dim: int):
+        """
+        Initialize embedder, set random embeddings
+
+        Args:
+            num_vertices (int): number of vertices to embed
+            embed_dim (int): number of dimensions in the embedding space
+        """
+        super(VertexDirectEmbedder, self).__init__()
+        self.embeddings = nn.Parameter(torch.Tensor(num_vertices, embed_dim))
+        self.reset_parameters()
+
+    @torch.no_grad()
+    def reset_parameters(self):
+        """
+        Reset embeddings to random values
+        """
+        self.embeddings.zero_()
+
+    def forward(self) -> torch.Tensor:
+        """
+        Produce vertex embeddings, a tensor of shape [N, D] where:
+            N = number of vertices
+            D = number of dimensions in the embedding space
+
+        Return:
+           Full vertex embeddings, a tensor of shape [N, D]
+        """
+        return normalize_embeddings(self.embeddings)
+
+    @torch.no_grad()
+    def load(self, fpath: str):
+        """
+        Load data from a file
+
+        Args:
+            fpath (str): file path to load data from
+        """
+        with PathManager.open(fpath, "rb") as hFile:
+            data = pickle.load(hFile)
+            for name in ["embeddings"]:
+                if name in data:
+                    getattr(self, name).copy_(
+                        torch.tensor(data[name]).float().to(device=getattr(self, name).device)
+                    )
diff --git a/densepose/modeling/cse/vertex_feature_embedder.py b/densepose/modeling/cse/vertex_feature_embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9c7709c1eaec8f3e39441aadbc2b749c67874f2
--- /dev/null
+++ b/densepose/modeling/cse/vertex_feature_embedder.py
@@ -0,0 +1,75 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import pickle
+import torch
+from torch import nn
+
+from detectron2.utils.file_io import PathManager
+
+from .utils import normalize_embeddings
+
+
+class VertexFeatureEmbedder(nn.Module):
+    """
+    Class responsible for embedding vertex features. Mapping from
+    feature space to the embedding space is a tensor of size [K, D], where
+        K = number of dimensions in the feature space
+        D = number of dimensions in the embedding space
+    Vertex features is a tensor of size [N, K], where
+        N = number of vertices
+        K = number of dimensions in the feature space
+    Vertex embeddings are computed as F * E = tensor of size [N, D]
+    """
+
+    def __init__(
+        self, num_vertices: int, feature_dim: int, embed_dim: int, train_features: bool = False
+    ):
+        """
+        Initialize embedder, set random embeddings
+
+        Args:
+            num_vertices (int): number of vertices to embed
+            feature_dim (int): number of dimensions in the feature space
+            embed_dim (int): number of dimensions in the embedding space
+            train_features (bool): determines whether vertex features should
+                be trained (default: False)
+        """
+        super(VertexFeatureEmbedder, self).__init__()
+        if train_features:
+            self.features = nn.Parameter(torch.Tensor(num_vertices, feature_dim))
+        else:
+            self.register_buffer("features", torch.Tensor(num_vertices, feature_dim))
+        self.embeddings = nn.Parameter(torch.Tensor(feature_dim, embed_dim))
+        self.reset_parameters()
+
+    @torch.no_grad()
+    def reset_parameters(self):
+        self.features.zero_()
+        self.embeddings.zero_()
+
+    def forward(self) -> torch.Tensor:
+        """
+        Produce vertex embeddings, a tensor of shape [N, D] where:
+            N = number of vertices
+            D = number of dimensions in the embedding space
+
+        Return:
+           Full vertex embeddings, a tensor of shape [N, D]
+        """
+        return normalize_embeddings(torch.mm(self.features, self.embeddings))
+
+    @torch.no_grad()
+    def load(self, fpath: str):
+        """
+        Load data from a file
+
+        Args:
+            fpath (str): file path to load data from
+        """
+        with PathManager.open(fpath, "rb") as hFile:
+            data = pickle.load(hFile)
+            for name in ["features", "embeddings"]:
+                if name in data:
+                    getattr(self, name).copy_(
+                        torch.tensor(data[name]).float().to(device=getattr(self, name).device)
+                    )
diff --git a/densepose/modeling/densepose_checkpoint.py b/densepose/modeling/densepose_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c2b4f2e2cc9c6c798cf1bdb9c38dedc84058bd5
--- /dev/null
+++ b/densepose/modeling/densepose_checkpoint.py
@@ -0,0 +1,35 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from collections import OrderedDict
+
+from detectron2.checkpoint import DetectionCheckpointer
+
+
+def _rename_HRNet_weights(weights):
+    # We detect and  rename HRNet weights for DensePose. 1956 and 1716 are values that are
+    # common to all HRNet pretrained weights, and should be enough to accurately identify them
+    if (
+        len(weights["model"].keys()) == 1956
+        and len([k for k in weights["model"].keys() if k.startswith("stage")]) == 1716
+    ):
+        hrnet_weights = OrderedDict()
+        for k in weights["model"].keys():
+            hrnet_weights["backbone.bottom_up." + str(k)] = weights["model"][k]
+        return {"model": hrnet_weights}
+    else:
+        return weights
+
+
+class DensePoseCheckpointer(DetectionCheckpointer):
+    """
+    Same as :class:`DetectionCheckpointer`, but is able to handle HRNet weights
+    """
+
+    def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
+        super().__init__(model, save_dir, save_to_disk=save_to_disk, **checkpointables)
+
+    def _load_file(self, filename: str) -> object:
+        """
+        Adding hrnet support
+        """
+        weights = super()._load_file(filename)
+        return _rename_HRNet_weights(weights)
diff --git a/densepose/modeling/filter.py b/densepose/modeling/filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..4682b225dbba1ce330c8f4ed6ad14dafcc935e5c
--- /dev/null
+++ b/densepose/modeling/filter.py
@@ -0,0 +1,94 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import List
+import torch
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+from detectron2.structures.boxes import matched_pairwise_iou
+
+
+class DensePoseDataFilter:
+    def __init__(self, cfg: CfgNode):
+        self.iou_threshold = cfg.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD
+        self.keep_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+
+    @torch.no_grad()
+    def __call__(self, features: List[torch.Tensor], proposals_with_targets: List[Instances]):
+        """
+        Filters proposals with targets to keep only the ones relevant for
+        DensePose training
+
+        Args:
+            features (list[Tensor]): input data as a list of features,
+                each feature is a tensor. Axis 0 represents the number of
+                images `N` in the input data; axes 1-3 are channels,
+                height, and width, which may vary between features
+                (e.g., if a feature pyramid is used).
+            proposals_with_targets (list[Instances]): length `N` list of
+                `Instances`. The i-th `Instances` contains instances
+                (proposals, GT) for the i-th input image,
+        Returns:
+            list[Tensor]: filtered features
+            list[Instances]: filtered proposals
+        """
+        proposals_filtered = []
+        # TODO: the commented out code was supposed to correctly deal with situations
+        # where no valid DensePose GT is available for certain images. The corresponding
+        # image features were sliced and proposals were filtered. This led to performance
+        # deterioration, both in terms of runtime and in terms of evaluation results.
+        #
+        # feature_mask = torch.ones(
+        #    len(proposals_with_targets),
+        #    dtype=torch.bool,
+        #    device=features[0].device if len(features) > 0 else torch.device("cpu"),
+        # )
+        for i, proposals_per_image in enumerate(proposals_with_targets):
+            if not proposals_per_image.has("gt_densepose") and (
+                not proposals_per_image.has("gt_masks") or not self.keep_masks
+            ):
+                # feature_mask[i] = 0
+                continue
+            gt_boxes = proposals_per_image.gt_boxes
+            est_boxes = proposals_per_image.proposal_boxes
+            # apply match threshold for densepose head
+            iou = matched_pairwise_iou(gt_boxes, est_boxes)
+            iou_select = iou > self.iou_threshold
+            proposals_per_image = proposals_per_image[iou_select]  # pyre-ignore[6]
+
+            N_gt_boxes = len(proposals_per_image.gt_boxes)
+            assert N_gt_boxes == len(proposals_per_image.proposal_boxes), (
+                f"The number of GT boxes {N_gt_boxes} is different from the "
+                f"number of proposal boxes {len(proposals_per_image.proposal_boxes)}"
+            )
+            # filter out any target without suitable annotation
+            if self.keep_masks:
+                gt_masks = (
+                    proposals_per_image.gt_masks
+                    if hasattr(proposals_per_image, "gt_masks")
+                    else [None] * N_gt_boxes
+                )
+            else:
+                gt_masks = [None] * N_gt_boxes
+            gt_densepose = (
+                proposals_per_image.gt_densepose
+                if hasattr(proposals_per_image, "gt_densepose")
+                else [None] * N_gt_boxes
+            )
+            assert len(gt_masks) == N_gt_boxes
+            assert len(gt_densepose) == N_gt_boxes
+            selected_indices = [
+                i
+                for i, (dp_target, mask_target) in enumerate(zip(gt_densepose, gt_masks))
+                if (dp_target is not None) or (mask_target is not None)
+            ]
+            # if not len(selected_indices):
+            #     feature_mask[i] = 0
+            #     continue
+            if len(selected_indices) != N_gt_boxes:
+                proposals_per_image = proposals_per_image[selected_indices]  # pyre-ignore[6]
+            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes)
+            proposals_filtered.append(proposals_per_image)
+        # features_filtered = [feature[feature_mask] for feature in features]
+        # return features_filtered, proposals_filtered
+        return features, proposals_filtered
diff --git a/densepose/modeling/hrfpn.py b/densepose/modeling/hrfpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..08ec420fa24e1e8f5074baf2e9ae737aff2ab12e
--- /dev/null
+++ b/densepose/modeling/hrfpn.py
@@ -0,0 +1,182 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+MIT License
+Copyright (c) 2019 Microsoft
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.backbone import BACKBONE_REGISTRY
+from detectron2.modeling.backbone.backbone import Backbone
+
+from .hrnet import build_pose_hrnet_backbone
+
+
+class HRFPN(Backbone):
+    """HRFPN (High Resolution Feature Pyramids)
+    Transforms outputs of HRNet backbone so they are suitable for the ROI_heads
+    arXiv: https://arxiv.org/abs/1904.04514
+    Adapted from https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/hrfpn.py
+    Args:
+        bottom_up: (list) output of HRNet
+        in_features (list): names of the input features (output of HRNet)
+        in_channels (list): number of channels for each branch
+        out_channels (int): output channels of feature pyramids
+        n_out_features (int): number of output stages
+        pooling (str): pooling for generating feature pyramids (from {MAX, AVG})
+        share_conv (bool): Have one conv per output, or share one with all the outputs
+    """
+
+    def __init__(
+        self,
+        bottom_up,
+        in_features,
+        n_out_features,
+        in_channels,
+        out_channels,
+        pooling="AVG",
+        share_conv=False,
+    ):
+        super(HRFPN, self).__init__()
+        assert isinstance(in_channels, list)
+        self.bottom_up = bottom_up
+        self.in_features = in_features
+        self.n_out_features = n_out_features
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.share_conv = share_conv
+
+        if self.share_conv:
+            self.fpn_conv = nn.Conv2d(
+                in_channels=out_channels, out_channels=out_channels, kernel_size=3, padding=1
+            )
+        else:
+            self.fpn_conv = nn.ModuleList()
+            for _ in range(self.n_out_features):
+                self.fpn_conv.append(
+                    nn.Conv2d(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        kernel_size=3,
+                        padding=1,
+                    )
+                )
+
+        # Custom change: Replaces a simple bilinear interpolation
+        self.interp_conv = nn.ModuleList()
+        for i in range(len(self.in_features)):
+            self.interp_conv.append(
+                nn.Sequential(
+                    nn.ConvTranspose2d(
+                        in_channels=in_channels[i],
+                        out_channels=in_channels[i],
+                        kernel_size=4,
+                        stride=2**i,
+                        padding=0,
+                        output_padding=0,
+                        bias=False,
+                    ),
+                    nn.BatchNorm2d(in_channels[i], momentum=0.1),
+                    nn.ReLU(inplace=True),
+                )
+            )
+
+        # Custom change: Replaces a couple (reduction conv + pooling) by one conv
+        self.reduction_pooling_conv = nn.ModuleList()
+        for i in range(self.n_out_features):
+            self.reduction_pooling_conv.append(
+                nn.Sequential(
+                    nn.Conv2d(sum(in_channels), out_channels, kernel_size=2**i, stride=2**i),
+                    nn.BatchNorm2d(out_channels, momentum=0.1),
+                    nn.ReLU(inplace=True),
+                )
+            )
+
+        if pooling == "MAX":
+            self.pooling = F.max_pool2d
+        else:
+            self.pooling = F.avg_pool2d
+
+        self._out_features = []
+        self._out_feature_channels = {}
+        self._out_feature_strides = {}
+
+        for i in range(self.n_out_features):
+            self._out_features.append("p%d" % (i + 1))
+            self._out_feature_channels.update({self._out_features[-1]: self.out_channels})
+            self._out_feature_strides.update({self._out_features[-1]: 2 ** (i + 2)})
+
+    # default init_weights for conv(msra) and norm in ConvModule
+    def init_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, a=1)
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, inputs):
+        bottom_up_features = self.bottom_up(inputs)
+        assert len(bottom_up_features) == len(self.in_features)
+        inputs = [bottom_up_features[f] for f in self.in_features]
+
+        outs = []
+        for i in range(len(inputs)):
+            outs.append(self.interp_conv[i](inputs[i]))
+        shape_2 = min(o.shape[2] for o in outs)
+        shape_3 = min(o.shape[3] for o in outs)
+        out = torch.cat([o[:, :, :shape_2, :shape_3] for o in outs], dim=1)
+        outs = []
+        for i in range(self.n_out_features):
+            outs.append(self.reduction_pooling_conv[i](out))
+        for i in range(len(outs)):  # Make shapes consistent
+            outs[-1 - i] = outs[-1 - i][
+                :, :, : outs[-1].shape[2] * 2**i, : outs[-1].shape[3] * 2**i
+            ]
+        outputs = []
+        for i in range(len(outs)):
+            if self.share_conv:
+                outputs.append(self.fpn_conv(outs[i]))
+            else:
+                outputs.append(self.fpn_conv[i](outs[i]))
+
+        assert len(self._out_features) == len(outputs)
+        return dict(zip(self._out_features, outputs))
+
+
+@BACKBONE_REGISTRY.register()
+def build_hrfpn_backbone(cfg, input_shape: ShapeSpec) -> HRFPN:
+
+    in_channels = cfg.MODEL.HRNET.STAGE4.NUM_CHANNELS
+    in_features = ["p%d" % (i + 1) for i in range(cfg.MODEL.HRNET.STAGE4.NUM_BRANCHES)]
+    n_out_features = len(cfg.MODEL.ROI_HEADS.IN_FEATURES)
+    out_channels = cfg.MODEL.HRNET.HRFPN.OUT_CHANNELS
+    hrnet = build_pose_hrnet_backbone(cfg, input_shape)
+    hrfpn = HRFPN(
+        hrnet,
+        in_features,
+        n_out_features,
+        in_channels,
+        out_channels,
+        pooling="AVG",
+        share_conv=False,
+    )
+
+    return hrfpn
diff --git a/densepose/modeling/hrnet.py b/densepose/modeling/hrnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca2467107e8e5a50167de38ef6827fac646d1245
--- /dev/null
+++ b/densepose/modeling/hrnet.py
@@ -0,0 +1,474 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Written by Bin Xiao (leoxiaobin@gmail.com)
+# Modified by Bowen Cheng (bcheng9@illinois.edu)
+# Adapted from https://github.com/HRNet/Higher-HRNet-Human-Pose-Estimation/blob/master/lib/models/pose_higher_hrnet.py  # noqa
+# ------------------------------------------------------------------------------
+
+from __future__ import absolute_import, division, print_function
+import logging
+import torch.nn as nn
+
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.backbone import BACKBONE_REGISTRY
+from detectron2.modeling.backbone.backbone import Backbone
+
+BN_MOMENTUM = 0.1
+logger = logging.getLogger(__name__)
+
+__all__ = ["build_pose_hrnet_backbone", "PoseHigherResolutionNet"]
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """3x3 convolution with padding"""
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class HighResolutionModule(nn.Module):
+    """HighResolutionModule
+    Building block of the PoseHigherResolutionNet (see lower)
+    arXiv: https://arxiv.org/abs/1908.10357
+    Args:
+        num_branches (int): number of branches of the modyle
+        blocks (str): type of block of the module
+        num_blocks (int): number of blocks of the module
+        num_inchannels (int): number of input channels of the module
+        num_channels (list): number of channels of each branch
+        multi_scale_output (bool): only used by the last module of PoseHigherResolutionNet
+    """
+
+    def __init__(
+        self,
+        num_branches,
+        blocks,
+        num_blocks,
+        num_inchannels,
+        num_channels,
+        multi_scale_output=True,
+    ):
+        super(HighResolutionModule, self).__init__()
+        self._check_branches(num_branches, blocks, num_blocks, num_inchannels, num_channels)
+
+        self.num_inchannels = num_inchannels
+        self.num_branches = num_branches
+
+        self.multi_scale_output = multi_scale_output
+
+        self.branches = self._make_branches(num_branches, blocks, num_blocks, num_channels)
+        self.fuse_layers = self._make_fuse_layers()
+        self.relu = nn.ReLU(True)
+
+    def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels):
+        if num_branches != len(num_blocks):
+            error_msg = "NUM_BRANCHES({}) <> NUM_BLOCKS({})".format(num_branches, len(num_blocks))
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if num_branches != len(num_channels):
+            error_msg = "NUM_BRANCHES({}) <> NUM_CHANNELS({})".format(
+                num_branches, len(num_channels)
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if num_branches != len(num_inchannels):
+            error_msg = "NUM_BRANCHES({}) <> NUM_INCHANNELS({})".format(
+                num_branches, len(num_inchannels)
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+    def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1):
+        downsample = None
+        if (
+            stride != 1
+            or self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion
+        ):
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.num_inchannels[branch_index],
+                    num_channels[branch_index] * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM),
+            )
+
+        layers = []
+        layers.append(
+            block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample)
+        )
+        self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion
+        for _ in range(1, num_blocks[branch_index]):
+            layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index]))
+
+        return nn.Sequential(*layers)
+
+    def _make_branches(self, num_branches, block, num_blocks, num_channels):
+        branches = []
+
+        for i in range(num_branches):
+            branches.append(self._make_one_branch(i, block, num_blocks, num_channels))
+
+        return nn.ModuleList(branches)
+
+    def _make_fuse_layers(self):
+        if self.num_branches == 1:
+            return None
+
+        num_branches = self.num_branches
+        num_inchannels = self.num_inchannels
+        fuse_layers = []
+        for i in range(num_branches if self.multi_scale_output else 1):
+            fuse_layer = []
+            for j in range(num_branches):
+                if j > i:
+                    fuse_layer.append(
+                        nn.Sequential(
+                            nn.Conv2d(num_inchannels[j], num_inchannels[i], 1, 1, 0, bias=False),
+                            nn.BatchNorm2d(num_inchannels[i]),
+                            nn.Upsample(scale_factor=2 ** (j - i), mode="nearest"),
+                        )
+                    )
+                elif j == i:
+                    fuse_layer.append(None)
+                else:
+                    conv3x3s = []
+                    for k in range(i - j):
+                        if k == i - j - 1:
+                            num_outchannels_conv3x3 = num_inchannels[i]
+                            conv3x3s.append(
+                                nn.Sequential(
+                                    nn.Conv2d(
+                                        num_inchannels[j],
+                                        num_outchannels_conv3x3,
+                                        3,
+                                        2,
+                                        1,
+                                        bias=False,
+                                    ),
+                                    nn.BatchNorm2d(num_outchannels_conv3x3),
+                                )
+                            )
+                        else:
+                            num_outchannels_conv3x3 = num_inchannels[j]
+                            conv3x3s.append(
+                                nn.Sequential(
+                                    nn.Conv2d(
+                                        num_inchannels[j],
+                                        num_outchannels_conv3x3,
+                                        3,
+                                        2,
+                                        1,
+                                        bias=False,
+                                    ),
+                                    nn.BatchNorm2d(num_outchannels_conv3x3),
+                                    nn.ReLU(True),
+                                )
+                            )
+                    fuse_layer.append(nn.Sequential(*conv3x3s))
+            fuse_layers.append(nn.ModuleList(fuse_layer))
+
+        return nn.ModuleList(fuse_layers)
+
+    def get_num_inchannels(self):
+        return self.num_inchannels
+
+    def forward(self, x):
+        if self.num_branches == 1:
+            return [self.branches[0](x[0])]
+
+        for i in range(self.num_branches):
+            x[i] = self.branches[i](x[i])
+
+        x_fuse = []
+
+        for i in range(len(self.fuse_layers)):
+            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
+            for j in range(1, self.num_branches):
+                if i == j:
+                    y = y + x[j]
+                else:
+                    z = self.fuse_layers[i][j](x[j])[:, :, : y.shape[2], : y.shape[3]]
+                    y = y + z
+            x_fuse.append(self.relu(y))
+
+        return x_fuse
+
+
+blocks_dict = {"BASIC": BasicBlock, "BOTTLENECK": Bottleneck}
+
+
+class PoseHigherResolutionNet(Backbone):
+    """PoseHigherResolutionNet
+    Composed of several HighResolutionModule tied together with ConvNets
+    Adapted from the GitHub version to fit with HRFPN and the Detectron2 infrastructure
+    arXiv: https://arxiv.org/abs/1908.10357
+    """
+
+    def __init__(self, cfg, **kwargs):
+        self.inplanes = cfg.MODEL.HRNET.STEM_INPLANES
+        super(PoseHigherResolutionNet, self).__init__()
+
+        # stem net
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.layer1 = self._make_layer(Bottleneck, 64, 4)
+
+        self.stage2_cfg = cfg.MODEL.HRNET.STAGE2
+        num_channels = self.stage2_cfg.NUM_CHANNELS
+        block = blocks_dict[self.stage2_cfg.BLOCK]
+        num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
+        self.transition1 = self._make_transition_layer([256], num_channels)
+        self.stage2, pre_stage_channels = self._make_stage(self.stage2_cfg, num_channels)
+
+        self.stage3_cfg = cfg.MODEL.HRNET.STAGE3
+        num_channels = self.stage3_cfg.NUM_CHANNELS
+        block = blocks_dict[self.stage3_cfg.BLOCK]
+        num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
+        self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels)
+        self.stage3, pre_stage_channels = self._make_stage(self.stage3_cfg, num_channels)
+
+        self.stage4_cfg = cfg.MODEL.HRNET.STAGE4
+        num_channels = self.stage4_cfg.NUM_CHANNELS
+        block = blocks_dict[self.stage4_cfg.BLOCK]
+        num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
+        self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels)
+        self.stage4, pre_stage_channels = self._make_stage(
+            self.stage4_cfg, num_channels, multi_scale_output=True
+        )
+
+        self._out_features = []
+        self._out_feature_channels = {}
+        self._out_feature_strides = {}
+
+        for i in range(cfg.MODEL.HRNET.STAGE4.NUM_BRANCHES):
+            self._out_features.append("p%d" % (i + 1))
+            self._out_feature_channels.update(
+                {self._out_features[-1]: cfg.MODEL.HRNET.STAGE4.NUM_CHANNELS[i]}
+            )
+            self._out_feature_strides.update({self._out_features[-1]: 1})
+
+    def _get_deconv_cfg(self, deconv_kernel):
+        if deconv_kernel == 4:
+            padding = 1
+            output_padding = 0
+        elif deconv_kernel == 3:
+            padding = 1
+            output_padding = 1
+        elif deconv_kernel == 2:
+            padding = 0
+            output_padding = 0
+
+        return deconv_kernel, padding, output_padding
+
+    def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
+        num_branches_cur = len(num_channels_cur_layer)
+        num_branches_pre = len(num_channels_pre_layer)
+
+        transition_layers = []
+        for i in range(num_branches_cur):
+            if i < num_branches_pre:
+                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
+                    transition_layers.append(
+                        nn.Sequential(
+                            nn.Conv2d(
+                                num_channels_pre_layer[i],
+                                num_channels_cur_layer[i],
+                                3,
+                                1,
+                                1,
+                                bias=False,
+                            ),
+                            nn.BatchNorm2d(num_channels_cur_layer[i]),
+                            nn.ReLU(inplace=True),
+                        )
+                    )
+                else:
+                    transition_layers.append(None)
+            else:
+                conv3x3s = []
+                for j in range(i + 1 - num_branches_pre):
+                    inchannels = num_channels_pre_layer[-1]
+                    outchannels = (
+                        num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels
+                    )
+                    conv3x3s.append(
+                        nn.Sequential(
+                            nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False),
+                            nn.BatchNorm2d(outchannels),
+                            nn.ReLU(inplace=True),
+                        )
+                    )
+                transition_layers.append(nn.Sequential(*conv3x3s))
+
+        return nn.ModuleList(transition_layers)
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def _make_stage(self, layer_config, num_inchannels, multi_scale_output=True):
+        num_modules = layer_config["NUM_MODULES"]
+        num_branches = layer_config["NUM_BRANCHES"]
+        num_blocks = layer_config["NUM_BLOCKS"]
+        num_channels = layer_config["NUM_CHANNELS"]
+        block = blocks_dict[layer_config["BLOCK"]]
+
+        modules = []
+        for i in range(num_modules):
+            # multi_scale_output is only used last module
+            if not multi_scale_output and i == num_modules - 1:
+                reset_multi_scale_output = False
+            else:
+                reset_multi_scale_output = True
+
+            modules.append(
+                HighResolutionModule(
+                    num_branches,
+                    block,
+                    num_blocks,
+                    num_inchannels,
+                    num_channels,
+                    reset_multi_scale_output,
+                )
+            )
+            num_inchannels = modules[-1].get_num_inchannels()
+
+        return nn.Sequential(*modules), num_inchannels
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu(x)
+        x = self.layer1(x)
+
+        x_list = []
+        for i in range(self.stage2_cfg.NUM_BRANCHES):
+            if self.transition1[i] is not None:
+                x_list.append(self.transition1[i](x))
+            else:
+                x_list.append(x)
+        y_list = self.stage2(x_list)
+
+        x_list = []
+        for i in range(self.stage3_cfg.NUM_BRANCHES):
+            if self.transition2[i] is not None:
+                x_list.append(self.transition2[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage3(x_list)
+
+        x_list = []
+        for i in range(self.stage4_cfg.NUM_BRANCHES):
+            if self.transition3[i] is not None:
+                x_list.append(self.transition3[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage4(x_list)
+
+        assert len(self._out_features) == len(y_list)
+        return dict(zip(self._out_features, y_list))  # final_outputs
+
+
+@BACKBONE_REGISTRY.register()
+def build_pose_hrnet_backbone(cfg, input_shape: ShapeSpec):
+    model = PoseHigherResolutionNet(cfg)
+    return model
diff --git a/densepose/modeling/inference.py b/densepose/modeling/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..81049649edddb23aeebeac4085514da838f1463b
--- /dev/null
+++ b/densepose/modeling/inference.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from dataclasses import fields
+from typing import Any, List
+import torch
+
+from detectron2.structures import Instances
+
+
+def densepose_inference(densepose_predictor_output: Any, detections: List[Instances]) -> None:
+    """
+    Splits DensePose predictor outputs into chunks, each chunk corresponds to
+    detections on one image. Predictor output chunks are stored in `pred_densepose`
+    attribute of the corresponding `Instances` object.
+
+    Args:
+        densepose_predictor_output: a dataclass instance (can be of different types,
+            depending on predictor used for inference). Each field can be `None`
+            (if the corresponding output was not inferred) or a tensor of size
+            [N, ...], where N = N_1 + N_2 + .. + N_k is a total number of
+            detections on all images, N_1 is the number of detections on image 1,
+            N_2 is the number of detections on image 2, etc.
+        detections: a list of objects of type `Instance`, k-th object corresponds
+            to detections on k-th image.
+    """
+    k = 0
+    for detection_i in detections:
+        if densepose_predictor_output is None:
+            # don't add `pred_densepose` attribute
+            continue
+        n_i = detection_i.__len__()
+
+        PredictorOutput = type(densepose_predictor_output)
+        output_i_dict = {}
+        # we assume here that `densepose_predictor_output` is a dataclass object
+        for field in fields(densepose_predictor_output):
+            field_value = getattr(densepose_predictor_output, field.name)
+            # slice tensors
+            if isinstance(field_value, torch.Tensor):
+                output_i_dict[field.name] = field_value[k : k + n_i]
+            # leave others as is
+            else:
+                output_i_dict[field.name] = field_value
+        detection_i.pred_densepose = PredictorOutput(**output_i_dict)
+        k += n_i
diff --git a/densepose/modeling/losses/__init__.py b/densepose/modeling/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5c593700e7274ea9cbaf8f4a52e8a229ef4c5a1
--- /dev/null
+++ b/densepose/modeling/losses/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .chart import DensePoseChartLoss
+from .chart_with_confidences import DensePoseChartWithConfidenceLoss
+from .cse import DensePoseCseLoss
+from .registry import DENSEPOSE_LOSS_REGISTRY
+
+
+__all__ = [
+    "DensePoseChartLoss",
+    "DensePoseChartWithConfidenceLoss",
+    "DensePoseCseLoss",
+    "DENSEPOSE_LOSS_REGISTRY",
+]
diff --git a/densepose/modeling/losses/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/losses/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66a174bf6a5a9850a8756f9a0d3957871298e23c
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/chart.cpython-39.pyc b/densepose/modeling/losses/__pycache__/chart.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..98555ca3ea1ea2ebc652692a6a2375c47fc0e89c
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/chart.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/chart_with_confidences.cpython-39.pyc b/densepose/modeling/losses/__pycache__/chart_with_confidences.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e8ff34b3999cbbd3ce25f665df84905bb5a1d37
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/chart_with_confidences.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/cse.cpython-39.pyc b/densepose/modeling/losses/__pycache__/cse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..629ce8c9536f30ae3abfcdd9c33973ddd20fefc8
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/cse.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/cycle_pix2shape.cpython-39.pyc b/densepose/modeling/losses/__pycache__/cycle_pix2shape.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7e9cb347e2a894a6d47dc160c772cd85ed7c23d
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/cycle_pix2shape.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/cycle_shape2shape.cpython-39.pyc b/densepose/modeling/losses/__pycache__/cycle_shape2shape.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d64e6c1e3799e423479f8c98259573b6ee63c3f
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/cycle_shape2shape.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/embed.cpython-39.pyc b/densepose/modeling/losses/__pycache__/embed.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e631f8b612d4843728fa17a60b2311554304577
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/embed.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/embed_utils.cpython-39.pyc b/densepose/modeling/losses/__pycache__/embed_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d7c313e0c0f905d9c24fe71cde3428df42d0bb0
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/embed_utils.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/mask.cpython-39.pyc b/densepose/modeling/losses/__pycache__/mask.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07bdfadf9e2f95cb361ed01536f8cacc2295016c
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/mask.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/mask_or_segm.cpython-39.pyc b/densepose/modeling/losses/__pycache__/mask_or_segm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b728762018c6fa3761073a4d4e933678f2a13a4e
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/mask_or_segm.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/registry.cpython-39.pyc b/densepose/modeling/losses/__pycache__/registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64f8cb0a9490b28b882dd1cc75d43530f3ffb4f3
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/registry.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/segm.cpython-39.pyc b/densepose/modeling/losses/__pycache__/segm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c14eb9e13b1d9f765d0d098650ce2027d889980a
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/segm.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/soft_embed.cpython-39.pyc b/densepose/modeling/losses/__pycache__/soft_embed.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce8da07f28c0036b146bf37e8065866e989b557d
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/soft_embed.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/utils.cpython-39.pyc b/densepose/modeling/losses/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c7c98b6e90ec45c76d663e71d12e143609481dc
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/utils.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/chart.py b/densepose/modeling/losses/chart.py
new file mode 100644
index 0000000000000000000000000000000000000000..02cdae8db3a41fc197be7fcc792c7119c7a21726
--- /dev/null
+++ b/densepose/modeling/losses/chart.py
@@ -0,0 +1,291 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any, List
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .mask_or_segm import MaskOrSegmentationLoss
+from .registry import DENSEPOSE_LOSS_REGISTRY
+from .utils import (
+    BilinearInterpolationHelper,
+    ChartBasedAnnotationsAccumulator,
+    LossDict,
+    extract_packed_annotations_from_matches,
+)
+
+
+@DENSEPOSE_LOSS_REGISTRY.register()
+class DensePoseChartLoss:
+    """
+    DensePose loss for chart-based training. A mesh is split into charts,
+    each chart is given a label (I) and parametrized by 2 coordinates referred to
+    as U and V. Ground truth consists of a number of points annotated with
+    I, U and V values and coarse segmentation S defined for all pixels of the
+    object bounding box. In some cases (see `COARSE_SEGM_TRAINED_BY_MASKS`),
+    semantic segmentation annotations can be used as ground truth inputs as well.
+
+    Estimated values are tensors:
+     * U coordinates, tensor of shape [N, C, S, S]
+     * V coordinates, tensor of shape [N, C, S, S]
+     * fine segmentation estimates, tensor of shape [N, C, S, S] with raw unnormalized
+       scores for each fine segmentation label at each location
+     * coarse segmentation estimates, tensor of shape [N, D, S, S] with raw unnormalized
+       scores for each coarse segmentation label at each location
+    where N is the number of detections, C is the number of fine segmentation
+    labels, S is the estimate size ( = width = height) and D is the number of
+    coarse segmentation channels.
+
+    The losses are:
+    * regression (smooth L1) loss for U and V coordinates
+    * cross entropy loss for fine (I) and coarse (S) segmentations
+    Each loss has an associated weight
+    """
+
+    def __init__(self, cfg: CfgNode):
+        """
+        Initialize chart-based loss from configuration options
+
+        Args:
+            cfg (CfgNode): configuration options
+        """
+        # fmt: off
+        self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
+        self.w_points     = cfg.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS
+        self.w_part       = cfg.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS
+        self.w_segm       = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS
+        self.n_segm_chan  = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+        # fmt: on
+        self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+        self.segm_loss = MaskOrSegmentationLoss(cfg)
+
+    def __call__(
+        self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any, **kwargs
+    ) -> LossDict:
+        """
+        Produce chart-based DensePose losses
+
+        Args:
+            proposals_with_gt (list of Instances): detections with associated ground truth data
+            densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
+                with estimated values; assumed to have the following attributes:
+                * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+                * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+                * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+                * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+            where N is the number of detections, C is the number of fine segmentation
+            labels, S is the estimate size ( = width = height) and D is the number of
+            coarse segmentation channels.
+
+        Return:
+            dict: str -> tensor: dict of losses with the following entries:
+             * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
+             * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
+             * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine
+                 segmentation estimates given ground truth labels;
+             * `loss_densepose_S`: cross entropy for raw unnormalized scores for coarse
+                 segmentation estimates given ground truth labels;
+        """
+        # densepose outputs are computed for all images and all bounding boxes;
+        # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
+        # the outputs will have size(0) == 3+1+2+1 == 7
+
+        if not len(proposals_with_gt):
+            return self.produce_fake_densepose_losses(densepose_predictor_outputs)
+
+        accumulator = ChartBasedAnnotationsAccumulator()
+        packed_annotations = extract_packed_annotations_from_matches(proposals_with_gt, accumulator)
+
+        # NOTE: we need to keep the same computation graph on all the GPUs to
+        # perform reduction properly. Hence even if we have no data on one
+        # of the GPUs, we still need to generate the computation graph.
+        # Add fake (zero) loss in the form Tensor.sum() * 0
+        if packed_annotations is None:
+            return self.produce_fake_densepose_losses(densepose_predictor_outputs)
+
+        h, w = densepose_predictor_outputs.u.shape[2:]
+        interpolator = BilinearInterpolationHelper.from_matches(
+            packed_annotations,
+            (h, w),
+        )
+
+        j_valid_fg = interpolator.j_valid * (  # pyre-ignore[16]
+            packed_annotations.fine_segm_labels_gt > 0
+        )
+        # pyre-fixme[6]: For 1st param expected `Tensor` but got `int`.
+        if not torch.any(j_valid_fg):
+            return self.produce_fake_densepose_losses(densepose_predictor_outputs)
+
+        losses_uv = self.produce_densepose_losses_uv(
+            proposals_with_gt,
+            densepose_predictor_outputs,
+            packed_annotations,
+            interpolator,
+            j_valid_fg,  # pyre-ignore[6]
+        )
+
+        losses_segm = self.produce_densepose_losses_segm(
+            proposals_with_gt,
+            densepose_predictor_outputs,
+            packed_annotations,
+            interpolator,
+            j_valid_fg,  # pyre-ignore[6]
+        )
+
+        return {**losses_uv, **losses_segm}
+
+    def produce_fake_densepose_losses(self, densepose_predictor_outputs: Any) -> LossDict:
+        """
+        Fake losses for fine segmentation and U/V coordinates. These are used when
+        no suitable ground truth data was found in a batch. The loss has a value 0
+        and is primarily used to construct the computation graph, so that
+        `DistributedDataParallel` has similar graphs on all GPUs and can perform
+        reduction properly.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor outputs, an object
+                of a dataclass that is assumed to have the following attributes:
+             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+        Return:
+            dict: str -> tensor: dict of losses with the following entries:
+             * `loss_densepose_U`: has value 0
+             * `loss_densepose_V`: has value 0
+             * `loss_densepose_I`: has value 0
+             * `loss_densepose_S`: has value 0
+        """
+        losses_uv = self.produce_fake_densepose_losses_uv(densepose_predictor_outputs)
+        losses_segm = self.produce_fake_densepose_losses_segm(densepose_predictor_outputs)
+        return {**losses_uv, **losses_segm}
+
+    def produce_fake_densepose_losses_uv(self, densepose_predictor_outputs: Any) -> LossDict:
+        """
+        Fake losses for U/V coordinates. These are used when no suitable ground
+        truth data was found in a batch. The loss has a value 0
+        and is primarily used to construct the computation graph, so that
+        `DistributedDataParallel` has similar graphs on all GPUs and can perform
+        reduction properly.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor outputs, an object
+                of a dataclass that is assumed to have the following attributes:
+             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+        Return:
+            dict: str -> tensor: dict of losses with the following entries:
+             * `loss_densepose_U`: has value 0
+             * `loss_densepose_V`: has value 0
+        """
+        return {
+            "loss_densepose_U": densepose_predictor_outputs.u.sum() * 0,
+            "loss_densepose_V": densepose_predictor_outputs.v.sum() * 0,
+        }
+
+    def produce_fake_densepose_losses_segm(self, densepose_predictor_outputs: Any) -> LossDict:
+        """
+        Fake losses for fine / coarse segmentation. These are used when
+        no suitable ground truth data was found in a batch. The loss has a value 0
+        and is primarily used to construct the computation graph, so that
+        `DistributedDataParallel` has similar graphs on all GPUs and can perform
+        reduction properly.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor outputs, an object
+                of a dataclass that is assumed to have the following attributes:
+             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+             * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+        Return:
+            dict: str -> tensor: dict of losses with the following entries:
+             * `loss_densepose_I`: has value 0
+             * `loss_densepose_S`: has value 0, added only if `segm_trained_by_masks` is False
+        """
+        losses = {
+            "loss_densepose_I": densepose_predictor_outputs.fine_segm.sum() * 0,
+            "loss_densepose_S": self.segm_loss.fake_value(densepose_predictor_outputs),
+        }
+        return losses
+
+    def produce_densepose_losses_uv(
+        self,
+        proposals_with_gt: List[Instances],
+        densepose_predictor_outputs: Any,
+        packed_annotations: Any,
+        interpolator: BilinearInterpolationHelper,
+        j_valid_fg: torch.Tensor,
+    ) -> LossDict:
+        """
+        Compute losses for U/V coordinates: smooth L1 loss between
+        estimated coordinates and the ground truth.
+
+        Args:
+            proposals_with_gt (list of Instances): detections with associated ground truth data
+            densepose_predictor_outputs: DensePose predictor outputs, an object
+                of a dataclass that is assumed to have the following attributes:
+             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+        Return:
+            dict: str -> tensor: dict of losses with the following entries:
+             * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
+             * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
+        """
+        u_gt = packed_annotations.u_gt[j_valid_fg]
+        u_est = interpolator.extract_at_points(densepose_predictor_outputs.u)[j_valid_fg]
+        v_gt = packed_annotations.v_gt[j_valid_fg]
+        v_est = interpolator.extract_at_points(densepose_predictor_outputs.v)[j_valid_fg]
+        return {
+            "loss_densepose_U": F.smooth_l1_loss(u_est, u_gt, reduction="sum") * self.w_points,
+            "loss_densepose_V": F.smooth_l1_loss(v_est, v_gt, reduction="sum") * self.w_points,
+        }
+
+    def produce_densepose_losses_segm(
+        self,
+        proposals_with_gt: List[Instances],
+        densepose_predictor_outputs: Any,
+        packed_annotations: Any,
+        interpolator: BilinearInterpolationHelper,
+        j_valid_fg: torch.Tensor,
+    ) -> LossDict:
+        """
+        Losses for fine / coarse segmentation: cross-entropy
+        for segmentation unnormalized scores given ground truth labels at
+        annotated points for fine segmentation and dense mask annotations
+        for coarse segmentation.
+
+        Args:
+            proposals_with_gt (list of Instances): detections with associated ground truth data
+            densepose_predictor_outputs: DensePose predictor outputs, an object
+                of a dataclass that is assumed to have the following attributes:
+             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+             * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+        Return:
+            dict: str -> tensor: dict of losses with the following entries:
+             * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine
+                 segmentation estimates given ground truth labels
+             * `loss_densepose_S`: cross entropy for raw unnormalized scores for coarse
+                 segmentation estimates given ground truth labels;
+                 may be included if coarse segmentation is only trained
+                 using DensePose ground truth; if additional supervision through
+                 instance segmentation data is performed (`segm_trained_by_masks` is True),
+                 this loss is handled by `produce_mask_losses` instead
+        """
+        fine_segm_gt = packed_annotations.fine_segm_labels_gt[
+            interpolator.j_valid  # pyre-ignore[16]
+        ]
+        fine_segm_est = interpolator.extract_at_points(
+            densepose_predictor_outputs.fine_segm,
+            slice_fine_segm=slice(None),
+            w_ylo_xlo=interpolator.w_ylo_xlo[:, None],  # pyre-ignore[16]
+            w_ylo_xhi=interpolator.w_ylo_xhi[:, None],  # pyre-ignore[16]
+            w_yhi_xlo=interpolator.w_yhi_xlo[:, None],  # pyre-ignore[16]
+            w_yhi_xhi=interpolator.w_yhi_xhi[:, None],  # pyre-ignore[16]
+        )[interpolator.j_valid, :]
+        return {
+            "loss_densepose_I": F.cross_entropy(fine_segm_est, fine_segm_gt.long()) * self.w_part,
+            "loss_densepose_S": self.segm_loss(
+                proposals_with_gt, densepose_predictor_outputs, packed_annotations
+            )
+            * self.w_segm,
+        }
diff --git a/densepose/modeling/losses/chart_with_confidences.py b/densepose/modeling/losses/chart_with_confidences.py
new file mode 100644
index 0000000000000000000000000000000000000000..78ce7c6cb02fa01f6319d088349ff4f422001839
--- /dev/null
+++ b/densepose/modeling/losses/chart_with_confidences.py
@@ -0,0 +1,209 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import math
+from typing import Any, List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .. import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType
+from .chart import DensePoseChartLoss
+from .registry import DENSEPOSE_LOSS_REGISTRY
+from .utils import BilinearInterpolationHelper, LossDict
+
+
+@DENSEPOSE_LOSS_REGISTRY.register()
+class DensePoseChartWithConfidenceLoss(DensePoseChartLoss):
+    """ """
+
+    def __init__(self, cfg: CfgNode):
+        super().__init__(cfg)
+        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
+        if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
+            self.uv_loss_with_confidences = IIDIsotropicGaussianUVLoss(
+                self.confidence_model_cfg.uv_confidence.epsilon
+            )
+        elif self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
+            self.uv_loss_with_confidences = IndepAnisotropicGaussianUVLoss(
+                self.confidence_model_cfg.uv_confidence.epsilon
+            )
+
+    def produce_fake_densepose_losses_uv(self, densepose_predictor_outputs: Any) -> LossDict:
+        """
+        Overrides fake losses for fine segmentation and U/V coordinates to
+        include computation graphs for additional confidence parameters.
+        These are used when no suitable ground truth data was found in a batch.
+        The loss has a value 0 and is primarily used to construct the computation graph,
+        so that `DistributedDataParallel` has similar graphs on all GPUs and can
+        perform reduction properly.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor outputs, an object
+                of a dataclass that is assumed to have the following attributes:
+             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+        Return:
+            dict: str -> tensor: dict of losses with the following entries:
+             * `loss_densepose_U`: has value 0
+             * `loss_densepose_V`: has value 0
+             * `loss_densepose_I`: has value 0
+        """
+        conf_type = self.confidence_model_cfg.uv_confidence.type
+        if self.confidence_model_cfg.uv_confidence.enabled:
+            loss_uv = (
+                densepose_predictor_outputs.u.sum() + densepose_predictor_outputs.v.sum()
+            ) * 0
+            if conf_type == DensePoseUVConfidenceType.IID_ISO:
+                loss_uv += densepose_predictor_outputs.sigma_2.sum() * 0
+            elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO:
+                loss_uv += (
+                    densepose_predictor_outputs.sigma_2.sum()
+                    + densepose_predictor_outputs.kappa_u.sum()
+                    + densepose_predictor_outputs.kappa_v.sum()
+                ) * 0
+            return {"loss_densepose_UV": loss_uv}
+        else:
+            return super().produce_fake_densepose_losses_uv(densepose_predictor_outputs)
+
+    def produce_densepose_losses_uv(
+        self,
+        proposals_with_gt: List[Instances],
+        densepose_predictor_outputs: Any,
+        packed_annotations: Any,
+        interpolator: BilinearInterpolationHelper,
+        j_valid_fg: torch.Tensor,
+    ) -> LossDict:
+        conf_type = self.confidence_model_cfg.uv_confidence.type
+        if self.confidence_model_cfg.uv_confidence.enabled:
+            u_gt = packed_annotations.u_gt[j_valid_fg]
+            u_est = interpolator.extract_at_points(densepose_predictor_outputs.u)[j_valid_fg]
+            v_gt = packed_annotations.v_gt[j_valid_fg]
+            v_est = interpolator.extract_at_points(densepose_predictor_outputs.v)[j_valid_fg]
+            sigma_2_est = interpolator.extract_at_points(densepose_predictor_outputs.sigma_2)[
+                j_valid_fg
+            ]
+            if conf_type == DensePoseUVConfidenceType.IID_ISO:
+                return {
+                    "loss_densepose_UV": (
+                        self.uv_loss_with_confidences(u_est, v_est, sigma_2_est, u_gt, v_gt)
+                        * self.w_points
+                    )
+                }
+            elif conf_type in [DensePoseUVConfidenceType.INDEP_ANISO]:
+                kappa_u_est = interpolator.extract_at_points(densepose_predictor_outputs.kappa_u)[
+                    j_valid_fg
+                ]
+                kappa_v_est = interpolator.extract_at_points(densepose_predictor_outputs.kappa_v)[
+                    j_valid_fg
+                ]
+                return {
+                    "loss_densepose_UV": (
+                        self.uv_loss_with_confidences(
+                            u_est, v_est, sigma_2_est, kappa_u_est, kappa_v_est, u_gt, v_gt
+                        )
+                        * self.w_points
+                    )
+                }
+        return super().produce_densepose_losses_uv(
+            proposals_with_gt,
+            densepose_predictor_outputs,
+            packed_annotations,
+            interpolator,
+            j_valid_fg,
+        )
+
+
+class IIDIsotropicGaussianUVLoss(nn.Module):
+    """
+    Loss for the case of iid residuals with isotropic covariance:
+    $Sigma_i = sigma_i^2 I$
+    The loss (negative log likelihood) is then:
+    $1/2 sum_{i=1}^n (log(2 pi) + 2 log sigma_i^2 + ||delta_i||^2 / sigma_i^2)$,
+    where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates
+    difference between estimated and ground truth UV values
+    For details, see:
+    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+    """
+
+    def __init__(self, sigma_lower_bound: float):
+        super(IIDIsotropicGaussianUVLoss, self).__init__()
+        self.sigma_lower_bound = sigma_lower_bound
+        self.log2pi = math.log(2 * math.pi)
+
+    def forward(
+        self,
+        u: torch.Tensor,
+        v: torch.Tensor,
+        sigma_u: torch.Tensor,
+        target_u: torch.Tensor,
+        target_v: torch.Tensor,
+    ):
+        # compute $\sigma_i^2$
+        # use sigma_lower_bound to avoid degenerate solution for variance
+        # (sigma -> 0)
+        sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
+        # compute \|delta_i\|^2
+        # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+        delta_t_delta = (u - target_u) ** 2 + (v - target_v) ** 2
+        # the total loss from the formula above:
+        loss = 0.5 * (self.log2pi + 2 * torch.log(sigma2) + delta_t_delta / sigma2)
+        return loss.sum()
+
+
+class IndepAnisotropicGaussianUVLoss(nn.Module):
+    """
+    Loss for the case of independent residuals with anisotropic covariances:
+    $Sigma_i = sigma_i^2 I + r_i r_i^T$
+    The loss (negative log likelihood) is then:
+    $1/2 sum_{i=1}^n (log(2 pi)
+      + log sigma_i^2 (sigma_i^2 + ||r_i||^2)
+      + ||delta_i||^2 / sigma_i^2
+      - <delta_i, r_i>^2 / (sigma_i^2 * (sigma_i^2 + ||r_i||^2)))$,
+    where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates
+    difference between estimated and ground truth UV values
+    For details, see:
+    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+    """
+
+    def __init__(self, sigma_lower_bound: float):
+        super(IndepAnisotropicGaussianUVLoss, self).__init__()
+        self.sigma_lower_bound = sigma_lower_bound
+        self.log2pi = math.log(2 * math.pi)
+
+    def forward(
+        self,
+        u: torch.Tensor,
+        v: torch.Tensor,
+        sigma_u: torch.Tensor,
+        kappa_u_est: torch.Tensor,
+        kappa_v_est: torch.Tensor,
+        target_u: torch.Tensor,
+        target_v: torch.Tensor,
+    ):
+        # compute $\sigma_i^2$
+        sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
+        # compute \|r_i\|^2
+        # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+        r_sqnorm2 = kappa_u_est**2 + kappa_v_est**2
+        delta_u = u - target_u
+        delta_v = v - target_v
+        # compute \|delta_i\|^2
+        # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+        delta_sqnorm = delta_u**2 + delta_v**2
+        delta_u_r_u = delta_u * kappa_u_est
+        delta_v_r_v = delta_v * kappa_v_est
+        # compute the scalar product <delta_i, r_i>
+        delta_r = delta_u_r_u + delta_v_r_v
+        # compute squared scalar product <delta_i, r_i>^2
+        # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+        delta_r_sqnorm = delta_r**2
+        denom2 = sigma2 * (sigma2 + r_sqnorm2)
+        loss = 0.5 * (
+            self.log2pi + torch.log(denom2) + delta_sqnorm / sigma2 - delta_r_sqnorm / denom2
+        )
+        return loss.sum()
diff --git a/densepose/modeling/losses/cse.py b/densepose/modeling/losses/cse.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd561ad518f42c769fd9a5c8517409ddc33edf6f
--- /dev/null
+++ b/densepose/modeling/losses/cse.py
@@ -0,0 +1,115 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import Any, List
+from torch import nn
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .cycle_pix2shape import PixToShapeCycleLoss
+from .cycle_shape2shape import ShapeToShapeCycleLoss
+from .embed import EmbeddingLoss
+from .embed_utils import CseAnnotationsAccumulator
+from .mask_or_segm import MaskOrSegmentationLoss
+from .registry import DENSEPOSE_LOSS_REGISTRY
+from .soft_embed import SoftEmbeddingLoss
+from .utils import BilinearInterpolationHelper, LossDict, extract_packed_annotations_from_matches
+
+
+@DENSEPOSE_LOSS_REGISTRY.register()
+class DensePoseCseLoss:
+    """ """
+
+    _EMBED_LOSS_REGISTRY = {
+        EmbeddingLoss.__name__: EmbeddingLoss,
+        SoftEmbeddingLoss.__name__: SoftEmbeddingLoss,
+    }
+
+    def __init__(self, cfg: CfgNode):
+        """
+        Initialize CSE loss from configuration options
+
+        Args:
+            cfg (CfgNode): configuration options
+        """
+        self.w_segm = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS
+        self.w_embed = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_WEIGHT
+        self.segm_loss = MaskOrSegmentationLoss(cfg)
+        self.embed_loss = DensePoseCseLoss.create_embed_loss(cfg)
+        self.do_shape2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.ENABLED
+        if self.do_shape2shape:
+            self.w_shape2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.WEIGHT
+            self.shape2shape_loss = ShapeToShapeCycleLoss(cfg)
+        self.do_pix2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.ENABLED
+        if self.do_pix2shape:
+            self.w_pix2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.WEIGHT
+            self.pix2shape_loss = PixToShapeCycleLoss(cfg)
+
+    @classmethod
+    def create_embed_loss(cls, cfg: CfgNode):
+        # registry not used here, since embedding losses are currently local
+        # and are not used anywhere else
+        return cls._EMBED_LOSS_REGISTRY[cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_NAME](cfg)
+
+    def __call__(
+        self,
+        proposals_with_gt: List[Instances],
+        densepose_predictor_outputs: Any,
+        embedder: nn.Module,
+    ) -> LossDict:
+        if not len(proposals_with_gt):
+            return self.produce_fake_losses(densepose_predictor_outputs, embedder)
+        accumulator = CseAnnotationsAccumulator()
+        packed_annotations = extract_packed_annotations_from_matches(proposals_with_gt, accumulator)
+        if packed_annotations is None:
+            return self.produce_fake_losses(densepose_predictor_outputs, embedder)
+        h, w = densepose_predictor_outputs.embedding.shape[2:]
+        interpolator = BilinearInterpolationHelper.from_matches(
+            packed_annotations,
+            (h, w),
+        )
+        meshid_to_embed_losses = self.embed_loss(
+            proposals_with_gt,
+            densepose_predictor_outputs,
+            packed_annotations,
+            interpolator,
+            embedder,
+        )
+        embed_loss_dict = {
+            f"loss_densepose_E{meshid}": self.w_embed * meshid_to_embed_losses[meshid]
+            for meshid in meshid_to_embed_losses
+        }
+        all_loss_dict = {
+            "loss_densepose_S": self.w_segm
+            * self.segm_loss(proposals_with_gt, densepose_predictor_outputs, packed_annotations),
+            **embed_loss_dict,
+        }
+        if self.do_shape2shape:
+            all_loss_dict["loss_shape2shape"] = self.w_shape2shape * self.shape2shape_loss(embedder)
+        if self.do_pix2shape:
+            all_loss_dict["loss_pix2shape"] = self.w_pix2shape * self.pix2shape_loss(
+                proposals_with_gt, densepose_predictor_outputs, packed_annotations, embedder
+            )
+        return all_loss_dict
+
+    def produce_fake_losses(
+        self, densepose_predictor_outputs: Any, embedder: nn.Module
+    ) -> LossDict:
+        meshname_to_embed_losses = self.embed_loss.fake_values(
+            densepose_predictor_outputs, embedder=embedder
+        )
+        embed_loss_dict = {
+            f"loss_densepose_E{mesh_name}": meshname_to_embed_losses[mesh_name]
+            for mesh_name in meshname_to_embed_losses
+        }
+        all_loss_dict = {
+            "loss_densepose_S": self.segm_loss.fake_value(densepose_predictor_outputs),
+            **embed_loss_dict,
+        }
+        if self.do_shape2shape:
+            all_loss_dict["loss_shape2shape"] = self.shape2shape_loss.fake_value(embedder)
+        if self.do_pix2shape:
+            all_loss_dict["loss_pix2shape"] = self.pix2shape_loss.fake_value(
+                densepose_predictor_outputs, embedder
+            )
+        return all_loss_dict
diff --git a/densepose/modeling/losses/cycle_pix2shape.py b/densepose/modeling/losses/cycle_pix2shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..e305d29850ef04a712a0a3e7bdbffba887257777
--- /dev/null
+++ b/densepose/modeling/losses/cycle_pix2shape.py
@@ -0,0 +1,152 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import Any, List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.data.meshes.catalog import MeshCatalog
+from densepose.modeling.cse.utils import normalize_embeddings, squared_euclidean_distance_matrix
+
+from .embed_utils import PackedCseAnnotations
+from .mask import extract_data_for_mask_loss_from_matches
+
+
+def _create_pixel_dist_matrix(grid_size: int) -> torch.Tensor:
+    rows = torch.arange(grid_size)
+    cols = torch.arange(grid_size)
+    # at index `i` contains [row, col], where
+    # row = i // grid_size
+    # col = i % grid_size
+    pix_coords = (
+        torch.stack(torch.meshgrid(rows, cols), -1).reshape((grid_size * grid_size, 2)).float()
+    )
+    return squared_euclidean_distance_matrix(pix_coords, pix_coords)
+
+
+def _sample_fg_pixels_randperm(fg_mask: torch.Tensor, sample_size: int) -> torch.Tensor:
+    fg_mask_flattened = fg_mask.reshape((-1,))
+    num_pixels = int(fg_mask_flattened.sum().item())
+    fg_pixel_indices = fg_mask_flattened.nonzero(as_tuple=True)[0]
+    if (sample_size <= 0) or (num_pixels <= sample_size):
+        return fg_pixel_indices
+    sample_indices = torch.randperm(num_pixels, device=fg_mask.device)[:sample_size]
+    return fg_pixel_indices[sample_indices]
+
+
+def _sample_fg_pixels_multinomial(fg_mask: torch.Tensor, sample_size: int) -> torch.Tensor:
+    fg_mask_flattened = fg_mask.reshape((-1,))
+    num_pixels = int(fg_mask_flattened.sum().item())
+    if (sample_size <= 0) or (num_pixels <= sample_size):
+        return fg_mask_flattened.nonzero(as_tuple=True)[0]
+    return fg_mask_flattened.float().multinomial(sample_size, replacement=False)
+
+
+class PixToShapeCycleLoss(nn.Module):
+    """
+    Cycle loss for pixel-vertex correspondence
+    """
+
+    def __init__(self, cfg: CfgNode):
+        super().__init__()
+        self.shape_names = list(cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS.keys())
+        self.embed_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+        self.norm_p = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NORM_P
+        self.use_all_meshes_not_gt_only = (
+            cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.USE_ALL_MESHES_NOT_GT_ONLY
+        )
+        self.num_pixels_to_sample = (
+            cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NUM_PIXELS_TO_SAMPLE
+        )
+        self.pix_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.PIXEL_SIGMA
+        self.temperature_pix_to_vertex = (
+            cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_PIXEL_TO_VERTEX
+        )
+        self.temperature_vertex_to_pix = (
+            cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_VERTEX_TO_PIXEL
+        )
+        self.pixel_dists = _create_pixel_dist_matrix(cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE)
+
+    def forward(
+        self,
+        proposals_with_gt: List[Instances],
+        densepose_predictor_outputs: Any,
+        packed_annotations: PackedCseAnnotations,
+        embedder: nn.Module,
+    ):
+        """
+        Args:
+            proposals_with_gt (list of Instances): detections with associated
+                ground truth data; each item corresponds to instances detected
+                on 1 image; the number of items corresponds to the number of
+                images in a batch
+            densepose_predictor_outputs: an object of a dataclass that contains predictor
+                outputs with estimated values; assumed to have the following attributes:
+                * embedding - embedding estimates, tensor of shape [N, D, S, S], where
+                  N = number of instances (= sum N_i, where N_i is the number of
+                      instances on image i)
+                  D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)
+                  S = output size (width and height)
+            packed_annotations (PackedCseAnnotations): contains various data useful
+                for loss computation, each data is packed into a single tensor
+            embedder (nn.Module): module that computes vertex embeddings for different meshes
+        """
+        pix_embeds = densepose_predictor_outputs.embedding
+        if self.pixel_dists.device != pix_embeds.device:
+            # should normally be done only once
+            self.pixel_dists = self.pixel_dists.to(device=pix_embeds.device)
+        with torch.no_grad():
+            mask_loss_data = extract_data_for_mask_loss_from_matches(
+                proposals_with_gt, densepose_predictor_outputs.coarse_segm
+            )
+        # GT masks - tensor of shape [N, S, S] of int64
+        masks_gt = mask_loss_data.masks_gt.long()  # pyre-ignore[16]
+        assert len(pix_embeds) == len(masks_gt), (
+            f"Number of instances with embeddings {len(pix_embeds)} != "
+            f"number of instances with GT masks {len(masks_gt)}"
+        )
+        losses = []
+        mesh_names = (
+            self.shape_names
+            if self.use_all_meshes_not_gt_only
+            else [
+                MeshCatalog.get_mesh_name(mesh_id.item())
+                for mesh_id in packed_annotations.vertex_mesh_ids_gt.unique()
+            ]
+        )
+        for pixel_embeddings, mask_gt in zip(pix_embeds, masks_gt):
+            # pixel_embeddings [D, S, S]
+            # mask_gt [S, S]
+            for mesh_name in mesh_names:
+                mesh_vertex_embeddings = embedder(mesh_name)
+                # pixel indices [M]
+                pixel_indices_flattened = _sample_fg_pixels_randperm(
+                    mask_gt, self.num_pixels_to_sample
+                )
+                # pixel distances [M, M]
+                pixel_dists = self.pixel_dists.to(pixel_embeddings.device)[
+                    torch.meshgrid(pixel_indices_flattened, pixel_indices_flattened)
+                ]
+                # pixel embeddings [M, D]
+                pixel_embeddings_sampled = normalize_embeddings(
+                    pixel_embeddings.reshape((self.embed_size, -1))[:, pixel_indices_flattened].T
+                )
+                # pixel-vertex similarity [M, K]
+                sim_matrix = pixel_embeddings_sampled.mm(mesh_vertex_embeddings.T)
+                c_pix_vertex = F.softmax(sim_matrix / self.temperature_pix_to_vertex, dim=1)
+                c_vertex_pix = F.softmax(sim_matrix.T / self.temperature_vertex_to_pix, dim=1)
+                c_cycle = c_pix_vertex.mm(c_vertex_pix)
+                loss_cycle = torch.norm(pixel_dists * c_cycle, p=self.norm_p)
+                losses.append(loss_cycle)
+
+        if len(losses) == 0:
+            return pix_embeds.sum() * 0
+        return torch.stack(losses, dim=0).mean()
+
+    def fake_value(self, densepose_predictor_outputs: Any, embedder: nn.Module):
+        losses = [embedder(mesh_name).sum() * 0 for mesh_name in embedder.mesh_names]
+        losses.append(densepose_predictor_outputs.embedding.sum() * 0)
+        return torch.mean(torch.stack(losses))
diff --git a/densepose/modeling/losses/cycle_shape2shape.py b/densepose/modeling/losses/cycle_shape2shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..f71dbab7c5bd7484cd9001c3c15059971ff0f0cf
--- /dev/null
+++ b/densepose/modeling/losses/cycle_shape2shape.py
@@ -0,0 +1,117 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import random
+from typing import Tuple
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+
+from densepose.structures.mesh import create_mesh
+
+from .utils import sample_random_indices
+
+
+class ShapeToShapeCycleLoss(nn.Module):
+    """
+    Cycle Loss for Shapes.
+    Inspired by:
+    "Mapping in a Cycle: Sinkhorn Regularized Unsupervised Learning for Point Cloud Shapes".
+    """
+
+    def __init__(self, cfg: CfgNode):
+        super().__init__()
+        self.shape_names = list(cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS.keys())
+        self.all_shape_pairs = [
+            (x, y) for i, x in enumerate(self.shape_names) for y in self.shape_names[i + 1 :]
+        ]
+        random.shuffle(self.all_shape_pairs)
+        self.cur_pos = 0
+        self.norm_p = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.NORM_P
+        self.temperature = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.TEMPERATURE
+        self.max_num_vertices = (
+            cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.MAX_NUM_VERTICES
+        )
+
+    def _sample_random_pair(self) -> Tuple[str, str]:
+        """
+        Produce a random pair of different mesh names
+
+        Return:
+            tuple(str, str): a pair of different mesh names
+        """
+        if self.cur_pos >= len(self.all_shape_pairs):
+            random.shuffle(self.all_shape_pairs)
+            self.cur_pos = 0
+        shape_pair = self.all_shape_pairs[self.cur_pos]
+        self.cur_pos += 1
+        return shape_pair
+
+    def forward(self, embedder: nn.Module):
+        """
+        Do a forward pass with a random pair (src, dst) pair of shapes
+        Args:
+            embedder (nn.Module): module that computes vertex embeddings for different meshes
+        """
+        src_mesh_name, dst_mesh_name = self._sample_random_pair()
+        return self._forward_one_pair(embedder, src_mesh_name, dst_mesh_name)
+
+    def fake_value(self, embedder: nn.Module):
+        losses = []
+        for mesh_name in embedder.mesh_names:
+            losses.append(embedder(mesh_name).sum() * 0)
+        return torch.mean(torch.stack(losses))
+
+    def _get_embeddings_and_geodists_for_mesh(
+        self, embedder: nn.Module, mesh_name: str
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Produces embeddings and geodesic distance tensors for a given mesh. May subsample
+        the mesh, if it contains too many vertices (controlled by
+        SHAPE_CYCLE_LOSS_MAX_NUM_VERTICES parameter).
+        Args:
+            embedder (nn.Module): module that computes embeddings for mesh vertices
+            mesh_name (str): mesh name
+        Return:
+            embeddings (torch.Tensor of size [N, D]): embeddings for selected mesh
+                vertices (N = number of selected vertices, D = embedding space dim)
+            geodists (torch.Tensor of size [N, N]): geodesic distances for the selected
+                mesh vertices (N = number of selected vertices)
+        """
+        embeddings = embedder(mesh_name)
+        indices = sample_random_indices(
+            embeddings.shape[0], self.max_num_vertices, embeddings.device
+        )
+        mesh = create_mesh(mesh_name, embeddings.device)
+        geodists = mesh.geodists
+        if indices is not None:
+            embeddings = embeddings[indices]
+            geodists = geodists[torch.meshgrid(indices, indices)]
+        return embeddings, geodists
+
+    def _forward_one_pair(
+        self, embedder: nn.Module, mesh_name_1: str, mesh_name_2: str
+    ) -> torch.Tensor:
+        """
+        Do a forward pass with a selected pair of meshes
+        Args:
+            embedder (nn.Module): module that computes vertex embeddings for different meshes
+            mesh_name_1 (str): first mesh name
+            mesh_name_2 (str): second mesh name
+        Return:
+            Tensor containing the loss value
+        """
+        embeddings_1, geodists_1 = self._get_embeddings_and_geodists_for_mesh(embedder, mesh_name_1)
+        embeddings_2, geodists_2 = self._get_embeddings_and_geodists_for_mesh(embedder, mesh_name_2)
+        sim_matrix_12 = embeddings_1.mm(embeddings_2.T)
+
+        c_12 = F.softmax(sim_matrix_12 / self.temperature, dim=1)
+        c_21 = F.softmax(sim_matrix_12.T / self.temperature, dim=1)
+        c_11 = c_12.mm(c_21)
+        c_22 = c_21.mm(c_12)
+
+        loss_cycle_11 = torch.norm(geodists_1 * c_11, p=self.norm_p)
+        loss_cycle_22 = torch.norm(geodists_2 * c_22, p=self.norm_p)
+
+        return loss_cycle_11 + loss_cycle_22
diff --git a/densepose/modeling/losses/embed.py b/densepose/modeling/losses/embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e3a069763ca6fab0acc7c455b416b9634ceaedf
--- /dev/null
+++ b/densepose/modeling/losses/embed.py
@@ -0,0 +1,119 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import Any, Dict, List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.data.meshes.catalog import MeshCatalog
+from densepose.modeling.cse.utils import normalize_embeddings, squared_euclidean_distance_matrix
+
+from .embed_utils import PackedCseAnnotations
+from .utils import BilinearInterpolationHelper
+
+
+class EmbeddingLoss:
+    """
+    Computes losses for estimated embeddings given annotated vertices.
+    Instances in a minibatch that correspond to the same mesh are grouped
+    together. For each group, loss is computed as cross-entropy for
+    unnormalized scores given ground truth mesh vertex ids.
+    Scores are based on squared distances between estimated vertex embeddings
+    and mesh vertex embeddings.
+    """
+
+    def __init__(self, cfg: CfgNode):
+        """
+        Initialize embedding loss from config
+        """
+        self.embdist_gauss_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_DIST_GAUSS_SIGMA
+
+    def __call__(
+        self,
+        proposals_with_gt: List[Instances],
+        densepose_predictor_outputs: Any,
+        packed_annotations: PackedCseAnnotations,
+        interpolator: BilinearInterpolationHelper,
+        embedder: nn.Module,
+    ) -> Dict[int, torch.Tensor]:
+        """
+        Produces losses for estimated embeddings given annotated vertices.
+        Embeddings for all the vertices of a mesh are computed by the embedder.
+        Embeddings for observed pixels are estimated by a predictor.
+        Losses are computed as cross-entropy for squared distances between
+        observed vertex embeddings and all mesh vertex embeddings given
+        ground truth vertex IDs.
+
+        Args:
+            proposals_with_gt (list of Instances): detections with associated
+                ground truth data; each item corresponds to instances detected
+                on 1 image; the number of items corresponds to the number of
+                images in a batch
+            densepose_predictor_outputs: an object of a dataclass that contains predictor
+                outputs with estimated values; assumed to have the following attributes:
+                * embedding - embedding estimates, tensor of shape [N, D, S, S], where
+                  N = number of instances (= sum N_i, where N_i is the number of
+                      instances on image i)
+                  D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)
+                  S = output size (width and height)
+            packed_annotations (PackedCseAnnotations): contains various data useful
+                for loss computation, each data is packed into a single tensor
+            interpolator (BilinearInterpolationHelper): bilinear interpolation helper
+            embedder (nn.Module): module that computes vertex embeddings for different meshes
+        Return:
+            dict(int -> tensor): losses for different mesh IDs
+        """
+        losses = {}
+        for mesh_id_tensor in packed_annotations.vertex_mesh_ids_gt.unique():
+            mesh_id = mesh_id_tensor.item()
+            mesh_name = MeshCatalog.get_mesh_name(mesh_id)
+            # valid points are those that fall into estimated bbox
+            # and correspond to the current mesh
+            j_valid = interpolator.j_valid * (  # pyre-ignore[16]
+                packed_annotations.vertex_mesh_ids_gt == mesh_id
+            )
+            if not torch.any(j_valid):
+                continue
+            # extract estimated embeddings for valid points
+            # -> tensor [J, D]
+            vertex_embeddings_i = normalize_embeddings(
+                interpolator.extract_at_points(
+                    densepose_predictor_outputs.embedding,
+                    slice_fine_segm=slice(None),
+                    w_ylo_xlo=interpolator.w_ylo_xlo[:, None],  # pyre-ignore[16]
+                    w_ylo_xhi=interpolator.w_ylo_xhi[:, None],  # pyre-ignore[16]
+                    w_yhi_xlo=interpolator.w_yhi_xlo[:, None],  # pyre-ignore[16]
+                    w_yhi_xhi=interpolator.w_yhi_xhi[:, None],  # pyre-ignore[16]
+                )[j_valid, :]
+            )
+            # extract vertex ids for valid points
+            # -> tensor [J]
+            vertex_indices_i = packed_annotations.vertex_ids_gt[j_valid]
+            # embeddings for all mesh vertices
+            # -> tensor [K, D]
+            mesh_vertex_embeddings = embedder(mesh_name)
+            # unnormalized scores for valid points
+            # -> tensor [J, K]
+            scores = squared_euclidean_distance_matrix(
+                vertex_embeddings_i, mesh_vertex_embeddings
+            ) / (-self.embdist_gauss_sigma)
+            losses[mesh_name] = F.cross_entropy(scores, vertex_indices_i, ignore_index=-1)
+
+        for mesh_name in embedder.mesh_names:
+            if mesh_name not in losses:
+                losses[mesh_name] = self.fake_value(
+                    densepose_predictor_outputs, embedder, mesh_name
+                )
+        return losses
+
+    def fake_values(self, densepose_predictor_outputs: Any, embedder: nn.Module):
+        losses = {}
+        for mesh_name in embedder.mesh_names:
+            losses[mesh_name] = self.fake_value(densepose_predictor_outputs, embedder, mesh_name)
+        return losses
+
+    def fake_value(self, densepose_predictor_outputs: Any, embedder: nn.Module, mesh_name: str):
+        return densepose_predictor_outputs.embedding.sum() * 0 + embedder(mesh_name).sum() * 0
diff --git a/densepose/modeling/losses/embed_utils.py b/densepose/modeling/losses/embed_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2ca16fd3809b89e1c05636242a84d02d3a42d88
--- /dev/null
+++ b/densepose/modeling/losses/embed_utils.py
@@ -0,0 +1,137 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from dataclasses import dataclass
+from typing import Any, Optional
+import torch
+
+from detectron2.structures import BoxMode, Instances
+
+from .utils import AnnotationsAccumulator
+
+
+@dataclass
+class PackedCseAnnotations:
+    x_gt: torch.Tensor
+    y_gt: torch.Tensor
+    coarse_segm_gt: Optional[torch.Tensor]
+    vertex_mesh_ids_gt: torch.Tensor
+    vertex_ids_gt: torch.Tensor
+    bbox_xywh_gt: torch.Tensor
+    bbox_xywh_est: torch.Tensor
+    point_bbox_with_dp_indices: torch.Tensor
+    point_bbox_indices: torch.Tensor
+    bbox_indices: torch.Tensor
+
+
+class CseAnnotationsAccumulator(AnnotationsAccumulator):
+    """
+    Accumulates annotations by batches that correspond to objects detected on
+    individual images. Can pack them together into single tensors.
+    """
+
+    def __init__(self):
+        self.x_gt = []
+        self.y_gt = []
+        self.s_gt = []
+        self.vertex_mesh_ids_gt = []
+        self.vertex_ids_gt = []
+        self.bbox_xywh_gt = []
+        self.bbox_xywh_est = []
+        self.point_bbox_with_dp_indices = []
+        self.point_bbox_indices = []
+        self.bbox_indices = []
+        self.nxt_bbox_with_dp_index = 0
+        self.nxt_bbox_index = 0
+
+    def accumulate(self, instances_one_image: Instances):
+        """
+        Accumulate instances data for one image
+
+        Args:
+            instances_one_image (Instances): instances data to accumulate
+        """
+        boxes_xywh_est = BoxMode.convert(
+            instances_one_image.proposal_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+        )
+        boxes_xywh_gt = BoxMode.convert(
+            instances_one_image.gt_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+        )
+        n_matches = len(boxes_xywh_gt)
+        assert n_matches == len(
+            boxes_xywh_est
+        ), f"Got {len(boxes_xywh_est)} proposal boxes and {len(boxes_xywh_gt)} GT boxes"
+        if not n_matches:
+            # no detection - GT matches
+            return
+        if (
+            not hasattr(instances_one_image, "gt_densepose")
+            or instances_one_image.gt_densepose is None
+        ):
+            # no densepose GT for the detections, just increase the bbox index
+            self.nxt_bbox_index += n_matches
+            return
+        for box_xywh_est, box_xywh_gt, dp_gt in zip(
+            boxes_xywh_est, boxes_xywh_gt, instances_one_image.gt_densepose
+        ):
+            if (dp_gt is not None) and (len(dp_gt.x) > 0):
+                # pyre-fixme[6]: For 1st argument expected `Tensor` but got `float`.
+                # pyre-fixme[6]: For 2nd argument expected `Tensor` but got `float`.
+                self._do_accumulate(box_xywh_gt, box_xywh_est, dp_gt)
+            self.nxt_bbox_index += 1
+
+    def _do_accumulate(self, box_xywh_gt: torch.Tensor, box_xywh_est: torch.Tensor, dp_gt: Any):
+        """
+        Accumulate instances data for one image, given that the data is not empty
+
+        Args:
+            box_xywh_gt (tensor): GT bounding box
+            box_xywh_est (tensor): estimated bounding box
+            dp_gt: GT densepose data with the following attributes:
+             - x: normalized X coordinates
+             - y: normalized Y coordinates
+             - segm: tensor of size [S, S] with coarse segmentation
+             -
+        """
+        self.x_gt.append(dp_gt.x)
+        self.y_gt.append(dp_gt.y)
+        if hasattr(dp_gt, "segm"):
+            self.s_gt.append(dp_gt.segm.unsqueeze(0))
+        self.vertex_ids_gt.append(dp_gt.vertex_ids)
+        self.vertex_mesh_ids_gt.append(torch.full_like(dp_gt.vertex_ids, dp_gt.mesh_id))
+        self.bbox_xywh_gt.append(box_xywh_gt.view(-1, 4))
+        self.bbox_xywh_est.append(box_xywh_est.view(-1, 4))
+        self.point_bbox_with_dp_indices.append(
+            torch.full_like(dp_gt.vertex_ids, self.nxt_bbox_with_dp_index)
+        )
+        self.point_bbox_indices.append(torch.full_like(dp_gt.vertex_ids, self.nxt_bbox_index))
+        self.bbox_indices.append(self.nxt_bbox_index)
+        self.nxt_bbox_with_dp_index += 1
+
+    def pack(self) -> Optional[PackedCseAnnotations]:
+        """
+        Pack data into tensors
+        """
+        if not len(self.x_gt):
+            # TODO:
+            # returning proper empty annotations would require
+            # creating empty tensors of appropriate shape and
+            # type on an appropriate device;
+            # we return None so far to indicate empty annotations
+            return None
+        return PackedCseAnnotations(
+            x_gt=torch.cat(self.x_gt, 0),
+            y_gt=torch.cat(self.y_gt, 0),
+            vertex_mesh_ids_gt=torch.cat(self.vertex_mesh_ids_gt, 0),
+            vertex_ids_gt=torch.cat(self.vertex_ids_gt, 0),
+            # ignore segmentation annotations, if not all the instances contain those
+            coarse_segm_gt=torch.cat(self.s_gt, 0)
+            if len(self.s_gt) == len(self.bbox_xywh_gt)
+            else None,
+            bbox_xywh_gt=torch.cat(self.bbox_xywh_gt, 0),
+            bbox_xywh_est=torch.cat(self.bbox_xywh_est, 0),
+            point_bbox_with_dp_indices=torch.cat(self.point_bbox_with_dp_indices, 0),
+            point_bbox_indices=torch.cat(self.point_bbox_indices, 0),
+            bbox_indices=torch.as_tensor(
+                self.bbox_indices, dtype=torch.long, device=self.x_gt[0].device
+            ),
+        )
diff --git a/densepose/modeling/losses/mask.py b/densepose/modeling/losses/mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..c16b15c53de9f02dc734148e05f2bde799046aa0
--- /dev/null
+++ b/densepose/modeling/losses/mask.py
@@ -0,0 +1,125 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from dataclasses import dataclass
+from typing import Any, Iterable, List, Optional
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures import Instances
+
+
+@dataclass
+class DataForMaskLoss:
+    """
+    Contains mask GT and estimated data for proposals from multiple images:
+    """
+
+    # tensor of size (K, H, W) containing GT labels
+    masks_gt: Optional[torch.Tensor] = None
+    # tensor of size (K, C, H, W) containing estimated scores
+    masks_est: Optional[torch.Tensor] = None
+
+
+def extract_data_for_mask_loss_from_matches(
+    proposals_targets: Iterable[Instances], estimated_segm: torch.Tensor
+) -> DataForMaskLoss:
+    """
+    Extract data for mask loss from instances that contain matched GT and
+    estimated bounding boxes.
+    Args:
+        proposals_targets: Iterable[Instances]
+            matched GT and estimated results, each item in the iterable
+            corresponds to data in 1 image
+        estimated_segm: tensor(K, C, S, S) of float - raw unnormalized
+            segmentation scores, here S is the size to which GT masks are
+            to be resized
+    Return:
+        masks_est: tensor(K, C, S, S) of float - class scores
+        masks_gt: tensor(K, S, S) of int64 - labels
+    """
+    data = DataForMaskLoss()
+    masks_gt = []
+    offset = 0
+    assert estimated_segm.shape[2] == estimated_segm.shape[3], (
+        f"Expected estimated segmentation to have a square shape, "
+        f"but the actual shape is {estimated_segm.shape[2:]}"
+    )
+    mask_size = estimated_segm.shape[2]
+    num_proposals = sum(inst.proposal_boxes.tensor.size(0) for inst in proposals_targets)
+    num_estimated = estimated_segm.shape[0]
+    assert (
+        num_proposals == num_estimated
+    ), "The number of proposals {} must be equal to the number of estimates {}".format(
+        num_proposals, num_estimated
+    )
+
+    for proposals_targets_per_image in proposals_targets:
+        n_i = proposals_targets_per_image.proposal_boxes.tensor.size(0)
+        if not n_i:
+            continue
+        gt_masks_per_image = proposals_targets_per_image.gt_masks.crop_and_resize(
+            proposals_targets_per_image.proposal_boxes.tensor, mask_size
+        ).to(device=estimated_segm.device)
+        masks_gt.append(gt_masks_per_image)
+        offset += n_i
+    if masks_gt:
+        data.masks_est = estimated_segm
+        data.masks_gt = torch.cat(masks_gt, dim=0)
+    return data
+
+
+class MaskLoss:
+    """
+    Mask loss as cross-entropy for raw unnormalized scores given ground truth labels.
+    Mask ground truth labels are defined for the whole image and not only the
+    bounding box of interest. They are stored as objects that are assumed to implement
+    the `crop_and_resize` interface (e.g. BitMasks, PolygonMasks).
+    """
+
+    def __call__(
+        self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any
+    ) -> torch.Tensor:
+        """
+        Computes segmentation loss as cross-entropy for raw unnormalized
+        scores given ground truth labels.
+
+        Args:
+            proposals_with_gt (list of Instances): detections with associated ground truth data
+            densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
+                with estimated values; assumed to have the following attribute:
+                * coarse_segm (tensor of shape [N, D, S, S]): coarse segmentation estimates
+                    as raw unnormalized scores
+                where N is the number of detections, S is the estimate size ( = width = height)
+                and D is the number of coarse segmentation channels.
+        Return:
+            Cross entropy for raw unnormalized scores for coarse segmentation given
+            ground truth labels from masks
+        """
+        if not len(proposals_with_gt):
+            return self.fake_value(densepose_predictor_outputs)
+        # densepose outputs are computed for all images and all bounding boxes;
+        # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
+        # the outputs will have size(0) == 3+1+2+1 == 7
+        with torch.no_grad():
+            mask_loss_data = extract_data_for_mask_loss_from_matches(
+                proposals_with_gt, densepose_predictor_outputs.coarse_segm
+            )
+        if (mask_loss_data.masks_gt is None) or (mask_loss_data.masks_est is None):
+            return self.fake_value(densepose_predictor_outputs)
+        return F.cross_entropy(mask_loss_data.masks_est, mask_loss_data.masks_gt.long())
+
+    def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor:
+        """
+        Fake segmentation loss used when no suitable ground truth data
+        was found in a batch. The loss has a value 0 and is primarily used to
+        construct the computation graph, so that `DistributedDataParallel`
+        has similar graphs on all GPUs and can perform reduction properly.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor outputs, an object
+                of a dataclass that is assumed to have `coarse_segm`
+                attribute
+        Return:
+            Zero value loss with proper computation graph
+        """
+        return densepose_predictor_outputs.coarse_segm.sum() * 0
diff --git a/densepose/modeling/losses/mask_or_segm.py b/densepose/modeling/losses/mask_or_segm.py
new file mode 100644
index 0000000000000000000000000000000000000000..98b773d99fd29a48cbdfa94c5882c9c3d94003ee
--- /dev/null
+++ b/densepose/modeling/losses/mask_or_segm.py
@@ -0,0 +1,72 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import Any, List
+import torch
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .mask import MaskLoss
+from .segm import SegmentationLoss
+
+
+class MaskOrSegmentationLoss:
+    """
+    Mask or segmentation loss as cross-entropy for raw unnormalized scores
+    given ground truth labels. Ground truth labels are either defined by coarse
+    segmentation annotation, or by mask annotation, depending on the config
+    value MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+    """
+
+    def __init__(self, cfg: CfgNode):
+        """
+        Initialize segmentation loss from configuration options
+
+        Args:
+            cfg (CfgNode): configuration options
+        """
+        self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+        if self.segm_trained_by_masks:
+            self.mask_loss = MaskLoss()
+        self.segm_loss = SegmentationLoss(cfg)
+
+    def __call__(
+        self,
+        proposals_with_gt: List[Instances],
+        densepose_predictor_outputs: Any,
+        packed_annotations: Any,
+    ) -> torch.Tensor:
+        """
+        Compute segmentation loss as cross-entropy between aligned unnormalized
+        score estimates and ground truth; with ground truth given
+        either by masks, or by coarse segmentation annotations.
+
+        Args:
+            proposals_with_gt (list of Instances): detections with associated ground truth data
+            densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
+                with estimated values; assumed to have the following attributes:
+                * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+            packed_annotations: packed annotations for efficient loss computation
+        Return:
+            tensor: loss value as cross-entropy for raw unnormalized scores
+                given ground truth labels
+        """
+        if self.segm_trained_by_masks:
+            return self.mask_loss(proposals_with_gt, densepose_predictor_outputs)
+        return self.segm_loss(proposals_with_gt, densepose_predictor_outputs, packed_annotations)
+
+    def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor:
+        """
+        Fake segmentation loss used when no suitable ground truth data
+        was found in a batch. The loss has a value 0 and is primarily used to
+        construct the computation graph, so that `DistributedDataParallel`
+        has similar graphs on all GPUs and can perform reduction properly.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor outputs, an object
+                of a dataclass that is assumed to have `coarse_segm`
+                attribute
+        Return:
+            Zero value loss with proper computation graph
+        """
+        return densepose_predictor_outputs.coarse_segm.sum() * 0
diff --git a/densepose/modeling/losses/registry.py b/densepose/modeling/losses/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9c8817a743e42b2aec382818f0cc1bb39a66004
--- /dev/null
+++ b/densepose/modeling/losses/registry.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from detectron2.utils.registry import Registry
+
+DENSEPOSE_LOSS_REGISTRY = Registry("DENSEPOSE_LOSS")
diff --git a/densepose/modeling/losses/segm.py b/densepose/modeling/losses/segm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1962b886e1946fa4896776da8a007ae0a9a4fab3
--- /dev/null
+++ b/densepose/modeling/losses/segm.py
@@ -0,0 +1,83 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import Any, List
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .utils import resample_data
+
+
+class SegmentationLoss:
+    """
+    Segmentation loss as cross-entropy for raw unnormalized scores given ground truth
+    labels. Segmentation ground truth labels are defined for the bounding box of
+    interest at some fixed resolution [S, S], where
+        S = MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE.
+    """
+
+    def __init__(self, cfg: CfgNode):
+        """
+        Initialize segmentation loss from configuration options
+
+        Args:
+            cfg (CfgNode): configuration options
+        """
+        self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
+        self.n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+
+    def __call__(
+        self,
+        proposals_with_gt: List[Instances],
+        densepose_predictor_outputs: Any,
+        packed_annotations: Any,
+    ) -> torch.Tensor:
+        """
+        Compute segmentation loss as cross-entropy on aligned segmentation
+        ground truth and estimated scores.
+
+        Args:
+            proposals_with_gt (list of Instances): detections with associated ground truth data
+            densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
+                with estimated values; assumed to have the following attributes:
+                * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+            packed_annotations: packed annotations for efficient loss computation;
+                the following attributes are used:
+                 - coarse_segm_gt
+                 - bbox_xywh_gt
+                 - bbox_xywh_est
+        """
+        if packed_annotations.coarse_segm_gt is None:
+            return self.fake_value(densepose_predictor_outputs)
+        coarse_segm_est = densepose_predictor_outputs.coarse_segm[packed_annotations.bbox_indices]
+        with torch.no_grad():
+            coarse_segm_gt = resample_data(
+                packed_annotations.coarse_segm_gt.unsqueeze(1),
+                packed_annotations.bbox_xywh_gt,
+                packed_annotations.bbox_xywh_est,
+                self.heatmap_size,
+                self.heatmap_size,
+                mode="nearest",
+                padding_mode="zeros",
+            ).squeeze(1)
+        if self.n_segm_chan == 2:
+            coarse_segm_gt = coarse_segm_gt > 0
+        return F.cross_entropy(coarse_segm_est, coarse_segm_gt.long())
+
+    def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor:
+        """
+        Fake segmentation loss used when no suitable ground truth data
+        was found in a batch. The loss has a value 0 and is primarily used to
+        construct the computation graph, so that `DistributedDataParallel`
+        has similar graphs on all GPUs and can perform reduction properly.
+
+        Args:
+            densepose_predictor_outputs: DensePose predictor outputs, an object
+                of a dataclass that is assumed to have `coarse_segm`
+                attribute
+        Return:
+            Zero value loss with proper computation graph
+        """
+        return densepose_predictor_outputs.coarse_segm.sum() * 0
diff --git a/densepose/modeling/losses/soft_embed.py b/densepose/modeling/losses/soft_embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..03b69ec36a59ae0d69bb77efa77f93c6f95fad97
--- /dev/null
+++ b/densepose/modeling/losses/soft_embed.py
@@ -0,0 +1,133 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from typing import Any, Dict, List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.data.meshes.catalog import MeshCatalog
+from densepose.modeling.cse.utils import normalize_embeddings, squared_euclidean_distance_matrix
+from densepose.structures.mesh import create_mesh
+
+from .embed_utils import PackedCseAnnotations
+from .utils import BilinearInterpolationHelper
+
+
+class SoftEmbeddingLoss:
+    """
+    Computes losses for estimated embeddings given annotated vertices.
+    Instances in a minibatch that correspond to the same mesh are grouped
+    together. For each group, loss is computed as cross-entropy for
+    unnormalized scores given ground truth mesh vertex ids.
+    Scores are based on:
+     1) squared distances between estimated vertex embeddings
+        and mesh vertex embeddings;
+     2) geodesic distances between vertices of a mesh
+    """
+
+    def __init__(self, cfg: CfgNode):
+        """
+        Initialize embedding loss from config
+        """
+        self.embdist_gauss_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_DIST_GAUSS_SIGMA
+        self.geodist_gauss_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.GEODESIC_DIST_GAUSS_SIGMA
+
+    def __call__(
+        self,
+        proposals_with_gt: List[Instances],
+        densepose_predictor_outputs: Any,
+        packed_annotations: PackedCseAnnotations,
+        interpolator: BilinearInterpolationHelper,
+        embedder: nn.Module,
+    ) -> Dict[int, torch.Tensor]:
+        """
+        Produces losses for estimated embeddings given annotated vertices.
+        Embeddings for all the vertices of a mesh are computed by the embedder.
+        Embeddings for observed pixels are estimated by a predictor.
+        Losses are computed as cross-entropy for unnormalized scores given
+        ground truth vertex IDs.
+         1) squared distances between estimated vertex embeddings
+            and mesh vertex embeddings;
+         2) geodesic distances between vertices of a mesh
+
+        Args:
+            proposals_with_gt (list of Instances): detections with associated
+                ground truth data; each item corresponds to instances detected
+                on 1 image; the number of items corresponds to the number of
+                images in a batch
+            densepose_predictor_outputs: an object of a dataclass that contains predictor
+                outputs with estimated values; assumed to have the following attributes:
+                * embedding - embedding estimates, tensor of shape [N, D, S, S], where
+                  N = number of instances (= sum N_i, where N_i is the number of
+                      instances on image i)
+                  D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)
+                  S = output size (width and height)
+            packed_annotations (PackedCseAnnotations): contains various data useful
+                for loss computation, each data is packed into a single tensor
+            interpolator (BilinearInterpolationHelper): bilinear interpolation helper
+            embedder (nn.Module): module that computes vertex embeddings for different meshes
+        Return:
+            dict(int -> tensor): losses for different mesh IDs
+        """
+        losses = {}
+        for mesh_id_tensor in packed_annotations.vertex_mesh_ids_gt.unique():
+            mesh_id = mesh_id_tensor.item()
+            mesh_name = MeshCatalog.get_mesh_name(mesh_id)
+            # valid points are those that fall into estimated bbox
+            # and correspond to the current mesh
+            j_valid = interpolator.j_valid * (  # pyre-ignore[16]
+                packed_annotations.vertex_mesh_ids_gt == mesh_id
+            )
+            if not torch.any(j_valid):
+                continue
+            # extract estimated embeddings for valid points
+            # -> tensor [J, D]
+            vertex_embeddings_i = normalize_embeddings(
+                interpolator.extract_at_points(
+                    densepose_predictor_outputs.embedding,
+                    slice_fine_segm=slice(None),
+                    w_ylo_xlo=interpolator.w_ylo_xlo[:, None],  # pyre-ignore[16]
+                    w_ylo_xhi=interpolator.w_ylo_xhi[:, None],  # pyre-ignore[16]
+                    w_yhi_xlo=interpolator.w_yhi_xlo[:, None],  # pyre-ignore[16]
+                    w_yhi_xhi=interpolator.w_yhi_xhi[:, None],  # pyre-ignore[16]
+                )[j_valid, :]
+            )
+            # extract vertex ids for valid points
+            # -> tensor [J]
+            vertex_indices_i = packed_annotations.vertex_ids_gt[j_valid]
+            # embeddings for all mesh vertices
+            # -> tensor [K, D]
+            mesh_vertex_embeddings = embedder(mesh_name)
+            # softmax values of geodesic distances for GT mesh vertices
+            # -> tensor [J, K]
+            mesh = create_mesh(mesh_name, mesh_vertex_embeddings.device)
+            geodist_softmax_values = F.softmax(
+                mesh.geodists[vertex_indices_i] / (-self.geodist_gauss_sigma), dim=1
+            )
+            # logsoftmax values for valid points
+            # -> tensor [J, K]
+            embdist_logsoftmax_values = F.log_softmax(
+                squared_euclidean_distance_matrix(vertex_embeddings_i, mesh_vertex_embeddings)
+                / (-self.embdist_gauss_sigma),
+                dim=1,
+            )
+            losses[mesh_name] = (-geodist_softmax_values * embdist_logsoftmax_values).sum(1).mean()
+
+        for mesh_name in embedder.mesh_names:
+            if mesh_name not in losses:
+                losses[mesh_name] = self.fake_value(
+                    densepose_predictor_outputs, embedder, mesh_name
+                )
+        return losses
+
+    def fake_values(self, densepose_predictor_outputs: Any, embedder: nn.Module):
+        losses = {}
+        for mesh_name in embedder.mesh_names:
+            losses[mesh_name] = self.fake_value(densepose_predictor_outputs, embedder, mesh_name)
+        return losses
+
+    def fake_value(self, densepose_predictor_outputs: Any, embedder: nn.Module, mesh_name: str):
+        return densepose_predictor_outputs.embedding.sum() * 0 + embedder(mesh_name).sum() * 0
diff --git a/densepose/modeling/losses/utils.py b/densepose/modeling/losses/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ceea981d11650af80cb007fe129a3ee4864fc48f
--- /dev/null
+++ b/densepose/modeling/losses/utils.py
@@ -0,0 +1,443 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures import BoxMode, Instances
+
+from densepose import DensePoseDataRelative
+
+LossDict = Dict[str, torch.Tensor]
+
+
+def _linear_interpolation_utilities(v_norm, v0_src, size_src, v0_dst, size_dst, size_z):
+    """
+    Computes utility values for linear interpolation at points v.
+    The points are given as normalized offsets in the source interval
+    (v0_src, v0_src + size_src), more precisely:
+        v = v0_src + v_norm * size_src / 256.0
+    The computed utilities include lower points v_lo, upper points v_hi,
+    interpolation weights v_w and flags j_valid indicating whether the
+    points falls into the destination interval (v0_dst, v0_dst + size_dst).
+
+    Args:
+        v_norm (:obj: `torch.Tensor`): tensor of size N containing
+            normalized point offsets
+        v0_src (:obj: `torch.Tensor`): tensor of size N containing
+            left bounds of source intervals for normalized points
+        size_src (:obj: `torch.Tensor`): tensor of size N containing
+            source interval sizes for normalized points
+        v0_dst (:obj: `torch.Tensor`): tensor of size N containing
+            left bounds of destination intervals
+        size_dst (:obj: `torch.Tensor`): tensor of size N containing
+            destination interval sizes
+        size_z (int): interval size for data to be interpolated
+
+    Returns:
+        v_lo (:obj: `torch.Tensor`): int tensor of size N containing
+            indices of lower values used for interpolation, all values are
+            integers from [0, size_z - 1]
+        v_hi (:obj: `torch.Tensor`): int tensor of size N containing
+            indices of upper values used for interpolation, all values are
+            integers from [0, size_z - 1]
+        v_w (:obj: `torch.Tensor`): float tensor of size N containing
+            interpolation weights
+        j_valid (:obj: `torch.Tensor`): uint8 tensor of size N containing
+            0 for points outside the estimation interval
+            (v0_est, v0_est + size_est) and 1 otherwise
+    """
+    v = v0_src + v_norm * size_src / 256.0
+    j_valid = (v - v0_dst >= 0) * (v - v0_dst < size_dst)
+    v_grid = (v - v0_dst) * size_z / size_dst
+    v_lo = v_grid.floor().long().clamp(min=0, max=size_z - 1)
+    v_hi = (v_lo + 1).clamp(max=size_z - 1)
+    v_grid = torch.min(v_hi.float(), v_grid)
+    v_w = v_grid - v_lo.float()
+    return v_lo, v_hi, v_w, j_valid
+
+
+class BilinearInterpolationHelper:
+    """
+    Args:
+        packed_annotations: object that contains packed annotations
+        j_valid (:obj: `torch.Tensor`): uint8 tensor of size M containing
+            0 for points to be discarded and 1 for points to be selected
+        y_lo (:obj: `torch.Tensor`): int tensor of indices of upper values
+            in z_est for each point
+        y_hi (:obj: `torch.Tensor`): int tensor of indices of lower values
+            in z_est for each point
+        x_lo (:obj: `torch.Tensor`): int tensor of indices of left values
+            in z_est for each point
+        x_hi (:obj: `torch.Tensor`): int tensor of indices of right values
+            in z_est for each point
+        w_ylo_xlo (:obj: `torch.Tensor`): float tensor of size M;
+            contains upper-left value weight for each point
+        w_ylo_xhi (:obj: `torch.Tensor`): float tensor of size M;
+            contains upper-right value weight for each point
+        w_yhi_xlo (:obj: `torch.Tensor`): float tensor of size M;
+            contains lower-left value weight for each point
+        w_yhi_xhi (:obj: `torch.Tensor`): float tensor of size M;
+            contains lower-right value weight for each point
+    """
+
+    def __init__(
+        self,
+        packed_annotations: Any,
+        j_valid: torch.Tensor,
+        y_lo: torch.Tensor,
+        y_hi: torch.Tensor,
+        x_lo: torch.Tensor,
+        x_hi: torch.Tensor,
+        w_ylo_xlo: torch.Tensor,
+        w_ylo_xhi: torch.Tensor,
+        w_yhi_xlo: torch.Tensor,
+        w_yhi_xhi: torch.Tensor,
+    ):
+        for k, v in locals().items():
+            if k != "self":
+                setattr(self, k, v)
+
+    @staticmethod
+    def from_matches(
+        packed_annotations: Any, densepose_outputs_size_hw: Tuple[int, int]
+    ) -> "BilinearInterpolationHelper":
+        """
+        Args:
+            packed_annotations: annotations packed into tensors, the following
+                attributes are required:
+                 - bbox_xywh_gt
+                 - bbox_xywh_est
+                 - x_gt
+                 - y_gt
+                 - point_bbox_with_dp_indices
+                 - point_bbox_indices
+            densepose_outputs_size_hw (tuple [int, int]): resolution of
+                DensePose predictor outputs (H, W)
+        Return:
+            An instance of `BilinearInterpolationHelper` used to perform
+            interpolation for the given annotation points and output resolution
+        """
+
+        zh, zw = densepose_outputs_size_hw
+        x0_gt, y0_gt, w_gt, h_gt = packed_annotations.bbox_xywh_gt[
+            packed_annotations.point_bbox_with_dp_indices
+        ].unbind(dim=1)
+        x0_est, y0_est, w_est, h_est = packed_annotations.bbox_xywh_est[
+            packed_annotations.point_bbox_with_dp_indices
+        ].unbind(dim=1)
+        x_lo, x_hi, x_w, jx_valid = _linear_interpolation_utilities(
+            packed_annotations.x_gt, x0_gt, w_gt, x0_est, w_est, zw
+        )
+        y_lo, y_hi, y_w, jy_valid = _linear_interpolation_utilities(
+            packed_annotations.y_gt, y0_gt, h_gt, y0_est, h_est, zh
+        )
+        j_valid = jx_valid * jy_valid
+
+        w_ylo_xlo = (1.0 - x_w) * (1.0 - y_w)
+        w_ylo_xhi = x_w * (1.0 - y_w)
+        w_yhi_xlo = (1.0 - x_w) * y_w
+        w_yhi_xhi = x_w * y_w
+
+        return BilinearInterpolationHelper(
+            packed_annotations,
+            j_valid,
+            y_lo,
+            y_hi,
+            x_lo,
+            x_hi,
+            w_ylo_xlo,  # pyre-ignore[6]
+            w_ylo_xhi,
+            # pyre-fixme[6]: Expected `Tensor` for 9th param but got `float`.
+            w_yhi_xlo,
+            w_yhi_xhi,
+        )
+
+    def extract_at_points(
+        self,
+        z_est,
+        slice_fine_segm=None,
+        w_ylo_xlo=None,
+        w_ylo_xhi=None,
+        w_yhi_xlo=None,
+        w_yhi_xhi=None,
+    ):
+        """
+        Extract ground truth values z_gt for valid point indices and estimated
+        values z_est using bilinear interpolation over top-left (y_lo, x_lo),
+        top-right (y_lo, x_hi), bottom-left (y_hi, x_lo) and bottom-right
+        (y_hi, x_hi) values in z_est with corresponding weights:
+        w_ylo_xlo, w_ylo_xhi, w_yhi_xlo and w_yhi_xhi.
+        Use slice_fine_segm to slice dim=1 in z_est
+        """
+        slice_fine_segm = (
+            self.packed_annotations.fine_segm_labels_gt
+            if slice_fine_segm is None
+            else slice_fine_segm
+        )
+        w_ylo_xlo = self.w_ylo_xlo if w_ylo_xlo is None else w_ylo_xlo
+        w_ylo_xhi = self.w_ylo_xhi if w_ylo_xhi is None else w_ylo_xhi
+        w_yhi_xlo = self.w_yhi_xlo if w_yhi_xlo is None else w_yhi_xlo
+        w_yhi_xhi = self.w_yhi_xhi if w_yhi_xhi is None else w_yhi_xhi
+
+        index_bbox = self.packed_annotations.point_bbox_indices
+        z_est_sampled = (
+            z_est[index_bbox, slice_fine_segm, self.y_lo, self.x_lo] * w_ylo_xlo
+            + z_est[index_bbox, slice_fine_segm, self.y_lo, self.x_hi] * w_ylo_xhi
+            + z_est[index_bbox, slice_fine_segm, self.y_hi, self.x_lo] * w_yhi_xlo
+            + z_est[index_bbox, slice_fine_segm, self.y_hi, self.x_hi] * w_yhi_xhi
+        )
+        return z_est_sampled
+
+
+def resample_data(
+    z, bbox_xywh_src, bbox_xywh_dst, wout, hout, mode: str = "nearest", padding_mode: str = "zeros"
+):
+    """
+    Args:
+        z (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with data to be
+            resampled
+        bbox_xywh_src (:obj: `torch.Tensor`): tensor of size (N,4) containing
+            source bounding boxes in format XYWH
+        bbox_xywh_dst (:obj: `torch.Tensor`): tensor of size (N,4) containing
+            destination bounding boxes in format XYWH
+    Return:
+        zresampled (:obj: `torch.Tensor`): tensor of size (N, C, Hout, Wout)
+            with resampled values of z, where D is the discretization size
+    """
+    n = bbox_xywh_src.size(0)
+    assert n == bbox_xywh_dst.size(0), (
+        "The number of "
+        "source ROIs for resampling ({}) should be equal to the number "
+        "of destination ROIs ({})".format(bbox_xywh_src.size(0), bbox_xywh_dst.size(0))
+    )
+    x0src, y0src, wsrc, hsrc = bbox_xywh_src.unbind(dim=1)
+    x0dst, y0dst, wdst, hdst = bbox_xywh_dst.unbind(dim=1)
+    x0dst_norm = 2 * (x0dst - x0src) / wsrc - 1
+    y0dst_norm = 2 * (y0dst - y0src) / hsrc - 1
+    x1dst_norm = 2 * (x0dst + wdst - x0src) / wsrc - 1
+    y1dst_norm = 2 * (y0dst + hdst - y0src) / hsrc - 1
+    grid_w = torch.arange(wout, device=z.device, dtype=torch.float) / wout
+    grid_h = torch.arange(hout, device=z.device, dtype=torch.float) / hout
+    grid_w_expanded = grid_w[None, None, :].expand(n, hout, wout)
+    grid_h_expanded = grid_h[None, :, None].expand(n, hout, wout)
+    dx_expanded = (x1dst_norm - x0dst_norm)[:, None, None].expand(n, hout, wout)
+    dy_expanded = (y1dst_norm - y0dst_norm)[:, None, None].expand(n, hout, wout)
+    x0_expanded = x0dst_norm[:, None, None].expand(n, hout, wout)
+    y0_expanded = y0dst_norm[:, None, None].expand(n, hout, wout)
+    grid_x = grid_w_expanded * dx_expanded + x0_expanded
+    grid_y = grid_h_expanded * dy_expanded + y0_expanded
+    grid = torch.stack((grid_x, grid_y), dim=3)
+    # resample Z from (N, C, H, W) into (N, C, Hout, Wout)
+    zresampled = F.grid_sample(z, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
+    return zresampled
+
+
+class AnnotationsAccumulator(ABC):
+    """
+    Abstract class for an accumulator for annotations that can produce
+    dense annotations packed into tensors.
+    """
+
+    @abstractmethod
+    def accumulate(self, instances_one_image: Instances):
+        """
+        Accumulate instances data for one image
+
+        Args:
+            instances_one_image (Instances): instances data to accumulate
+        """
+        pass
+
+    @abstractmethod
+    def pack(self) -> Any:
+        """
+        Pack data into tensors
+        """
+        pass
+
+
+@dataclass
+class PackedChartBasedAnnotations:
+    """
+    Packed annotations for chart-based model training. The following attributes
+    are defined:
+     - fine_segm_labels_gt (tensor [K] of `int64`): GT fine segmentation point labels
+     - x_gt (tensor [K] of `float32`): GT normalized X point coordinates
+     - y_gt (tensor [K] of `float32`): GT normalized Y point coordinates
+     - u_gt (tensor [K] of `float32`): GT point U values
+     - v_gt (tensor [K] of `float32`): GT point V values
+     - coarse_segm_gt (tensor [N, S, S] of `float32`): GT segmentation for bounding boxes
+     - bbox_xywh_gt (tensor [N, 4] of `float32`): selected GT bounding boxes in
+         XYWH format
+     - bbox_xywh_est (tensor [N, 4] of `float32`): selected matching estimated
+         bounding boxes in XYWH format
+     - point_bbox_with_dp_indices (tensor [K] of `int64`): indices of bounding boxes
+         with DensePose annotations that correspond to the point data
+     - point_bbox_indices (tensor [K] of `int64`): indices of bounding boxes
+         (not necessarily the selected ones with DensePose data) that correspond
+         to the point data
+     - bbox_indices (tensor [N] of `int64`): global indices of selected bounding
+         boxes with DensePose annotations; these indices could be used to access
+         features that are computed for all bounding boxes, not only the ones with
+         DensePose annotations.
+    Here K is the total number of points and N is the total number of instances
+    with DensePose annotations.
+    """
+
+    fine_segm_labels_gt: torch.Tensor
+    x_gt: torch.Tensor
+    y_gt: torch.Tensor
+    u_gt: torch.Tensor
+    v_gt: torch.Tensor
+    coarse_segm_gt: Optional[torch.Tensor]
+    bbox_xywh_gt: torch.Tensor
+    bbox_xywh_est: torch.Tensor
+    point_bbox_with_dp_indices: torch.Tensor
+    point_bbox_indices: torch.Tensor
+    bbox_indices: torch.Tensor
+
+
+class ChartBasedAnnotationsAccumulator(AnnotationsAccumulator):
+    """
+    Accumulates annotations by batches that correspond to objects detected on
+    individual images. Can pack them together into single tensors.
+    """
+
+    def __init__(self):
+        self.i_gt = []
+        self.x_gt = []
+        self.y_gt = []
+        self.u_gt = []
+        self.v_gt = []
+        self.s_gt = []
+        self.bbox_xywh_gt = []
+        self.bbox_xywh_est = []
+        self.point_bbox_with_dp_indices = []
+        self.point_bbox_indices = []
+        self.bbox_indices = []
+        self.nxt_bbox_with_dp_index = 0
+        self.nxt_bbox_index = 0
+
+    def accumulate(self, instances_one_image: Instances):
+        """
+        Accumulate instances data for one image
+
+        Args:
+            instances_one_image (Instances): instances data to accumulate
+        """
+        boxes_xywh_est = BoxMode.convert(
+            instances_one_image.proposal_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+        )
+        boxes_xywh_gt = BoxMode.convert(
+            instances_one_image.gt_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+        )
+        n_matches = len(boxes_xywh_gt)
+        assert n_matches == len(
+            boxes_xywh_est
+        ), f"Got {len(boxes_xywh_est)} proposal boxes and {len(boxes_xywh_gt)} GT boxes"
+        if not n_matches:
+            # no detection - GT matches
+            return
+        if (
+            not hasattr(instances_one_image, "gt_densepose")
+            or instances_one_image.gt_densepose is None
+        ):
+            # no densepose GT for the detections, just increase the bbox index
+            self.nxt_bbox_index += n_matches
+            return
+        for box_xywh_est, box_xywh_gt, dp_gt in zip(
+            boxes_xywh_est, boxes_xywh_gt, instances_one_image.gt_densepose
+        ):
+            if (dp_gt is not None) and (len(dp_gt.x) > 0):
+                # pyre-fixme[6]: For 1st argument expected `Tensor` but got `float`.
+                # pyre-fixme[6]: For 2nd argument expected `Tensor` but got `float`.
+                self._do_accumulate(box_xywh_gt, box_xywh_est, dp_gt)
+            self.nxt_bbox_index += 1
+
+    def _do_accumulate(
+        self, box_xywh_gt: torch.Tensor, box_xywh_est: torch.Tensor, dp_gt: DensePoseDataRelative
+    ):
+        """
+        Accumulate instances data for one image, given that the data is not empty
+
+        Args:
+            box_xywh_gt (tensor): GT bounding box
+            box_xywh_est (tensor): estimated bounding box
+            dp_gt (DensePoseDataRelative): GT densepose data
+        """
+        self.i_gt.append(dp_gt.i)
+        self.x_gt.append(dp_gt.x)
+        self.y_gt.append(dp_gt.y)
+        self.u_gt.append(dp_gt.u)
+        self.v_gt.append(dp_gt.v)
+        if hasattr(dp_gt, "segm"):
+            self.s_gt.append(dp_gt.segm.unsqueeze(0))
+        self.bbox_xywh_gt.append(box_xywh_gt.view(-1, 4))
+        self.bbox_xywh_est.append(box_xywh_est.view(-1, 4))
+        self.point_bbox_with_dp_indices.append(
+            torch.full_like(dp_gt.i, self.nxt_bbox_with_dp_index)
+        )
+        self.point_bbox_indices.append(torch.full_like(dp_gt.i, self.nxt_bbox_index))
+        self.bbox_indices.append(self.nxt_bbox_index)
+        self.nxt_bbox_with_dp_index += 1
+
+    def pack(self) -> Optional[PackedChartBasedAnnotations]:
+        """
+        Pack data into tensors
+        """
+        if not len(self.i_gt):
+            # TODO:
+            # returning proper empty annotations would require
+            # creating empty tensors of appropriate shape and
+            # type on an appropriate device;
+            # we return None so far to indicate empty annotations
+            return None
+        return PackedChartBasedAnnotations(
+            fine_segm_labels_gt=torch.cat(self.i_gt, 0).long(),
+            x_gt=torch.cat(self.x_gt, 0),
+            y_gt=torch.cat(self.y_gt, 0),
+            u_gt=torch.cat(self.u_gt, 0),
+            v_gt=torch.cat(self.v_gt, 0),
+            # ignore segmentation annotations, if not all the instances contain those
+            coarse_segm_gt=torch.cat(self.s_gt, 0)
+            if len(self.s_gt) == len(self.bbox_xywh_gt)
+            else None,
+            bbox_xywh_gt=torch.cat(self.bbox_xywh_gt, 0),
+            bbox_xywh_est=torch.cat(self.bbox_xywh_est, 0),
+            point_bbox_with_dp_indices=torch.cat(self.point_bbox_with_dp_indices, 0).long(),
+            point_bbox_indices=torch.cat(self.point_bbox_indices, 0).long(),
+            bbox_indices=torch.as_tensor(
+                self.bbox_indices, dtype=torch.long, device=self.x_gt[0].device
+            ).long(),
+        )
+
+
+def extract_packed_annotations_from_matches(
+    proposals_with_targets: List[Instances], accumulator: AnnotationsAccumulator
+) -> Any:
+    for proposals_targets_per_image in proposals_with_targets:
+        accumulator.accumulate(proposals_targets_per_image)
+    return accumulator.pack()
+
+
+def sample_random_indices(
+    n_indices: int, n_samples: int, device: Optional[torch.device] = None
+) -> Optional[torch.Tensor]:
+    """
+    Samples `n_samples` random indices from range `[0..n_indices - 1]`.
+    If `n_indices` is smaller than `n_samples`, returns `None` meaning that all indices
+    are selected.
+    Args:
+        n_indices (int): total number of indices
+        n_samples (int): number of indices to sample
+        device (torch.device): the desired device of returned tensor
+    Return:
+        Tensor of selected vertex indices, or `None`, if all vertices are selected
+    """
+    if (n_samples <= 0) or (n_indices <= n_samples):
+        return None
+    indices = torch.randperm(n_indices, device=device)[:n_samples]
+    return indices
diff --git a/densepose/modeling/predictors/__init__.py b/densepose/modeling/predictors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ece0757acf2a4924079c884cab44a71cea22c37
--- /dev/null
+++ b/densepose/modeling/predictors/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .chart import DensePoseChartPredictor
+from .chart_confidence import DensePoseChartConfidencePredictorMixin
+from .chart_with_confidence import DensePoseChartWithConfidencePredictor
+from .cse import DensePoseEmbeddingPredictor
+from .cse_confidence import DensePoseEmbeddingConfidencePredictorMixin
+from .cse_with_confidence import DensePoseEmbeddingWithConfidencePredictor
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
diff --git a/densepose/modeling/predictors/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b68a643541541c8e496263a5e61eecc35154e2b9
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/chart.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/chart.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c4ddb681c102844befe518891588945b12565ab
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/chart.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/chart_confidence.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/chart_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cded50b9998366d164da11266266cbb587b76672
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/chart_confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/chart_with_confidence.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/chart_with_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84f26c2454562aeac3c6bba3a7116de21a68d5c0
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/chart_with_confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/cse.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/cse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40019d38ee6bfdb56e63d7f5699000d1b4ea7ace
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/cse.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/cse_confidence.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/cse_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3a9a09abe9709e7a735b387ee7bf419a8231253
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/cse_confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/cse_with_confidence.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/cse_with_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3fa33715e0a110b56932258cc60cd03f9cbf72cd
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/cse_with_confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/registry.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7651cd60c22b8be77723d4e501a5c7a7b0be283f
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/registry.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/chart.py b/densepose/modeling/predictors/chart.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bcd13f7c592e37c2751556cda1f6e9cd3400b73
--- /dev/null
+++ b/densepose/modeling/predictors/chart.py
@@ -0,0 +1,94 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import torch
+from torch import nn
+
+from detectron2.config import CfgNode
+from detectron2.layers import ConvTranspose2d, interpolate
+
+from ...structures import DensePoseChartPredictorOutput
+from ..utils import initialize_module_params
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
+
+
+@DENSEPOSE_PREDICTOR_REGISTRY.register()
+class DensePoseChartPredictor(nn.Module):
+    """
+    Predictor (last layers of a DensePose model) that takes DensePose head outputs as an input
+    and produces 4 tensors which represent DensePose results for predefined body parts
+    (patches / charts):
+     * coarse segmentation, a tensor of shape [N, K, Hout, Wout]
+     * fine segmentation, a tensor of shape [N, C, Hout, Wout]
+     * U coordinates, a tensor of shape [N, C, Hout, Wout]
+     * V coordinates, a tensor of shape [N, C, Hout, Wout]
+    where
+     - N is the number of instances
+     - K is the number of coarse segmentation channels (
+         2 = foreground / background,
+         15 = one of 14 body parts / background)
+     - C is the number of fine segmentation channels (
+         24 fine body parts / background)
+     - Hout and Wout are height and width of predictions
+    """
+
+    def __init__(self, cfg: CfgNode, input_channels: int):
+        """
+        Initialize predictor using configuration options
+
+        Args:
+            cfg (CfgNode): configuration options
+            input_channels (int): input tensor size along the channel dimension
+        """
+        super().__init__()
+        dim_in = input_channels
+        n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+        dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
+        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+        # coarse segmentation
+        self.ann_index_lowres = ConvTranspose2d(
+            dim_in, n_segm_chan, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        # fine segmentation
+        self.index_uv_lowres = ConvTranspose2d(
+            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        # U
+        self.u_lowres = ConvTranspose2d(
+            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        # V
+        self.v_lowres = ConvTranspose2d(
+            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.scale_factor = cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE
+        initialize_module_params(self)
+
+    def interp2d(self, tensor_nchw: torch.Tensor):
+        """
+        Bilinear interpolation method to be used for upscaling
+
+        Args:
+            tensor_nchw (tensor): tensor of shape (N, C, H, W)
+        Return:
+            tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
+                by applying the scale factor to H and W
+        """
+        return interpolate(
+            tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
+        )
+
+    def forward(self, head_outputs: torch.Tensor):
+        """
+        Perform forward step on DensePose head outputs
+
+        Args:
+            head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
+        Return:
+           An instance of DensePoseChartPredictorOutput
+        """
+        return DensePoseChartPredictorOutput(
+            coarse_segm=self.interp2d(self.ann_index_lowres(head_outputs)),
+            fine_segm=self.interp2d(self.index_uv_lowres(head_outputs)),
+            u=self.interp2d(self.u_lowres(head_outputs)),
+            v=self.interp2d(self.v_lowres(head_outputs)),
+        )
diff --git a/densepose/modeling/predictors/chart_confidence.py b/densepose/modeling/predictors/chart_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c0099952f3e675e42aa7d3b6d35065fdaf43dbb
--- /dev/null
+++ b/densepose/modeling/predictors/chart_confidence.py
@@ -0,0 +1,174 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.layers import ConvTranspose2d
+
+from ...structures import decorate_predictor_output_class_with_confidences
+from ..confidence import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType
+from ..utils import initialize_module_params
+
+
+class DensePoseChartConfidencePredictorMixin:
+    """
+    Predictor contains the last layers of a DensePose model that take DensePose head
+    outputs as an input and produce model outputs. Confidence predictor mixin is used
+    to generate confidences for segmentation and UV tensors estimated by some
+    base predictor. Several assumptions need to hold for the base predictor:
+    1) the `forward` method must return SIUV tuple as the first result (
+        S = coarse segmentation, I = fine segmentation, U and V are intrinsic
+        chart coordinates)
+    2) `interp2d` method must be defined to perform bilinear interpolation;
+        the same method is typically used for SIUV and confidences
+    Confidence predictor mixin provides confidence estimates, as described in:
+        N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
+            from Noisy Labels, NeurIPS 2019
+        A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
+    """
+
+    def __init__(self, cfg: CfgNode, input_channels: int):
+        """
+        Initialize confidence predictor using configuration options.
+
+        Args:
+            cfg (CfgNode): configuration options
+            input_channels (int): number of input channels
+        """
+        # we rely on base predictor to call nn.Module.__init__
+        super().__init__(cfg, input_channels)  # pyre-ignore[19]
+        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
+        self._initialize_confidence_estimation_layers(cfg, input_channels)
+        self._registry = {}
+        initialize_module_params(self)  # pyre-ignore[6]
+
+    def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
+        """
+        Initialize confidence estimation layers based on configuration options
+
+        Args:
+            cfg (CfgNode): configuration options
+            dim_in (int): number of input channels
+        """
+        dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
+        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+        if self.confidence_model_cfg.uv_confidence.enabled:
+            if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
+                self.sigma_2_lowres = ConvTranspose2d(  # pyre-ignore[16]
+                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+                )
+            elif (
+                self.confidence_model_cfg.uv_confidence.type
+                == DensePoseUVConfidenceType.INDEP_ANISO
+            ):
+                self.sigma_2_lowres = ConvTranspose2d(
+                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+                )
+                self.kappa_u_lowres = ConvTranspose2d(  # pyre-ignore[16]
+                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+                )
+                self.kappa_v_lowres = ConvTranspose2d(  # pyre-ignore[16]
+                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+                )
+            else:
+                raise ValueError(
+                    f"Unknown confidence model type: "
+                    f"{self.confidence_model_cfg.confidence_model_type}"
+                )
+        if self.confidence_model_cfg.segm_confidence.enabled:
+            self.fine_segm_confidence_lowres = ConvTranspose2d(  # pyre-ignore[16]
+                dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+            )
+            self.coarse_segm_confidence_lowres = ConvTranspose2d(  # pyre-ignore[16]
+                dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+            )
+
+    def forward(self, head_outputs: torch.Tensor):
+        """
+        Perform forward operation on head outputs used as inputs for the predictor.
+        Calls forward method from the base predictor and uses its outputs to compute
+        confidences.
+
+        Args:
+            head_outputs (Tensor): head outputs used as predictor inputs
+        Return:
+            An instance of outputs with confidences,
+            see `decorate_predictor_output_class_with_confidences`
+        """
+        # assuming base class returns SIUV estimates in its first result
+        base_predictor_outputs = super().forward(head_outputs)  # pyre-ignore[16]
+
+        # create output instance by extending base predictor outputs:
+        output = self._create_output_instance(base_predictor_outputs)
+
+        if self.confidence_model_cfg.uv_confidence.enabled:
+            if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
+                # assuming base class defines interp2d method for bilinear interpolation
+                output.sigma_2 = self.interp2d(self.sigma_2_lowres(head_outputs))  # pyre-ignore[16]
+            elif (
+                self.confidence_model_cfg.uv_confidence.type
+                == DensePoseUVConfidenceType.INDEP_ANISO
+            ):
+                # assuming base class defines interp2d method for bilinear interpolation
+                output.sigma_2 = self.interp2d(self.sigma_2_lowres(head_outputs))
+                output.kappa_u = self.interp2d(self.kappa_u_lowres(head_outputs))  # pyre-ignore[16]
+                output.kappa_v = self.interp2d(self.kappa_v_lowres(head_outputs))  # pyre-ignore[16]
+            else:
+                raise ValueError(
+                    f"Unknown confidence model type: "
+                    f"{self.confidence_model_cfg.confidence_model_type}"
+                )
+        if self.confidence_model_cfg.segm_confidence.enabled:
+            # base predictor outputs are assumed to have `fine_segm` and `coarse_segm` attributes
+            # base predictor is assumed to define `interp2d` method for bilinear interpolation
+            output.fine_segm_confidence = (
+                F.softplus(
+                    self.interp2d(self.fine_segm_confidence_lowres(head_outputs))  # pyre-ignore[16]
+                )
+                + self.confidence_model_cfg.segm_confidence.epsilon
+            )
+            output.fine_segm = base_predictor_outputs.fine_segm * torch.repeat_interleave(
+                output.fine_segm_confidence, base_predictor_outputs.fine_segm.shape[1], dim=1
+            )
+            output.coarse_segm_confidence = (
+                F.softplus(
+                    self.interp2d(
+                        self.coarse_segm_confidence_lowres(head_outputs)  # pyre-ignore[16]
+                    )
+                )
+                + self.confidence_model_cfg.segm_confidence.epsilon
+            )
+            output.coarse_segm = base_predictor_outputs.coarse_segm * torch.repeat_interleave(
+                output.coarse_segm_confidence, base_predictor_outputs.coarse_segm.shape[1], dim=1
+            )
+
+        return output
+
+    def _create_output_instance(self, base_predictor_outputs: Any):
+        """
+        Create an instance of predictor outputs by copying the outputs from the
+        base predictor and initializing confidence
+
+        Args:
+            base_predictor_outputs: an instance of base predictor outputs
+                (the outputs type is assumed to be a dataclass)
+        Return:
+           An instance of outputs with confidences
+        """
+        PredictorOutput = decorate_predictor_output_class_with_confidences(
+            type(base_predictor_outputs)  # pyre-ignore[6]
+        )
+        # base_predictor_outputs is assumed to be a dataclass
+        # reassign all the fields from base_predictor_outputs (no deep copy!), add new fields
+        output = PredictorOutput(
+            **base_predictor_outputs.__dict__,
+            coarse_segm_confidence=None,
+            fine_segm_confidence=None,
+            sigma_1=None,
+            sigma_2=None,
+            kappa_u=None,
+            kappa_v=None,
+        )
+        return output
diff --git a/densepose/modeling/predictors/chart_with_confidence.py b/densepose/modeling/predictors/chart_with_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c1cd6cc8fda56e831fbc02a8ffdd844866c0e4f
--- /dev/null
+++ b/densepose/modeling/predictors/chart_with_confidence.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from . import DensePoseChartConfidencePredictorMixin, DensePoseChartPredictor
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
+
+
+@DENSEPOSE_PREDICTOR_REGISTRY.register()
+class DensePoseChartWithConfidencePredictor(
+    DensePoseChartConfidencePredictorMixin, DensePoseChartPredictor
+):
+    """
+    Predictor that combines chart and chart confidence estimation
+    """
+
+    pass
diff --git a/densepose/modeling/predictors/cse.py b/densepose/modeling/predictors/cse.py
new file mode 100644
index 0000000000000000000000000000000000000000..466a5ecddbfa338a2b603facf06d1f4510fff6eb
--- /dev/null
+++ b/densepose/modeling/predictors/cse.py
@@ -0,0 +1,70 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import torch
+from torch import nn
+
+from detectron2.config import CfgNode
+from detectron2.layers import ConvTranspose2d, interpolate
+
+from ...structures import DensePoseEmbeddingPredictorOutput
+from ..utils import initialize_module_params
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
+
+
+@DENSEPOSE_PREDICTOR_REGISTRY.register()
+class DensePoseEmbeddingPredictor(nn.Module):
+    """
+    Last layers of a DensePose model that take DensePose head outputs as an input
+    and produce model outputs for continuous surface embeddings (CSE).
+    """
+
+    def __init__(self, cfg: CfgNode, input_channels: int):
+        """
+        Initialize predictor using configuration options
+
+        Args:
+            cfg (CfgNode): configuration options
+            input_channels (int): input tensor size along the channel dimension
+        """
+        super().__init__()
+        dim_in = input_channels
+        n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+        embed_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+        # coarse segmentation
+        self.coarse_segm_lowres = ConvTranspose2d(
+            dim_in, n_segm_chan, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        # embedding
+        self.embed_lowres = ConvTranspose2d(
+            dim_in, embed_size, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.scale_factor = cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE
+        initialize_module_params(self)
+
+    def interp2d(self, tensor_nchw: torch.Tensor):
+        """
+        Bilinear interpolation method to be used for upscaling
+
+        Args:
+            tensor_nchw (tensor): tensor of shape (N, C, H, W)
+        Return:
+            tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
+                by applying the scale factor to H and W
+        """
+        return interpolate(
+            tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
+        )
+
+    def forward(self, head_outputs):
+        """
+        Perform forward step on DensePose head outputs
+
+        Args:
+            head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
+        """
+        embed_lowres = self.embed_lowres(head_outputs)
+        coarse_segm_lowres = self.coarse_segm_lowres(head_outputs)
+        embed = self.interp2d(embed_lowres)
+        coarse_segm = self.interp2d(coarse_segm_lowres)
+        return DensePoseEmbeddingPredictorOutput(embedding=embed, coarse_segm=coarse_segm)
diff --git a/densepose/modeling/predictors/cse_confidence.py b/densepose/modeling/predictors/cse_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..8220337cea8eb87bbdf74378079551259dcc37e2
--- /dev/null
+++ b/densepose/modeling/predictors/cse_confidence.py
@@ -0,0 +1,115 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.layers import ConvTranspose2d
+
+from densepose.modeling.confidence import DensePoseConfidenceModelConfig
+from densepose.modeling.utils import initialize_module_params
+from densepose.structures import decorate_cse_predictor_output_class_with_confidences
+
+
+class DensePoseEmbeddingConfidencePredictorMixin:
+    """
+    Predictor contains the last layers of a DensePose model that take DensePose head
+    outputs as an input and produce model outputs. Confidence predictor mixin is used
+    to generate confidences for coarse segmentation estimated by some
+    base predictor. Several assumptions need to hold for the base predictor:
+    1) the `forward` method must return CSE DensePose head outputs,
+        tensor of shape [N, D, H, W]
+    2) `interp2d` method must be defined to perform bilinear interpolation;
+        the same method is typically used for masks and confidences
+    Confidence predictor mixin provides confidence estimates, as described in:
+        N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
+            from Noisy Labels, NeurIPS 2019
+        A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
+    """
+
+    def __init__(self, cfg: CfgNode, input_channels: int):
+        """
+        Initialize confidence predictor using configuration options.
+
+        Args:
+            cfg (CfgNode): configuration options
+            input_channels (int): number of input channels
+        """
+        # we rely on base predictor to call nn.Module.__init__
+        super().__init__(cfg, input_channels)  # pyre-ignore[19]
+        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
+        self._initialize_confidence_estimation_layers(cfg, input_channels)
+        self._registry = {}
+        initialize_module_params(self)  # pyre-ignore[6]
+
+    def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
+        """
+        Initialize confidence estimation layers based on configuration options
+
+        Args:
+            cfg (CfgNode): configuration options
+            dim_in (int): number of input channels
+        """
+        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+        if self.confidence_model_cfg.segm_confidence.enabled:
+            self.coarse_segm_confidence_lowres = ConvTranspose2d(  # pyre-ignore[16]
+                dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+            )
+
+    def forward(self, head_outputs: torch.Tensor):
+        """
+        Perform forward operation on head outputs used as inputs for the predictor.
+        Calls forward method from the base predictor and uses its outputs to compute
+        confidences.
+
+        Args:
+            head_outputs (Tensor): head outputs used as predictor inputs
+        Return:
+            An instance of outputs with confidences,
+            see `decorate_cse_predictor_output_class_with_confidences`
+        """
+        # assuming base class returns SIUV estimates in its first result
+        base_predictor_outputs = super().forward(head_outputs)  # pyre-ignore[16]
+
+        # create output instance by extending base predictor outputs:
+        output = self._create_output_instance(base_predictor_outputs)
+
+        if self.confidence_model_cfg.segm_confidence.enabled:
+            # base predictor outputs are assumed to have `coarse_segm` attribute
+            # base predictor is assumed to define `interp2d` method for bilinear interpolation
+            output.coarse_segm_confidence = (
+                F.softplus(
+                    self.interp2d(  # pyre-ignore[16]
+                        self.coarse_segm_confidence_lowres(head_outputs)  # pyre-ignore[16]
+                    )
+                )
+                + self.confidence_model_cfg.segm_confidence.epsilon
+            )
+            output.coarse_segm = base_predictor_outputs.coarse_segm * torch.repeat_interleave(
+                output.coarse_segm_confidence, base_predictor_outputs.coarse_segm.shape[1], dim=1
+            )
+
+        return output
+
+    def _create_output_instance(self, base_predictor_outputs: Any):
+        """
+        Create an instance of predictor outputs by copying the outputs from the
+        base predictor and initializing confidence
+
+        Args:
+            base_predictor_outputs: an instance of base predictor outputs
+                (the outputs type is assumed to be a dataclass)
+        Return:
+           An instance of outputs with confidences
+        """
+        PredictorOutput = decorate_cse_predictor_output_class_with_confidences(
+            type(base_predictor_outputs)  # pyre-ignore[6]
+        )
+        # base_predictor_outputs is assumed to be a dataclass
+        # reassign all the fields from base_predictor_outputs (no deep copy!), add new fields
+        output = PredictorOutput(
+            **base_predictor_outputs.__dict__,
+            coarse_segm_confidence=None,
+        )
+        return output
diff --git a/densepose/modeling/predictors/cse_with_confidence.py b/densepose/modeling/predictors/cse_with_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ecef67ffb67cd0e64de73632eaede1d8f3c701
--- /dev/null
+++ b/densepose/modeling/predictors/cse_with_confidence.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from . import DensePoseEmbeddingConfidencePredictorMixin, DensePoseEmbeddingPredictor
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
+
+
+@DENSEPOSE_PREDICTOR_REGISTRY.register()
+class DensePoseEmbeddingWithConfidencePredictor(
+    DensePoseEmbeddingConfidencePredictorMixin, DensePoseEmbeddingPredictor
+):
+    """
+    Predictor that combines CSE and CSE confidence estimation
+    """
+
+    pass
diff --git a/densepose/modeling/predictors/registry.py b/densepose/modeling/predictors/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..f96901d3242fa8f3d35d053ed0bdd7649a045b88
--- /dev/null
+++ b/densepose/modeling/predictors/registry.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from detectron2.utils.registry import Registry
+
+DENSEPOSE_PREDICTOR_REGISTRY = Registry("DENSEPOSE_PREDICTOR")
diff --git a/densepose/modeling/roi_heads/__init__.py b/densepose/modeling/roi_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8403589f23ec2ffa8afafcd566ca0b0b7b2671a7
--- /dev/null
+++ b/densepose/modeling/roi_heads/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .v1convx import DensePoseV1ConvXHead
+from .deeplab import DensePoseDeepLabHead
+from .registry import ROI_DENSEPOSE_HEAD_REGISTRY
+from .roi_head import Decoder, DensePoseROIHeads
diff --git a/densepose/modeling/roi_heads/__pycache__/__init__.cpython-310.pyc b/densepose/modeling/roi_heads/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9a9181786f1d186c74e97b435367133183e45e6
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2c0ec6cae3993472218bb74451d623af38cfed2
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/deeplab.cpython-310.pyc b/densepose/modeling/roi_heads/__pycache__/deeplab.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ff1629b05c8ca1b250363345a2b55a4ba47f4e1
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/deeplab.cpython-310.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/deeplab.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/deeplab.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0566aa7b2f95f871d6016a30573ba367d21e870
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/deeplab.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/registry.cpython-310.pyc b/densepose/modeling/roi_heads/__pycache__/registry.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e4c67c53748ac3e1573e1de21e79107ea214ba7
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/registry.cpython-310.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/registry.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97423dfb2776802bbb5cd5c76e68438eb39e1c35
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/registry.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/roi_head.cpython-310.pyc b/densepose/modeling/roi_heads/__pycache__/roi_head.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fef0e0071c9680341265f9ea66079596ae162b95
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/roi_head.cpython-310.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/roi_head.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/roi_head.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cebb7654ebb729213c490ab9afc095d417e49d63
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/roi_head.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/v1convx.cpython-310.pyc b/densepose/modeling/roi_heads/__pycache__/v1convx.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c43ed52ddaa018e9cc37091e0adf70ac087db5a3
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/v1convx.cpython-310.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/v1convx.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/v1convx.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a51c1bef9cc583272d0e44c366a15eac81da32a9
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/v1convx.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/deeplab.py b/densepose/modeling/roi_heads/deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e5cb483037b302ff1fb2c305275a65e4ba4e941
--- /dev/null
+++ b/densepose/modeling/roi_heads/deeplab.py
@@ -0,0 +1,263 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.layers import Conv2d
+
+from .registry import ROI_DENSEPOSE_HEAD_REGISTRY
+
+
+@ROI_DENSEPOSE_HEAD_REGISTRY.register()
+class DensePoseDeepLabHead(nn.Module):
+    """
+    DensePose head using DeepLabV3 model from
+    "Rethinking Atrous Convolution for Semantic Image Segmentation"
+    <https://arxiv.org/abs/1706.05587>.
+    """
+
+    def __init__(self, cfg: CfgNode, input_channels: int):
+        super(DensePoseDeepLabHead, self).__init__()
+        # fmt: off
+        hidden_dim           = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
+        kernel_size          = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
+        norm                 = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM
+        self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
+        self.use_nonlocal    = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON
+        # fmt: on
+        pad_size = kernel_size // 2
+        n_channels = input_channels
+
+        self.ASPP = ASPP(input_channels, [6, 12, 56], n_channels)  # 6, 12, 56
+        self.add_module("ASPP", self.ASPP)
+
+        if self.use_nonlocal:
+            self.NLBlock = NONLocalBlock2D(input_channels, bn_layer=True)
+            self.add_module("NLBlock", self.NLBlock)
+        # weight_init.c2_msra_fill(self.ASPP)
+
+        for i in range(self.n_stacked_convs):
+            norm_module = nn.GroupNorm(32, hidden_dim) if norm == "GN" else None
+            layer = Conv2d(
+                n_channels,
+                hidden_dim,
+                kernel_size,
+                stride=1,
+                padding=pad_size,
+                bias=not norm,
+                norm=norm_module,
+            )
+            weight_init.c2_msra_fill(layer)
+            n_channels = hidden_dim
+            layer_name = self._get_layer_name(i)
+            self.add_module(layer_name, layer)
+        self.n_out_channels = hidden_dim
+        # initialize_module_params(self)
+
+    def forward(self, features):
+        x0 = features
+        x = self.ASPP(x0)
+        if self.use_nonlocal:
+            x = self.NLBlock(x)
+        output = x
+        for i in range(self.n_stacked_convs):
+            layer_name = self._get_layer_name(i)
+            x = getattr(self, layer_name)(x)
+            x = F.relu(x)
+            output = x
+        return output
+
+    def _get_layer_name(self, i: int):
+        layer_name = "body_conv_fcn{}".format(i + 1)
+        return layer_name
+
+
+# Copied from
+# https://github.com/pytorch/vision/blob/master/torchvision/models/segmentation/deeplabv3.py
+# See https://arxiv.org/pdf/1706.05587.pdf for details
+class ASPPConv(nn.Sequential):
+    def __init__(self, in_channels, out_channels, dilation):
+        modules = [
+            nn.Conv2d(
+                in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False
+            ),
+            nn.GroupNorm(32, out_channels),
+            nn.ReLU(),
+        ]
+        super(ASPPConv, self).__init__(*modules)
+
+
+class ASPPPooling(nn.Sequential):
+    def __init__(self, in_channels, out_channels):
+        super(ASPPPooling, self).__init__(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(in_channels, out_channels, 1, bias=False),
+            nn.GroupNorm(32, out_channels),
+            nn.ReLU(),
+        )
+
+    def forward(self, x):
+        size = x.shape[-2:]
+        x = super(ASPPPooling, self).forward(x)
+        return F.interpolate(x, size=size, mode="bilinear", align_corners=False)
+
+
+class ASPP(nn.Module):
+    def __init__(self, in_channels, atrous_rates, out_channels):
+        super(ASPP, self).__init__()
+        modules = []
+        modules.append(
+            nn.Sequential(
+                nn.Conv2d(in_channels, out_channels, 1, bias=False),
+                nn.GroupNorm(32, out_channels),
+                nn.ReLU(),
+            )
+        )
+
+        rate1, rate2, rate3 = tuple(atrous_rates)
+        modules.append(ASPPConv(in_channels, out_channels, rate1))
+        modules.append(ASPPConv(in_channels, out_channels, rate2))
+        modules.append(ASPPConv(in_channels, out_channels, rate3))
+        modules.append(ASPPPooling(in_channels, out_channels))
+
+        self.convs = nn.ModuleList(modules)
+
+        self.project = nn.Sequential(
+            nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
+            # nn.BatchNorm2d(out_channels),
+            nn.ReLU()
+            # nn.Dropout(0.5)
+        )
+
+    def forward(self, x):
+        res = []
+        for conv in self.convs:
+            res.append(conv(x))
+        res = torch.cat(res, dim=1)
+        return self.project(res)
+
+
+# copied from
+# https://github.com/AlexHex7/Non-local_pytorch/blob/master/lib/non_local_embedded_gaussian.py
+# See https://arxiv.org/abs/1711.07971 for details
+class _NonLocalBlockND(nn.Module):
+    def __init__(
+        self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True
+    ):
+        super(_NonLocalBlockND, self).__init__()
+
+        assert dimension in [1, 2, 3]
+
+        self.dimension = dimension
+        self.sub_sample = sub_sample
+
+        self.in_channels = in_channels
+        self.inter_channels = inter_channels
+
+        if self.inter_channels is None:
+            self.inter_channels = in_channels // 2
+            if self.inter_channels == 0:
+                self.inter_channels = 1
+
+        if dimension == 3:
+            conv_nd = nn.Conv3d
+            max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
+            bn = nn.GroupNorm  # (32, hidden_dim) #nn.BatchNorm3d
+        elif dimension == 2:
+            conv_nd = nn.Conv2d
+            max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
+            bn = nn.GroupNorm  # (32, hidden_dim)nn.BatchNorm2d
+        else:
+            conv_nd = nn.Conv1d
+            max_pool_layer = nn.MaxPool1d(kernel_size=2)
+            bn = nn.GroupNorm  # (32, hidden_dim)nn.BatchNorm1d
+
+        self.g = conv_nd(
+            in_channels=self.in_channels,
+            out_channels=self.inter_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+
+        if bn_layer:
+            self.W = nn.Sequential(
+                conv_nd(
+                    in_channels=self.inter_channels,
+                    out_channels=self.in_channels,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                ),
+                bn(32, self.in_channels),
+            )
+            nn.init.constant_(self.W[1].weight, 0)
+            nn.init.constant_(self.W[1].bias, 0)
+        else:
+            self.W = conv_nd(
+                in_channels=self.inter_channels,
+                out_channels=self.in_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+            nn.init.constant_(self.W.weight, 0)
+            nn.init.constant_(self.W.bias, 0)
+
+        self.theta = conv_nd(
+            in_channels=self.in_channels,
+            out_channels=self.inter_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.phi = conv_nd(
+            in_channels=self.in_channels,
+            out_channels=self.inter_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+
+        if sub_sample:
+            self.g = nn.Sequential(self.g, max_pool_layer)
+            self.phi = nn.Sequential(self.phi, max_pool_layer)
+
+    def forward(self, x):
+        """
+        :param x: (b, c, t, h, w)
+        :return:
+        """
+
+        batch_size = x.size(0)
+
+        g_x = self.g(x).view(batch_size, self.inter_channels, -1)
+        g_x = g_x.permute(0, 2, 1)
+
+        theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
+        theta_x = theta_x.permute(0, 2, 1)
+        phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
+        f = torch.matmul(theta_x, phi_x)
+        f_div_C = F.softmax(f, dim=-1)
+
+        y = torch.matmul(f_div_C, g_x)
+        y = y.permute(0, 2, 1).contiguous()
+        y = y.view(batch_size, self.inter_channels, *x.size()[2:])
+        W_y = self.W(y)
+        z = W_y + x
+
+        return z
+
+
+class NONLocalBlock2D(_NonLocalBlockND):
+    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
+        super(NONLocalBlock2D, self).__init__(
+            in_channels,
+            inter_channels=inter_channels,
+            dimension=2,
+            sub_sample=sub_sample,
+            bn_layer=bn_layer,
+        )
diff --git a/densepose/modeling/roi_heads/registry.py b/densepose/modeling/roi_heads/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1cea432f1fda3861266fa636d002667b3fb46a0
--- /dev/null
+++ b/densepose/modeling/roi_heads/registry.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from detectron2.utils.registry import Registry
+
+ROI_DENSEPOSE_HEAD_REGISTRY = Registry("ROI_DENSEPOSE_HEAD")
diff --git a/densepose/modeling/roi_heads/roi_head.py b/densepose/modeling/roi_heads/roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..aee645fde0d8321de9181a624a0c921b6dc167c4
--- /dev/null
+++ b/densepose/modeling/roi_heads/roi_head.py
@@ -0,0 +1,218 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import numpy as np
+from typing import Dict, List, Optional
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.modeling.roi_heads import select_foreground_proposals
+from detectron2.structures import ImageList, Instances
+
+from .. import (
+    build_densepose_data_filter,
+    build_densepose_embedder,
+    build_densepose_head,
+    build_densepose_losses,
+    build_densepose_predictor,
+    densepose_inference,
+)
+
+
+class Decoder(nn.Module):
+    """
+    A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper
+    (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from
+    all levels of the FPN into single output.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features):
+        super(Decoder, self).__init__()
+
+        # fmt: off
+        self.in_features      = in_features
+        feature_strides       = {k: v.stride for k, v in input_shape.items()}
+        feature_channels      = {k: v.channels for k, v in input_shape.items()}
+        num_classes           = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
+        conv_dims             = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
+        self.common_stride    = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
+        norm                  = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM
+        # fmt: on
+
+        self.scale_heads = []
+        for in_feature in self.in_features:
+            head_ops = []
+            head_length = max(
+                1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
+            )
+            for k in range(head_length):
+                conv = Conv2d(
+                    feature_channels[in_feature] if k == 0 else conv_dims,
+                    conv_dims,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=not norm,
+                    norm=get_norm(norm, conv_dims),
+                    activation=F.relu,
+                )
+                weight_init.c2_msra_fill(conv)
+                head_ops.append(conv)
+                if feature_strides[in_feature] != self.common_stride:
+                    head_ops.append(
+                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+                    )
+            self.scale_heads.append(nn.Sequential(*head_ops))
+            self.add_module(in_feature, self.scale_heads[-1])
+        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
+        weight_init.c2_msra_fill(self.predictor)
+
+    def forward(self, features: List[torch.Tensor]):
+        for i, _ in enumerate(self.in_features):
+            if i == 0:
+                x = self.scale_heads[i](features[i])
+            else:
+                x = x + self.scale_heads[i](features[i])
+        x = self.predictor(x)
+        return x
+
+
+@ROI_HEADS_REGISTRY.register()
+class DensePoseROIHeads(StandardROIHeads):
+    """
+    A Standard ROIHeads which contains an addition of DensePose head.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super().__init__(cfg, input_shape)
+        self._init_densepose_head(cfg, input_shape)
+
+    def _init_densepose_head(self, cfg, input_shape):
+        # fmt: off
+        self.densepose_on          = cfg.MODEL.DENSEPOSE_ON
+        if not self.densepose_on:
+            return
+        self.densepose_data_filter = build_densepose_data_filter(cfg)
+        dp_pooler_resolution       = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION
+        dp_pooler_sampling_ratio   = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO
+        dp_pooler_type             = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE
+        self.use_decoder           = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON
+        # fmt: on
+        if self.use_decoder:
+            dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,)
+        else:
+            dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features)
+        in_channels = [input_shape[f].channels for f in self.in_features][0]
+
+        if self.use_decoder:
+            self.decoder = Decoder(cfg, input_shape, self.in_features)
+
+        self.densepose_pooler = ROIPooler(
+            output_size=dp_pooler_resolution,
+            scales=dp_pooler_scales,
+            sampling_ratio=dp_pooler_sampling_ratio,
+            pooler_type=dp_pooler_type,
+        )
+        self.densepose_head = build_densepose_head(cfg, in_channels)
+        self.densepose_predictor = build_densepose_predictor(
+            cfg, self.densepose_head.n_out_channels
+        )
+        self.densepose_losses = build_densepose_losses(cfg)
+        self.embedder = build_densepose_embedder(cfg)
+
+    def _forward_densepose(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
+        """
+        Forward logic of the densepose prediction branch.
+
+        Args:
+            features (dict[str, Tensor]): input data as a mapping from feature
+                map name to tensor. Axis 0 represents the number of images `N` in
+                the input data; axes 1-3 are channels, height, and width, which may
+                vary between feature maps (e.g., if a feature pyramid is used).
+            instances (list[Instances]): length `N` list of `Instances`. The i-th
+                `Instances` contains instances for the i-th input image,
+                In training, they can be the proposals.
+                In inference, they can be the predicted boxes.
+
+        Returns:
+            In training, a dict of losses.
+            In inference, update `instances` with new fields "densepose" and return it.
+        """
+        if not self.densepose_on:
+            return {} if self.training else instances
+
+        features_list = [features[f] for f in self.in_features]
+        if self.training:
+            proposals, _ = select_foreground_proposals(instances, self.num_classes)
+            features_list, proposals = self.densepose_data_filter(features_list, proposals)
+            if len(proposals) > 0:
+                proposal_boxes = [x.proposal_boxes for x in proposals]
+
+                if self.use_decoder:
+                    features_list = [self.decoder(features_list)]
+
+                features_dp = self.densepose_pooler(features_list, proposal_boxes)
+                densepose_head_outputs = self.densepose_head(features_dp)
+                densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
+                densepose_loss_dict = self.densepose_losses(
+                    proposals, densepose_predictor_outputs, embedder=self.embedder
+                )
+                return densepose_loss_dict
+        else:
+            pred_boxes = [x.pred_boxes for x in instances]
+
+            if self.use_decoder:
+                features_list = [self.decoder(features_list)]
+
+            features_dp = self.densepose_pooler(features_list, pred_boxes)
+            if len(features_dp) > 0:
+                densepose_head_outputs = self.densepose_head(features_dp)
+                densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
+            else:
+                densepose_predictor_outputs = None
+
+            densepose_inference(densepose_predictor_outputs, instances)
+            return instances
+
+    def forward(
+        self,
+        images: ImageList,
+        features: Dict[str, torch.Tensor],
+        proposals: List[Instances],
+        targets: Optional[List[Instances]] = None,
+    ):
+        instances, losses = super().forward(images, features, proposals, targets)
+        del targets, images
+
+        if self.training:
+            losses.update(self._forward_densepose(features, instances))
+        return instances, losses
+
+    def forward_with_given_boxes(
+        self, features: Dict[str, torch.Tensor], instances: List[Instances]
+    ):
+        """
+        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
+
+        This is useful for downstream tasks where a box is known, but need to obtain
+        other attributes (outputs of other heads).
+        Test-time augmentation also uses this.
+
+        Args:
+            features: same as in `forward()`
+            instances (list[Instances]): instances to predict other outputs. Expect the keys
+                "pred_boxes" and "pred_classes" to exist.
+
+        Returns:
+            instances (list[Instances]):
+                the same `Instances` objects, with extra
+                fields such as `pred_masks` or `pred_keypoints`.
+        """
+
+        instances = super().forward_with_given_boxes(features, instances)
+        instances = self._forward_densepose(features, instances)
+        return instances
diff --git a/densepose/modeling/roi_heads/v1convx.py b/densepose/modeling/roi_heads/v1convx.py
new file mode 100644
index 0000000000000000000000000000000000000000..df79f658d8f7149e44aa1a31072adc4dadd89a48
--- /dev/null
+++ b/densepose/modeling/roi_heads/v1convx.py
@@ -0,0 +1,64 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.layers import Conv2d
+
+from ..utils import initialize_module_params
+from .registry import ROI_DENSEPOSE_HEAD_REGISTRY
+
+
+@ROI_DENSEPOSE_HEAD_REGISTRY.register()
+class DensePoseV1ConvXHead(nn.Module):
+    """
+    Fully convolutional DensePose head.
+    """
+
+    def __init__(self, cfg: CfgNode, input_channels: int):
+        """
+        Initialize DensePose fully convolutional head
+
+        Args:
+            cfg (CfgNode): configuration options
+            input_channels (int): number of input channels
+        """
+        super(DensePoseV1ConvXHead, self).__init__()
+        # fmt: off
+        hidden_dim           = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
+        kernel_size          = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
+        self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
+        # fmt: on
+        pad_size = kernel_size // 2
+        n_channels = input_channels
+        for i in range(self.n_stacked_convs):
+            layer = Conv2d(n_channels, hidden_dim, kernel_size, stride=1, padding=pad_size)
+            layer_name = self._get_layer_name(i)
+            self.add_module(layer_name, layer)
+            n_channels = hidden_dim
+        self.n_out_channels = n_channels
+        initialize_module_params(self)
+
+    def forward(self, features: torch.Tensor):
+        """
+        Apply DensePose fully convolutional head to the input features
+
+        Args:
+            features (tensor): input features
+        Result:
+            A tensor of DensePose head outputs
+        """
+        x = features
+        output = x
+        for i in range(self.n_stacked_convs):
+            layer_name = self._get_layer_name(i)
+            x = getattr(self, layer_name)(x)
+            x = F.relu(x)
+            output = x
+        return output
+
+    def _get_layer_name(self, i: int):
+        layer_name = "body_conv_fcn{}".format(i + 1)
+        return layer_name
diff --git a/densepose/modeling/test_time_augmentation.py b/densepose/modeling/test_time_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec2022ed16727f538993d2c7db60a60a1183b90d
--- /dev/null
+++ b/densepose/modeling/test_time_augmentation.py
@@ -0,0 +1,207 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import numpy as np
+import torch
+from fvcore.transforms import HFlipTransform, TransformList
+from torch.nn import functional as F
+
+from detectron2.data.transforms import RandomRotation, RotationTransform, apply_transform_gens
+from detectron2.modeling.postprocessing import detector_postprocess
+from detectron2.modeling.test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA
+
+from ..converters import HFlipConverter
+
+
+class DensePoseDatasetMapperTTA(DatasetMapperTTA):
+    def __init__(self, cfg):
+        super().__init__(cfg=cfg)
+        self.angles = cfg.TEST.AUG.ROTATION_ANGLES
+
+    def __call__(self, dataset_dict):
+        ret = super().__call__(dataset_dict=dataset_dict)
+        numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy()
+        for angle in self.angles:
+            rotate = RandomRotation(angle=angle, expand=True)
+            new_numpy_image, tfms = apply_transform_gens([rotate], np.copy(numpy_image))
+            torch_image = torch.from_numpy(np.ascontiguousarray(new_numpy_image.transpose(2, 0, 1)))
+            dic = copy.deepcopy(dataset_dict)
+            # In DatasetMapperTTA, there is a pre_tfm transform (resize or no-op) that is
+            # added at the beginning of each TransformList. That's '.transforms[0]'.
+            dic["transforms"] = TransformList(
+                [ret[-1]["transforms"].transforms[0]] + tfms.transforms
+            )
+            dic["image"] = torch_image
+            ret.append(dic)
+        return ret
+
+
+class DensePoseGeneralizedRCNNWithTTA(GeneralizedRCNNWithTTA):
+    def __init__(self, cfg, model, transform_data, tta_mapper=None, batch_size=1):
+        """
+        Args:
+            cfg (CfgNode):
+            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
+            transform_data (DensePoseTransformData): contains symmetry label
+                transforms used for horizontal flip
+            tta_mapper (callable): takes a dataset dict and returns a list of
+                augmented versions of the dataset dict. Defaults to
+                `DatasetMapperTTA(cfg)`.
+            batch_size (int): batch the augmented images into this batch size for inference.
+        """
+        self._transform_data = transform_data.to(model.device)
+        super().__init__(cfg=cfg, model=model, tta_mapper=tta_mapper, batch_size=batch_size)
+
+    # the implementation follows closely the one from detectron2/modeling
+    def _inference_one_image(self, input):
+        """
+        Args:
+            input (dict): one dataset dict with "image" field being a CHW tensor
+
+        Returns:
+            dict: one output dict
+        """
+        orig_shape = (input["height"], input["width"])
+        # For some reason, resize with uint8 slightly increases box AP but decreases densepose AP
+        input["image"] = input["image"].to(torch.uint8)
+        augmented_inputs, tfms = self._get_augmented_inputs(input)
+        # Detect boxes from all augmented versions
+        with self._turn_off_roi_heads(["mask_on", "keypoint_on", "densepose_on"]):
+            # temporarily disable roi heads
+            all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
+        merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)
+
+        if self.cfg.MODEL.MASK_ON or self.cfg.MODEL.DENSEPOSE_ON:
+            # Use the detected boxes to obtain new fields
+            augmented_instances = self._rescale_detected_boxes(
+                augmented_inputs, merged_instances, tfms
+            )
+            # run forward on the detected boxes
+            outputs = self._batch_inference(augmented_inputs, augmented_instances)
+            # Delete now useless variables to avoid being out of memory
+            del augmented_inputs, augmented_instances
+            # average the predictions
+            if self.cfg.MODEL.MASK_ON:
+                merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
+            if self.cfg.MODEL.DENSEPOSE_ON:
+                merged_instances.pred_densepose = self._reduce_pred_densepose(outputs, tfms)
+            # postprocess
+            merged_instances = detector_postprocess(merged_instances, *orig_shape)
+            return {"instances": merged_instances}
+        else:
+            return {"instances": merged_instances}
+
+    def _get_augmented_boxes(self, augmented_inputs, tfms):
+        # Heavily based on detectron2/modeling/test_time_augmentation.py
+        # Only difference is that RotationTransform is excluded from bbox computation
+        # 1: forward with all augmented images
+        outputs = self._batch_inference(augmented_inputs)
+        # 2: union the results
+        all_boxes = []
+        all_scores = []
+        all_classes = []
+        for output, tfm in zip(outputs, tfms):
+            # Need to inverse the transforms on boxes, to obtain results on original image
+            if not any(isinstance(t, RotationTransform) for t in tfm.transforms):
+                # Some transforms can't compute bbox correctly
+                pred_boxes = output.pred_boxes.tensor
+                original_pred_boxes = tfm.inverse().apply_box(pred_boxes.cpu().numpy())
+                all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device))
+                all_scores.extend(output.scores)
+                all_classes.extend(output.pred_classes)
+        all_boxes = torch.cat(all_boxes, dim=0)
+        return all_boxes, all_scores, all_classes
+
+    def _reduce_pred_densepose(self, outputs, tfms):
+        # Should apply inverse transforms on densepose preds.
+        # We assume only rotation, resize & flip are used. pred_masks is a scale-invariant
+        # representation, so we handle the other ones specially
+        for idx, (output, tfm) in enumerate(zip(outputs, tfms)):
+            for t in tfm.transforms:
+                for attr in ["coarse_segm", "fine_segm", "u", "v"]:
+                    setattr(
+                        output.pred_densepose,
+                        attr,
+                        _inverse_rotation(
+                            getattr(output.pred_densepose, attr), output.pred_boxes.tensor, t
+                        ),
+                    )
+            if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
+                output.pred_densepose = HFlipConverter.convert(
+                    output.pred_densepose, self._transform_data
+                )
+            self._incremental_avg_dp(outputs[0].pred_densepose, output.pred_densepose, idx)
+        return outputs[0].pred_densepose
+
+    # incrementally computed average: u_(n + 1) = u_n + (x_(n+1) - u_n) / (n + 1).
+    def _incremental_avg_dp(self, avg, new_el, idx):
+        for attr in ["coarse_segm", "fine_segm", "u", "v"]:
+            setattr(avg, attr, (getattr(avg, attr) * idx + getattr(new_el, attr)) / (idx + 1))
+            if idx:
+                # Deletion of the > 0 index intermediary values to prevent GPU OOM
+                setattr(new_el, attr, None)
+        return avg
+
+
+def _inverse_rotation(densepose_attrs, boxes, transform):
+    # resample outputs to image size and rotate back the densepose preds
+    # on the rotated images to the space of the original image
+    if len(boxes) == 0 or not isinstance(transform, RotationTransform):
+        return densepose_attrs
+    boxes = boxes.int().cpu().numpy()
+    wh_boxes = boxes[:, 2:] - boxes[:, :2]  # bboxes in the rotated space
+    inv_boxes = rotate_box_inverse(transform, boxes).astype(int)  # bboxes in original image
+    wh_diff = (inv_boxes[:, 2:] - inv_boxes[:, :2] - wh_boxes) // 2  # diff between new/old bboxes
+    rotation_matrix = torch.tensor([transform.rm_image]).to(device=densepose_attrs.device).float()
+    rotation_matrix[:, :, -1] = 0
+    # To apply grid_sample for rotation, we need to have enough space to fit the original and
+    # rotated bboxes. l_bds and r_bds are the left/right bounds that will be used to
+    # crop the difference once the rotation is done
+    l_bds = np.maximum(0, -wh_diff)
+    for i in range(len(densepose_attrs)):
+        if min(wh_boxes[i]) <= 0:
+            continue
+        densepose_attr = densepose_attrs[[i]].clone()
+        # 1. Interpolate densepose attribute to size of the rotated bbox
+        densepose_attr = F.interpolate(densepose_attr, wh_boxes[i].tolist()[::-1], mode="bilinear")
+        # 2. Pad the interpolated attribute so it has room for the original + rotated bbox
+        densepose_attr = F.pad(densepose_attr, tuple(np.repeat(np.maximum(0, wh_diff[i]), 2)))
+        # 3. Compute rotation grid and transform
+        grid = F.affine_grid(rotation_matrix, size=densepose_attr.shape)
+        densepose_attr = F.grid_sample(densepose_attr, grid)
+        # 4. Compute right bounds and crop the densepose_attr to the size of the original bbox
+        r_bds = densepose_attr.shape[2:][::-1] - l_bds[i]
+        densepose_attr = densepose_attr[:, :, l_bds[i][1] : r_bds[1], l_bds[i][0] : r_bds[0]]
+        if min(densepose_attr.shape) > 0:
+            # Interpolate back to the original size of the densepose attribute
+            densepose_attr = F.interpolate(
+                densepose_attr, densepose_attrs.shape[-2:], mode="bilinear"
+            )
+            # Adding a very small probability to the background class to fill padded zones
+            densepose_attr[:, 0] += 1e-10
+            densepose_attrs[i] = densepose_attr
+    return densepose_attrs
+
+
+def rotate_box_inverse(rot_tfm, rotated_box):
+    """
+    rotated_box is a N * 4 array of [x0, y0, x1, y1] boxes
+    When a bbox is rotated, it gets bigger, because we need to surround the tilted bbox
+    So when a bbox is rotated then inverse-rotated, it is much bigger than the original
+    This function aims to invert the rotation on the box, but also resize it to its original size
+    """
+    # 1. Compute the inverse rotation of the rotated bboxes (bigger than it )
+    invrot_box = rot_tfm.inverse().apply_box(rotated_box)
+    h, w = rotated_box[:, 3] - rotated_box[:, 1], rotated_box[:, 2] - rotated_box[:, 0]
+    ih, iw = invrot_box[:, 3] - invrot_box[:, 1], invrot_box[:, 2] - invrot_box[:, 0]
+    assert 2 * rot_tfm.abs_sin**2 != 1, "45 degrees angle can't be inverted"
+    # 2. Inverse the corresponding computation in the rotation transform
+    # to get the original height/width of the rotated boxes
+    orig_h = (h * rot_tfm.abs_cos - w * rot_tfm.abs_sin) / (1 - 2 * rot_tfm.abs_sin**2)
+    orig_w = (w * rot_tfm.abs_cos - h * rot_tfm.abs_sin) / (1 - 2 * rot_tfm.abs_sin**2)
+    # 3. Resize the inverse-rotated bboxes to their original size
+    invrot_box[:, 0] += (iw - orig_w) / 2
+    invrot_box[:, 1] += (ih - orig_h) / 2
+    invrot_box[:, 2] -= (iw - orig_w) / 2
+    invrot_box[:, 3] -= (ih - orig_h) / 2
+
+    return invrot_box
diff --git a/densepose/modeling/utils.py b/densepose/modeling/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e76eb9535a68dcb4ccb065556c55289294e42c8
--- /dev/null
+++ b/densepose/modeling/utils.py
@@ -0,0 +1,11 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from torch import nn
+
+
+def initialize_module_params(module: nn.Module) -> None:
+    for name, param in module.named_parameters():
+        if "bias" in name:
+            nn.init.constant_(param, 0)
+        elif "weight" in name:
+            nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
diff --git a/densepose/structures/__init__.py b/densepose/structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed32c5e9d6c4c1599ba960681d9e86889e2cdbd8
--- /dev/null
+++ b/densepose/structures/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .chart import DensePoseChartPredictorOutput
+from .chart_confidence import decorate_predictor_output_class_with_confidences
+from .cse_confidence import decorate_cse_predictor_output_class_with_confidences
+from .chart_result import (
+    DensePoseChartResult,
+    DensePoseChartResultWithConfidences,
+    quantize_densepose_chart_result,
+    compress_quantized_densepose_chart_result,
+    decompress_compressed_densepose_chart_result,
+)
+from .cse import DensePoseEmbeddingPredictorOutput
+from .data_relative import DensePoseDataRelative
+from .list import DensePoseList
+from .mesh import Mesh, create_mesh
+from .transform_data import DensePoseTransformData, normalized_coords_transform
diff --git a/densepose/structures/__pycache__/__init__.cpython-310.pyc b/densepose/structures/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e75ff5ee693d336cd76848bfcaa09a60797620a
Binary files /dev/null and b/densepose/structures/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/__init__.cpython-39.pyc b/densepose/structures/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a50efb1d7ea0fdd800140e96362e1dd82e40290
Binary files /dev/null and b/densepose/structures/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/chart.cpython-310.pyc b/densepose/structures/__pycache__/chart.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e96123acae236fe682f0d76193b8fc0f50573e5
Binary files /dev/null and b/densepose/structures/__pycache__/chart.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/chart.cpython-39.pyc b/densepose/structures/__pycache__/chart.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2e9274d14d8b36263842c91f9e229a136bcbdb5
Binary files /dev/null and b/densepose/structures/__pycache__/chart.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/chart_confidence.cpython-310.pyc b/densepose/structures/__pycache__/chart_confidence.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a19f1fff5d9226b98b232fee647134f3876f1895
Binary files /dev/null and b/densepose/structures/__pycache__/chart_confidence.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/chart_confidence.cpython-39.pyc b/densepose/structures/__pycache__/chart_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf62a9d26c74411ca5dfa496bba715dfb847078e
Binary files /dev/null and b/densepose/structures/__pycache__/chart_confidence.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/chart_result.cpython-310.pyc b/densepose/structures/__pycache__/chart_result.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03acf97a184f9f3c1267112c45642b8dfcbc3cb5
Binary files /dev/null and b/densepose/structures/__pycache__/chart_result.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/chart_result.cpython-39.pyc b/densepose/structures/__pycache__/chart_result.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ad563b0f6e2e259595159743b3b4273b50f5af3
Binary files /dev/null and b/densepose/structures/__pycache__/chart_result.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/cse.cpython-310.pyc b/densepose/structures/__pycache__/cse.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85a623bd3819a7191a60593e16a1b4feaf643808
Binary files /dev/null and b/densepose/structures/__pycache__/cse.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/cse.cpython-39.pyc b/densepose/structures/__pycache__/cse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a05f2296a34cb6ce62db8431278bcd08937fe77
Binary files /dev/null and b/densepose/structures/__pycache__/cse.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/cse_confidence.cpython-310.pyc b/densepose/structures/__pycache__/cse_confidence.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..081b46dd9462526d93f2001ada2dd2049df0f254
Binary files /dev/null and b/densepose/structures/__pycache__/cse_confidence.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/cse_confidence.cpython-39.pyc b/densepose/structures/__pycache__/cse_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..577cf5b3bec01850a3868e136d7a646187a18456
Binary files /dev/null and b/densepose/structures/__pycache__/cse_confidence.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/data_relative.cpython-310.pyc b/densepose/structures/__pycache__/data_relative.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7c37333e755ef9d35f697ebe23ae131e2824199
Binary files /dev/null and b/densepose/structures/__pycache__/data_relative.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/data_relative.cpython-39.pyc b/densepose/structures/__pycache__/data_relative.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7183b1fcf9b26c37fdf86ac0304742b1b9635949
Binary files /dev/null and b/densepose/structures/__pycache__/data_relative.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/list.cpython-310.pyc b/densepose/structures/__pycache__/list.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dfa003a523f87a6dfa8919f36a9625e7c13f3a88
Binary files /dev/null and b/densepose/structures/__pycache__/list.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/list.cpython-39.pyc b/densepose/structures/__pycache__/list.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca40953ffc7c42219bd0171b52ce1abc1a46671f
Binary files /dev/null and b/densepose/structures/__pycache__/list.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/mesh.cpython-310.pyc b/densepose/structures/__pycache__/mesh.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..268415ccdabff693b64ce6a691877343cce28301
Binary files /dev/null and b/densepose/structures/__pycache__/mesh.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/mesh.cpython-39.pyc b/densepose/structures/__pycache__/mesh.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5335bd1ec4778d497812811aca681a6008edd59c
Binary files /dev/null and b/densepose/structures/__pycache__/mesh.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/transform_data.cpython-310.pyc b/densepose/structures/__pycache__/transform_data.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9720799a8edaa14344a24f624c61a8c42b2abb06
Binary files /dev/null and b/densepose/structures/__pycache__/transform_data.cpython-310.pyc differ
diff --git a/densepose/structures/__pycache__/transform_data.cpython-39.pyc b/densepose/structures/__pycache__/transform_data.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7aeb53887d08f298368021782fc193bd496a01d3
Binary files /dev/null and b/densepose/structures/__pycache__/transform_data.cpython-39.pyc differ
diff --git a/densepose/structures/chart.py b/densepose/structures/chart.py
new file mode 100644
index 0000000000000000000000000000000000000000..115cc084e98115c537382494af9eb0e246cd375b
--- /dev/null
+++ b/densepose/structures/chart.py
@@ -0,0 +1,70 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from dataclasses import dataclass
+from typing import Union
+import torch
+
+
+@dataclass
+class DensePoseChartPredictorOutput:
+    """
+    Predictor output that contains segmentation and inner coordinates predictions for predefined
+    body parts:
+     * coarse segmentation, a tensor of shape [N, K, Hout, Wout]
+     * fine segmentation, a tensor of shape [N, C, Hout, Wout]
+     * U coordinates, a tensor of shape [N, C, Hout, Wout]
+     * V coordinates, a tensor of shape [N, C, Hout, Wout]
+    where
+     - N is the number of instances
+     - K is the number of coarse segmentation channels (
+         2 = foreground / background,
+         15 = one of 14 body parts / background)
+     - C is the number of fine segmentation channels (
+         24 fine body parts / background)
+     - Hout and Wout are height and width of predictions
+    """
+
+    coarse_segm: torch.Tensor
+    fine_segm: torch.Tensor
+    u: torch.Tensor
+    v: torch.Tensor
+
+    def __len__(self):
+        """
+        Number of instances (N) in the output
+        """
+        return self.coarse_segm.size(0)
+
+    def __getitem__(
+        self, item: Union[int, slice, torch.BoolTensor]
+    ) -> "DensePoseChartPredictorOutput":
+        """
+        Get outputs for the selected instance(s)
+
+        Args:
+            item (int or slice or tensor): selected items
+        """
+        if isinstance(item, int):
+            return DensePoseChartPredictorOutput(
+                coarse_segm=self.coarse_segm[item].unsqueeze(0),
+                fine_segm=self.fine_segm[item].unsqueeze(0),
+                u=self.u[item].unsqueeze(0),
+                v=self.v[item].unsqueeze(0),
+            )
+        else:
+            return DensePoseChartPredictorOutput(
+                coarse_segm=self.coarse_segm[item],
+                fine_segm=self.fine_segm[item],
+                u=self.u[item],
+                v=self.v[item],
+            )
+
+    def to(self, device: torch.device):
+        """
+        Transfers all tensors to the given device
+        """
+        coarse_segm = self.coarse_segm.to(device)
+        fine_segm = self.fine_segm.to(device)
+        u = self.u.to(device)
+        v = self.v.to(device)
+        return DensePoseChartPredictorOutput(coarse_segm=coarse_segm, fine_segm=fine_segm, u=u, v=v)
diff --git a/densepose/structures/chart_confidence.py b/densepose/structures/chart_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..57c63257a7c176af1522e2f143ed594c26906c76
--- /dev/null
+++ b/densepose/structures/chart_confidence.py
@@ -0,0 +1,98 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from dataclasses import make_dataclass
+from functools import lru_cache
+from typing import Any, Optional
+import torch
+
+
+@lru_cache(maxsize=None)
+def decorate_predictor_output_class_with_confidences(BasePredictorOutput: type) -> type:
+    """
+    Create a new output class from an existing one by adding new attributes
+    related to confidence estimation:
+    - sigma_1 (tensor)
+    - sigma_2 (tensor)
+    - kappa_u (tensor)
+    - kappa_v (tensor)
+    - fine_segm_confidence (tensor)
+    - coarse_segm_confidence (tensor)
+
+    Details on confidence estimation parameters can be found in:
+    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+        Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+    A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
+
+    The new class inherits the provided `BasePredictorOutput` class,
+    it's name is composed of the name of the provided class and
+    "WithConfidences" suffix.
+
+    Args:
+        BasePredictorOutput (type): output type to which confidence data
+            is to be added, assumed to be a dataclass
+    Return:
+        New dataclass derived from the provided one that has attributes
+        for confidence estimation
+    """
+
+    PredictorOutput = make_dataclass(
+        BasePredictorOutput.__name__ + "WithConfidences",
+        fields=[
+            ("sigma_1", Optional[torch.Tensor], None),
+            ("sigma_2", Optional[torch.Tensor], None),
+            ("kappa_u", Optional[torch.Tensor], None),
+            ("kappa_v", Optional[torch.Tensor], None),
+            ("fine_segm_confidence", Optional[torch.Tensor], None),
+            ("coarse_segm_confidence", Optional[torch.Tensor], None),
+        ],
+        bases=(BasePredictorOutput,),
+    )
+
+    # add possibility to index PredictorOutput
+
+    def slice_if_not_none(data, item):
+        if data is None:
+            return None
+        if isinstance(item, int):
+            return data[item].unsqueeze(0)
+        return data[item]
+
+    def PredictorOutput_getitem(self, item):
+        PredictorOutput = type(self)
+        base_predictor_output_sliced = super(PredictorOutput, self).__getitem__(item)
+        return PredictorOutput(
+            **base_predictor_output_sliced.__dict__,
+            coarse_segm_confidence=slice_if_not_none(self.coarse_segm_confidence, item),
+            fine_segm_confidence=slice_if_not_none(self.fine_segm_confidence, item),
+            sigma_1=slice_if_not_none(self.sigma_1, item),
+            sigma_2=slice_if_not_none(self.sigma_2, item),
+            kappa_u=slice_if_not_none(self.kappa_u, item),
+            kappa_v=slice_if_not_none(self.kappa_v, item),
+        )
+
+    PredictorOutput.__getitem__ = PredictorOutput_getitem
+
+    def PredictorOutput_to(self, device: torch.device):
+        """
+        Transfers all tensors to the given device
+        """
+        PredictorOutput = type(self)
+        base_predictor_output_to = super(PredictorOutput, self).to(device)  # pyre-ignore[16]
+
+        def to_device_if_tensor(var: Any):
+            if isinstance(var, torch.Tensor):
+                return var.to(device)
+            return var
+
+        return PredictorOutput(
+            **base_predictor_output_to.__dict__,
+            sigma_1=to_device_if_tensor(self.sigma_1),
+            sigma_2=to_device_if_tensor(self.sigma_2),
+            kappa_u=to_device_if_tensor(self.kappa_u),
+            kappa_v=to_device_if_tensor(self.kappa_v),
+            fine_segm_confidence=to_device_if_tensor(self.fine_segm_confidence),
+            coarse_segm_confidence=to_device_if_tensor(self.coarse_segm_confidence),
+        )
+
+    PredictorOutput.to = PredictorOutput_to
+    return PredictorOutput
diff --git a/densepose/structures/chart_result.py b/densepose/structures/chart_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..003933d03d153d045c0bf551c465bc7a224d90cb
--- /dev/null
+++ b/densepose/structures/chart_result.py
@@ -0,0 +1,183 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from dataclasses import dataclass
+from typing import Any, Optional, Tuple
+import torch
+
+
+@dataclass
+class DensePoseChartResult:
+    """
+    DensePose results for chart-based methods represented by labels and inner
+    coordinates (U, V) of individual charts. Each chart is a 2D manifold
+    that has an associated label and is parameterized by two coordinates U and V.
+    Both U and V take values in [0, 1].
+    Thus the results are represented by two tensors:
+    - labels (tensor [H, W] of long): contains estimated label for each pixel of
+        the detection bounding box of size (H, W)
+    - uv (tensor [2, H, W] of float): contains estimated U and V coordinates
+        for each pixel of the detection bounding box of size (H, W)
+    """
+
+    labels: torch.Tensor
+    uv: torch.Tensor
+
+    def to(self, device: torch.device):
+        """
+        Transfers all tensors to the given device
+        """
+        labels = self.labels.to(device)
+        uv = self.uv.to(device)
+        return DensePoseChartResult(labels=labels, uv=uv)
+
+
+@dataclass
+class DensePoseChartResultWithConfidences:
+    """
+    We add confidence values to DensePoseChartResult
+    Thus the results are represented by two tensors:
+    - labels (tensor [H, W] of long): contains estimated label for each pixel of
+        the detection bounding box of size (H, W)
+    - uv (tensor [2, H, W] of float): contains estimated U and V coordinates
+        for each pixel of the detection bounding box of size (H, W)
+    Plus one [H, W] tensor of float for each confidence type
+    """
+
+    labels: torch.Tensor
+    uv: torch.Tensor
+    sigma_1: Optional[torch.Tensor] = None
+    sigma_2: Optional[torch.Tensor] = None
+    kappa_u: Optional[torch.Tensor] = None
+    kappa_v: Optional[torch.Tensor] = None
+    fine_segm_confidence: Optional[torch.Tensor] = None
+    coarse_segm_confidence: Optional[torch.Tensor] = None
+
+    def to(self, device: torch.device):
+        """
+        Transfers all tensors to the given device, except if their value is None
+        """
+
+        def to_device_if_tensor(var: Any):
+            if isinstance(var, torch.Tensor):
+                return var.to(device)
+            return var
+
+        return DensePoseChartResultWithConfidences(
+            labels=self.labels.to(device),
+            uv=self.uv.to(device),
+            sigma_1=to_device_if_tensor(self.sigma_1),
+            sigma_2=to_device_if_tensor(self.sigma_2),
+            kappa_u=to_device_if_tensor(self.kappa_u),
+            kappa_v=to_device_if_tensor(self.kappa_v),
+            fine_segm_confidence=to_device_if_tensor(self.fine_segm_confidence),
+            coarse_segm_confidence=to_device_if_tensor(self.coarse_segm_confidence),
+        )
+
+
+@dataclass
+class DensePoseChartResultQuantized:
+    """
+    DensePose results for chart-based methods represented by labels and quantized
+    inner coordinates (U, V) of individual charts. Each chart is a 2D manifold
+    that has an associated label and is parameterized by two coordinates U and V.
+    Both U and V take values in [0, 1].
+    Quantized coordinates Uq and Vq have uint8 values which are obtained as:
+      Uq = U * 255 (hence 0 <= Uq <= 255)
+      Vq = V * 255 (hence 0 <= Vq <= 255)
+    Thus the results are represented by one tensor:
+    - labels_uv_uint8 (tensor [3, H, W] of uint8): contains estimated label
+        and quantized coordinates Uq and Vq for each pixel of the detection
+        bounding box of size (H, W)
+    """
+
+    labels_uv_uint8: torch.Tensor
+
+    def to(self, device: torch.device):
+        """
+        Transfers all tensors to the given device
+        """
+        labels_uv_uint8 = self.labels_uv_uint8.to(device)
+        return DensePoseChartResultQuantized(labels_uv_uint8=labels_uv_uint8)
+
+
+@dataclass
+class DensePoseChartResultCompressed:
+    """
+    DensePose results for chart-based methods represented by a PNG-encoded string.
+    The tensor of quantized DensePose results of size [3, H, W] is considered
+    as an image with 3 color channels. PNG compression is applied and the result
+    is stored as a Base64-encoded string. The following attributes are defined:
+    - shape_chw (tuple of 3 int): contains shape of the result tensor
+        (number of channels, height, width)
+    - labels_uv_str (str): contains Base64-encoded results tensor of size
+        [3, H, W] compressed with PNG compression methods
+    """
+
+    shape_chw: Tuple[int, int, int]
+    labels_uv_str: str
+
+
+def quantize_densepose_chart_result(result: DensePoseChartResult) -> DensePoseChartResultQuantized:
+    """
+    Applies quantization to DensePose chart-based result.
+
+    Args:
+        result (DensePoseChartResult): DensePose chart-based result
+    Return:
+        Quantized DensePose chart-based result (DensePoseChartResultQuantized)
+    """
+    h, w = result.labels.shape
+    labels_uv_uint8 = torch.zeros([3, h, w], dtype=torch.uint8, device=result.labels.device)
+    labels_uv_uint8[0] = result.labels
+    labels_uv_uint8[1:] = (result.uv * 255).clamp(0, 255).byte()
+    return DensePoseChartResultQuantized(labels_uv_uint8=labels_uv_uint8)
+
+
+def compress_quantized_densepose_chart_result(
+    result: DensePoseChartResultQuantized,
+) -> DensePoseChartResultCompressed:
+    """
+    Compresses quantized DensePose chart-based result
+
+    Args:
+        result (DensePoseChartResultQuantized): quantized DensePose chart-based result
+    Return:
+        Compressed DensePose chart-based result (DensePoseChartResultCompressed)
+    """
+    import base64
+    import numpy as np
+    from io import BytesIO
+    from PIL import Image
+
+    labels_uv_uint8_np_chw = result.labels_uv_uint8.cpu().numpy()
+    labels_uv_uint8_np_hwc = np.moveaxis(labels_uv_uint8_np_chw, 0, -1)
+    im = Image.fromarray(labels_uv_uint8_np_hwc)
+    fstream = BytesIO()
+    im.save(fstream, format="png", optimize=True)
+    labels_uv_str = base64.encodebytes(fstream.getvalue()).decode()
+    shape_chw = labels_uv_uint8_np_chw.shape
+    return DensePoseChartResultCompressed(labels_uv_str=labels_uv_str, shape_chw=shape_chw)
+
+
+def decompress_compressed_densepose_chart_result(
+    result: DensePoseChartResultCompressed,
+) -> DensePoseChartResultQuantized:
+    """
+    Decompresses DensePose chart-based result encoded into a base64 string
+
+    Args:
+        result (DensePoseChartResultCompressed): compressed DensePose chart result
+    Return:
+        Quantized DensePose chart-based result (DensePoseChartResultQuantized)
+    """
+    import base64
+    import numpy as np
+    from io import BytesIO
+    from PIL import Image
+
+    fstream = BytesIO(base64.decodebytes(result.labels_uv_str.encode()))
+    im = Image.open(fstream)
+    labels_uv_uint8_np_chw = np.moveaxis(np.array(im, dtype=np.uint8), -1, 0)
+    return DensePoseChartResultQuantized(
+        labels_uv_uint8=torch.from_numpy(labels_uv_uint8_np_chw.reshape(result.shape_chw))
+    )
diff --git a/densepose/structures/cse.py b/densepose/structures/cse.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cd65da96c04613053e21494bc2dcc04f37fe1fd
--- /dev/null
+++ b/densepose/structures/cse.py
@@ -0,0 +1,52 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from dataclasses import dataclass
+from typing import Union
+import torch
+
+
+@dataclass
+class DensePoseEmbeddingPredictorOutput:
+    """
+    Predictor output that contains embedding and coarse segmentation data:
+     * embedding: float tensor of size [N, D, H, W], contains estimated embeddings
+     * coarse_segm: float tensor of size [N, K, H, W]
+    Here D = MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+         K = MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+    """
+
+    embedding: torch.Tensor
+    coarse_segm: torch.Tensor
+
+    def __len__(self):
+        """
+        Number of instances (N) in the output
+        """
+        return self.coarse_segm.size(0)
+
+    def __getitem__(
+        self, item: Union[int, slice, torch.BoolTensor]
+    ) -> "DensePoseEmbeddingPredictorOutput":
+        """
+        Get outputs for the selected instance(s)
+
+        Args:
+            item (int or slice or tensor): selected items
+        """
+        if isinstance(item, int):
+            return DensePoseEmbeddingPredictorOutput(
+                coarse_segm=self.coarse_segm[item].unsqueeze(0),
+                embedding=self.embedding[item].unsqueeze(0),
+            )
+        else:
+            return DensePoseEmbeddingPredictorOutput(
+                coarse_segm=self.coarse_segm[item], embedding=self.embedding[item]
+            )
+
+    def to(self, device: torch.device):
+        """
+        Transfers all tensors to the given device
+        """
+        coarse_segm = self.coarse_segm.to(device)
+        embedding = self.embedding.to(device)
+        return DensePoseEmbeddingPredictorOutput(coarse_segm=coarse_segm, embedding=embedding)
diff --git a/densepose/structures/cse_confidence.py b/densepose/structures/cse_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee5166f82d45ecb4ea829ec2ecab248161c19421
--- /dev/null
+++ b/densepose/structures/cse_confidence.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from dataclasses import make_dataclass
+from functools import lru_cache
+from typing import Any, Optional
+import torch
+
+
+@lru_cache(maxsize=None)
+def decorate_cse_predictor_output_class_with_confidences(BasePredictorOutput: type) -> type:
+    """
+    Create a new output class from an existing one by adding new attributes
+    related to confidence estimation:
+    - coarse_segm_confidence (tensor)
+
+    Details on confidence estimation parameters can be found in:
+    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+        Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+    A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
+
+    The new class inherits the provided `BasePredictorOutput` class,
+    it's name is composed of the name of the provided class and
+    "WithConfidences" suffix.
+
+    Args:
+        BasePredictorOutput (type): output type to which confidence data
+            is to be added, assumed to be a dataclass
+    Return:
+        New dataclass derived from the provided one that has attributes
+        for confidence estimation
+    """
+
+    PredictorOutput = make_dataclass(
+        BasePredictorOutput.__name__ + "WithConfidences",
+        fields=[
+            ("coarse_segm_confidence", Optional[torch.Tensor], None),
+        ],
+        bases=(BasePredictorOutput,),
+    )
+
+    # add possibility to index PredictorOutput
+
+    def slice_if_not_none(data, item):
+        if data is None:
+            return None
+        if isinstance(item, int):
+            return data[item].unsqueeze(0)
+        return data[item]
+
+    def PredictorOutput_getitem(self, item):
+        PredictorOutput = type(self)
+        base_predictor_output_sliced = super(PredictorOutput, self).__getitem__(item)
+        return PredictorOutput(
+            **base_predictor_output_sliced.__dict__,
+            coarse_segm_confidence=slice_if_not_none(self.coarse_segm_confidence, item),
+        )
+
+    PredictorOutput.__getitem__ = PredictorOutput_getitem
+
+    def PredictorOutput_to(self, device: torch.device):
+        """
+        Transfers all tensors to the given device
+        """
+        PredictorOutput = type(self)
+        base_predictor_output_to = super(PredictorOutput, self).to(device)  # pyre-ignore[16]
+
+        def to_device_if_tensor(var: Any):
+            if isinstance(var, torch.Tensor):
+                return var.to(device)
+            return var
+
+        return PredictorOutput(
+            **base_predictor_output_to.__dict__,
+            coarse_segm_confidence=to_device_if_tensor(self.coarse_segm_confidence),
+        )
+
+    PredictorOutput.to = PredictorOutput_to
+    return PredictorOutput
diff --git a/densepose/structures/data_relative.py b/densepose/structures/data_relative.py
new file mode 100644
index 0000000000000000000000000000000000000000..187e140495f94a740fdd91d756f2195a0c8f4f30
--- /dev/null
+++ b/densepose/structures/data_relative.py
@@ -0,0 +1,243 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+import torch
+from torch.nn import functional as F
+
+from densepose.data.meshes.catalog import MeshCatalog
+from densepose.structures.mesh import load_mesh_symmetry
+from densepose.structures.transform_data import DensePoseTransformData
+
+
+class DensePoseDataRelative:
+    """
+    Dense pose relative annotations that can be applied to any bounding box:
+        x - normalized X coordinates [0, 255] of annotated points
+        y - normalized Y coordinates [0, 255] of annotated points
+        i - body part labels 0,...,24 for annotated points
+        u - body part U coordinates [0, 1] for annotated points
+        v - body part V coordinates [0, 1] for annotated points
+        segm - 256x256 segmentation mask with values 0,...,14
+    To obtain absolute x and y data wrt some bounding box one needs to first
+    divide the data by 256, multiply by the respective bounding box size
+    and add bounding box offset:
+        x_img = x0 + x_norm * w / 256.0
+        y_img = y0 + y_norm * h / 256.0
+    Segmentation masks are typically sampled to get image-based masks.
+    """
+
+    # Key for normalized X coordinates in annotation dict
+    X_KEY = "dp_x"
+    # Key for normalized Y coordinates in annotation dict
+    Y_KEY = "dp_y"
+    # Key for U part coordinates in annotation dict (used in chart-based annotations)
+    U_KEY = "dp_U"
+    # Key for V part coordinates in annotation dict (used in chart-based annotations)
+    V_KEY = "dp_V"
+    # Key for I point labels in annotation dict (used in chart-based annotations)
+    I_KEY = "dp_I"
+    # Key for segmentation mask in annotation dict
+    S_KEY = "dp_masks"
+    # Key for vertex ids (used in continuous surface embeddings annotations)
+    VERTEX_IDS_KEY = "dp_vertex"
+    # Key for mesh id (used in continuous surface embeddings annotations)
+    MESH_NAME_KEY = "ref_model"
+    # Number of body parts in segmentation masks
+    N_BODY_PARTS = 14
+    # Number of parts in point labels
+    N_PART_LABELS = 24
+    MASK_SIZE = 256
+
+    def __init__(self, annotation, cleanup=False):
+        self.x = torch.as_tensor(annotation[DensePoseDataRelative.X_KEY])
+        self.y = torch.as_tensor(annotation[DensePoseDataRelative.Y_KEY])
+        if (
+            DensePoseDataRelative.I_KEY in annotation
+            and DensePoseDataRelative.U_KEY in annotation
+            and DensePoseDataRelative.V_KEY in annotation
+        ):
+            self.i = torch.as_tensor(annotation[DensePoseDataRelative.I_KEY])
+            self.u = torch.as_tensor(annotation[DensePoseDataRelative.U_KEY])
+            self.v = torch.as_tensor(annotation[DensePoseDataRelative.V_KEY])
+        if (
+            DensePoseDataRelative.VERTEX_IDS_KEY in annotation
+            and DensePoseDataRelative.MESH_NAME_KEY in annotation
+        ):
+            self.vertex_ids = torch.as_tensor(
+                annotation[DensePoseDataRelative.VERTEX_IDS_KEY], dtype=torch.long
+            )
+            self.mesh_id = MeshCatalog.get_mesh_id(annotation[DensePoseDataRelative.MESH_NAME_KEY])
+        if DensePoseDataRelative.S_KEY in annotation:
+            self.segm = DensePoseDataRelative.extract_segmentation_mask(annotation)
+        self.device = torch.device("cpu")
+        if cleanup:
+            DensePoseDataRelative.cleanup_annotation(annotation)
+
+    def to(self, device):
+        if self.device == device:
+            return self
+        new_data = DensePoseDataRelative.__new__(DensePoseDataRelative)
+        new_data.x = self.x.to(device)
+        new_data.y = self.y.to(device)
+        for attr in ["i", "u", "v", "vertex_ids", "segm"]:
+            if hasattr(self, attr):
+                setattr(new_data, attr, getattr(self, attr).to(device))
+        if hasattr(self, "mesh_id"):
+            new_data.mesh_id = self.mesh_id
+        new_data.device = device
+        return new_data
+
+    @staticmethod
+    def extract_segmentation_mask(annotation):
+        import pycocotools.mask as mask_utils
+
+        # TODO: annotation instance is accepted if it contains either
+        # DensePose segmentation or instance segmentation. However, here we
+        # only rely on DensePose segmentation
+        poly_specs = annotation[DensePoseDataRelative.S_KEY]
+        if isinstance(poly_specs, torch.Tensor):
+            # data is already given as mask tensors, no need to decode
+            return poly_specs
+        segm = torch.zeros((DensePoseDataRelative.MASK_SIZE,) * 2, dtype=torch.float32)
+        if isinstance(poly_specs, dict):
+            if poly_specs:
+                mask = mask_utils.decode(poly_specs)
+                segm[mask > 0] = 1
+        else:
+            for i in range(len(poly_specs)):
+                poly_i = poly_specs[i]
+                if poly_i:
+                    mask_i = mask_utils.decode(poly_i)
+                    segm[mask_i > 0] = i + 1
+        return segm
+
+    @staticmethod
+    def validate_annotation(annotation):
+        for key in [
+            DensePoseDataRelative.X_KEY,
+            DensePoseDataRelative.Y_KEY,
+        ]:
+            if key not in annotation:
+                return False, "no {key} data in the annotation".format(key=key)
+        valid_for_iuv_setting = all(
+            key in annotation
+            for key in [
+                DensePoseDataRelative.I_KEY,
+                DensePoseDataRelative.U_KEY,
+                DensePoseDataRelative.V_KEY,
+            ]
+        )
+        valid_for_cse_setting = all(
+            key in annotation
+            for key in [
+                DensePoseDataRelative.VERTEX_IDS_KEY,
+                DensePoseDataRelative.MESH_NAME_KEY,
+            ]
+        )
+        if not valid_for_iuv_setting and not valid_for_cse_setting:
+            return (
+                False,
+                "expected either {} (IUV setting) or {} (CSE setting) annotations".format(
+                    ", ".join(
+                        [
+                            DensePoseDataRelative.I_KEY,
+                            DensePoseDataRelative.U_KEY,
+                            DensePoseDataRelative.V_KEY,
+                        ]
+                    ),
+                    ", ".join(
+                        [
+                            DensePoseDataRelative.VERTEX_IDS_KEY,
+                            DensePoseDataRelative.MESH_NAME_KEY,
+                        ]
+                    ),
+                ),
+            )
+        return True, None
+
+    @staticmethod
+    def cleanup_annotation(annotation):
+        for key in [
+            DensePoseDataRelative.X_KEY,
+            DensePoseDataRelative.Y_KEY,
+            DensePoseDataRelative.I_KEY,
+            DensePoseDataRelative.U_KEY,
+            DensePoseDataRelative.V_KEY,
+            DensePoseDataRelative.S_KEY,
+            DensePoseDataRelative.VERTEX_IDS_KEY,
+            DensePoseDataRelative.MESH_NAME_KEY,
+        ]:
+            if key in annotation:
+                del annotation[key]
+
+    def apply_transform(self, transforms, densepose_transform_data):
+        self._transform_pts(transforms, densepose_transform_data)
+        if hasattr(self, "segm"):
+            self._transform_segm(transforms, densepose_transform_data)
+
+    def _transform_pts(self, transforms, dp_transform_data):
+        import detectron2.data.transforms as T
+
+        # NOTE: This assumes that HorizFlipTransform is the only one that does flip
+        do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+        if do_hflip:
+            self.x = self.MASK_SIZE - self.x
+            if hasattr(self, "i"):
+                self._flip_iuv_semantics(dp_transform_data)
+            if hasattr(self, "vertex_ids"):
+                self._flip_vertices()
+
+        for t in transforms.transforms:
+            if isinstance(t, T.RotationTransform):
+                xy_scale = np.array((t.w, t.h)) / DensePoseDataRelative.MASK_SIZE
+                xy = t.apply_coords(np.stack((self.x, self.y), axis=1) * xy_scale)
+                self.x, self.y = torch.tensor(xy / xy_scale, dtype=self.x.dtype).T
+
+    def _flip_iuv_semantics(self, dp_transform_data: DensePoseTransformData) -> None:
+        i_old = self.i.clone()
+        uv_symmetries = dp_transform_data.uv_symmetries
+        pt_label_symmetries = dp_transform_data.point_label_symmetries
+        for i in range(self.N_PART_LABELS):
+            if i + 1 in i_old:
+                annot_indices_i = i_old == i + 1
+                if pt_label_symmetries[i + 1] != i + 1:
+                    self.i[annot_indices_i] = pt_label_symmetries[i + 1]
+                u_loc = (self.u[annot_indices_i] * 255).long()
+                v_loc = (self.v[annot_indices_i] * 255).long()
+                self.u[annot_indices_i] = uv_symmetries["U_transforms"][i][v_loc, u_loc].to(
+                    device=self.u.device
+                )
+                self.v[annot_indices_i] = uv_symmetries["V_transforms"][i][v_loc, u_loc].to(
+                    device=self.v.device
+                )
+
+    def _flip_vertices(self):
+        mesh_info = MeshCatalog[MeshCatalog.get_mesh_name(self.mesh_id)]
+        mesh_symmetry = (
+            load_mesh_symmetry(mesh_info.symmetry) if mesh_info.symmetry is not None else None
+        )
+        self.vertex_ids = mesh_symmetry["vertex_transforms"][self.vertex_ids]
+
+    def _transform_segm(self, transforms, dp_transform_data):
+        import detectron2.data.transforms as T
+
+        # NOTE: This assumes that HorizFlipTransform is the only one that does flip
+        do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+        if do_hflip:
+            self.segm = torch.flip(self.segm, [1])
+            self._flip_segm_semantics(dp_transform_data)
+
+        for t in transforms.transforms:
+            if isinstance(t, T.RotationTransform):
+                self._transform_segm_rotation(t)
+
+    def _flip_segm_semantics(self, dp_transform_data):
+        old_segm = self.segm.clone()
+        mask_label_symmetries = dp_transform_data.mask_label_symmetries
+        for i in range(self.N_BODY_PARTS):
+            if mask_label_symmetries[i + 1] != i + 1:
+                self.segm[old_segm == i + 1] = mask_label_symmetries[i + 1]
+
+    def _transform_segm_rotation(self, rotation):
+        self.segm = F.interpolate(self.segm[None, None, :], (rotation.h, rotation.w)).numpy()
+        self.segm = torch.tensor(rotation.apply_segmentation(self.segm[0, 0]))[None, None, :]
+        self.segm = F.interpolate(self.segm, [DensePoseDataRelative.MASK_SIZE] * 2)[0, 0]
diff --git a/densepose/structures/list.py b/densepose/structures/list.py
new file mode 100644
index 0000000000000000000000000000000000000000..7631f8f78f4e9b1a94653d4e47639c50affe58eb
--- /dev/null
+++ b/densepose/structures/list.py
@@ -0,0 +1,70 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import torch
+
+from densepose.structures.data_relative import DensePoseDataRelative
+
+
+class DensePoseList:
+
+    _TORCH_DEVICE_CPU = torch.device("cpu")
+
+    def __init__(self, densepose_datas, boxes_xyxy_abs, image_size_hw, device=_TORCH_DEVICE_CPU):
+        assert len(densepose_datas) == len(
+            boxes_xyxy_abs
+        ), "Attempt to initialize DensePoseList with {} DensePose datas " "and {} boxes".format(
+            len(densepose_datas), len(boxes_xyxy_abs)
+        )
+        self.densepose_datas = []
+        for densepose_data in densepose_datas:
+            assert isinstance(densepose_data, DensePoseDataRelative) or densepose_data is None, (
+                "Attempt to initialize DensePoseList with DensePose datas "
+                "of type {}, expected DensePoseDataRelative".format(type(densepose_data))
+            )
+            densepose_data_ondevice = (
+                densepose_data.to(device) if densepose_data is not None else None
+            )
+            self.densepose_datas.append(densepose_data_ondevice)
+        self.boxes_xyxy_abs = boxes_xyxy_abs.to(device)
+        self.image_size_hw = image_size_hw
+        self.device = device
+
+    def to(self, device):
+        if self.device == device:
+            return self
+        return DensePoseList(self.densepose_datas, self.boxes_xyxy_abs, self.image_size_hw, device)
+
+    def __iter__(self):
+        return iter(self.densepose_datas)
+
+    def __len__(self):
+        return len(self.densepose_datas)
+
+    def __repr__(self):
+        s = self.__class__.__name__ + "("
+        s += "num_instances={}, ".format(len(self.densepose_datas))
+        s += "image_width={}, ".format(self.image_size_hw[1])
+        s += "image_height={})".format(self.image_size_hw[0])
+        return s
+
+    def __getitem__(self, item):
+        if isinstance(item, int):
+            densepose_data_rel = self.densepose_datas[item]
+            return densepose_data_rel
+        elif isinstance(item, slice):
+            densepose_datas_rel = self.densepose_datas[item]
+            boxes_xyxy_abs = self.boxes_xyxy_abs[item]
+            return DensePoseList(
+                densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
+            )
+        elif isinstance(item, torch.Tensor) and (item.dtype == torch.bool):
+            densepose_datas_rel = [self.densepose_datas[i] for i, x in enumerate(item) if x > 0]
+            boxes_xyxy_abs = self.boxes_xyxy_abs[item]
+            return DensePoseList(
+                densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
+            )
+        else:
+            densepose_datas_rel = [self.densepose_datas[i] for i in item]
+            boxes_xyxy_abs = self.boxes_xyxy_abs[item]
+            return DensePoseList(
+                densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
+            )
diff --git a/densepose/structures/mesh.py b/densepose/structures/mesh.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5abd3419b35234e6b44c0577bef2818f99a5cdc
--- /dev/null
+++ b/densepose/structures/mesh.py
@@ -0,0 +1,170 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import pickle
+from functools import lru_cache
+from typing import Dict, Optional, Tuple
+import torch
+
+from detectron2.utils.file_io import PathManager
+
+from densepose.data.meshes.catalog import MeshCatalog, MeshInfo
+
+
+def _maybe_copy_to_device(
+    attribute: Optional[torch.Tensor], device: torch.device
+) -> Optional[torch.Tensor]:
+    if attribute is None:
+        return None
+    return attribute.to(device)
+
+
+class Mesh:
+    def __init__(
+        self,
+        vertices: Optional[torch.Tensor] = None,
+        faces: Optional[torch.Tensor] = None,
+        geodists: Optional[torch.Tensor] = None,
+        symmetry: Optional[Dict[str, torch.Tensor]] = None,
+        texcoords: Optional[torch.Tensor] = None,
+        mesh_info: Optional[MeshInfo] = None,
+        device: Optional[torch.device] = None,
+    ):
+        """
+        Args:
+            vertices (tensor [N, 3] of float32): vertex coordinates in 3D
+            faces (tensor [M, 3] of long): triangular face represented as 3
+                vertex indices
+            geodists (tensor [N, N] of float32): geodesic distances from
+                vertex `i` to vertex `j` (optional, default: None)
+            symmetry (dict: str -> tensor): various mesh symmetry data:
+                - "vertex_transforms": vertex mapping under horizontal flip,
+                  tensor of size [N] of type long; vertex `i` is mapped to
+                  vertex `tensor[i]` (optional, default: None)
+            texcoords (tensor [N, 2] of float32): texture coordinates, i.e. global
+                and normalized mesh UVs (optional, default: None)
+            mesh_info (MeshInfo type): necessary to load the attributes on-the-go,
+                can be used instead of passing all the variables one by one
+            device (torch.device): device of the Mesh. If not provided, will use
+                the device of the vertices
+        """
+        self._vertices = vertices
+        self._faces = faces
+        self._geodists = geodists
+        self._symmetry = symmetry
+        self._texcoords = texcoords
+        self.mesh_info = mesh_info
+        self.device = device
+
+        assert self._vertices is not None or self.mesh_info is not None
+
+        all_fields = [self._vertices, self._faces, self._geodists, self._texcoords]
+
+        if self.device is None:
+            for field in all_fields:
+                if field is not None:
+                    self.device = field.device
+                    break
+            if self.device is None and symmetry is not None:
+                for key in symmetry:
+                    self.device = symmetry[key].device
+                    break
+            self.device = torch.device("cpu") if self.device is None else self.device
+
+        assert all([var.device == self.device for var in all_fields if var is not None])
+        if symmetry:
+            assert all(symmetry[key].device == self.device for key in symmetry)
+        if texcoords and vertices:
+            assert len(vertices) == len(texcoords)
+
+    def to(self, device: torch.device):
+        device_symmetry = self._symmetry
+        if device_symmetry:
+            device_symmetry = {key: value.to(device) for key, value in device_symmetry.items()}
+        return Mesh(
+            _maybe_copy_to_device(self._vertices, device),
+            _maybe_copy_to_device(self._faces, device),
+            _maybe_copy_to_device(self._geodists, device),
+            device_symmetry,
+            _maybe_copy_to_device(self._texcoords, device),
+            self.mesh_info,
+            device,
+        )
+
+    @property
+    def vertices(self):
+        if self._vertices is None and self.mesh_info is not None:
+            self._vertices = load_mesh_data(self.mesh_info.data, "vertices", self.device)
+        return self._vertices
+
+    @property
+    def faces(self):
+        if self._faces is None and self.mesh_info is not None:
+            self._faces = load_mesh_data(self.mesh_info.data, "faces", self.device)
+        return self._faces
+
+    @property
+    def geodists(self):
+        if self._geodists is None and self.mesh_info is not None:
+            self._geodists = load_mesh_auxiliary_data(self.mesh_info.geodists, self.device)
+        return self._geodists
+
+    @property
+    def symmetry(self):
+        if self._symmetry is None and self.mesh_info is not None:
+            self._symmetry = load_mesh_symmetry(self.mesh_info.symmetry, self.device)
+        return self._symmetry
+
+    @property
+    def texcoords(self):
+        if self._texcoords is None and self.mesh_info is not None:
+            self._texcoords = load_mesh_auxiliary_data(self.mesh_info.texcoords, self.device)
+        return self._texcoords
+
+    def get_geodists(self):
+        if self.geodists is None:
+            self.geodists = self._compute_geodists()
+        return self.geodists
+
+    def _compute_geodists(self):
+        # TODO: compute using Laplace-Beltrami
+        geodists = None
+        return geodists
+
+
+def load_mesh_data(
+    mesh_fpath: str, field: str, device: Optional[torch.device] = None
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+    with PathManager.open(mesh_fpath, "rb") as hFile:
+        # pyre-fixme[7]: Expected `Tuple[Optional[Tensor], Optional[Tensor]]` but
+        #  got `Tensor`.
+        return torch.as_tensor(pickle.load(hFile)[field], dtype=torch.float).to(device)
+    return None
+
+
+def load_mesh_auxiliary_data(
+    fpath: str, device: Optional[torch.device] = None
+) -> Optional[torch.Tensor]:
+    fpath_local = PathManager.get_local_path(fpath)
+    with PathManager.open(fpath_local, "rb") as hFile:
+        return torch.as_tensor(pickle.load(hFile), dtype=torch.float).to(device)
+    return None
+
+
+@lru_cache()
+def load_mesh_symmetry(
+    symmetry_fpath: str, device: Optional[torch.device] = None
+) -> Optional[Dict[str, torch.Tensor]]:
+    with PathManager.open(symmetry_fpath, "rb") as hFile:
+        symmetry_loaded = pickle.load(hFile)
+        symmetry = {
+            "vertex_transforms": torch.as_tensor(
+                symmetry_loaded["vertex_transforms"], dtype=torch.long
+            ).to(device),
+        }
+        return symmetry
+    return None
+
+
+@lru_cache()
+def create_mesh(mesh_name: str, device: Optional[torch.device] = None) -> Mesh:
+    return Mesh(mesh_info=MeshCatalog[mesh_name], device=device)
diff --git a/densepose/structures/transform_data.py b/densepose/structures/transform_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..a345d66945fbd709ea6644caa7a71435aa0ed569
--- /dev/null
+++ b/densepose/structures/transform_data.py
@@ -0,0 +1,71 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import BinaryIO, Dict, Union
+import torch
+
+
+def normalized_coords_transform(x0, y0, w, h):
+    """
+    Coordinates transform that maps top left corner to (-1, -1) and bottom
+    right corner to (1, 1). Used for torch.grid_sample to initialize the
+    grid
+    """
+
+    def f(p):
+        return (2 * (p[0] - x0) / w - 1, 2 * (p[1] - y0) / h - 1)
+
+    return f
+
+
+class DensePoseTransformData:
+
+    # Horizontal symmetry label transforms used for horizontal flip
+    MASK_LABEL_SYMMETRIES = [0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14]
+    # fmt: off
+    POINT_LABEL_SYMMETRIES = [ 0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23]  # noqa
+    # fmt: on
+
+    def __init__(self, uv_symmetries: Dict[str, torch.Tensor], device: torch.device):
+        self.mask_label_symmetries = DensePoseTransformData.MASK_LABEL_SYMMETRIES
+        self.point_label_symmetries = DensePoseTransformData.POINT_LABEL_SYMMETRIES
+        self.uv_symmetries = uv_symmetries
+        self.device = torch.device("cpu")
+
+    def to(self, device: torch.device, copy: bool = False) -> "DensePoseTransformData":
+        """
+        Convert transform data to the specified device
+
+        Args:
+            device (torch.device): device to convert the data to
+            copy (bool): flag that specifies whether to copy or to reference the data
+                in case the device is the same
+        Return:
+            An instance of `DensePoseTransformData` with data stored on the specified device
+        """
+        if self.device == device and not copy:
+            return self
+        uv_symmetry_map = {}
+        for key in self.uv_symmetries:
+            uv_symmetry_map[key] = self.uv_symmetries[key].to(device=device, copy=copy)
+        return DensePoseTransformData(uv_symmetry_map, device)
+
+    @staticmethod
+    def load(io: Union[str, BinaryIO]):
+        """
+        Args:
+            io: (str or binary file-like object): input file to load data from
+        Returns:
+            An instance of `DensePoseTransformData` with transforms loaded from the file
+        """
+        import scipy.io
+
+        uv_symmetry_map = scipy.io.loadmat(io)
+        uv_symmetry_map_torch = {}
+        for key in ["U_transforms", "V_transforms"]:
+            uv_symmetry_map_torch[key] = []
+            map_src = uv_symmetry_map[key]
+            map_dst = uv_symmetry_map_torch[key]
+            for i in range(map_src.shape[1]):
+                map_dst.append(torch.from_numpy(map_src[0, i]).to(dtype=torch.float))
+            uv_symmetry_map_torch[key] = torch.stack(map_dst, dim=0)
+        transform_data = DensePoseTransformData(uv_symmetry_map_torch, device=torch.device("cpu"))
+        return transform_data
diff --git a/densepose/utils/__init__.py b/densepose/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/densepose/utils/__pycache__/__init__.cpython-310.pyc b/densepose/utils/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a1d1b33bcc946d554f371a0c9534f210fd189b3
Binary files /dev/null and b/densepose/utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/utils/__pycache__/__init__.cpython-39.pyc b/densepose/utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22106749dcd82f326d228a9b3ae9cc7761a66200
Binary files /dev/null and b/densepose/utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/utils/__pycache__/logger.cpython-310.pyc b/densepose/utils/__pycache__/logger.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..374cee5eb11e2f2ad3a4fcab825fff355279f7e0
Binary files /dev/null and b/densepose/utils/__pycache__/logger.cpython-310.pyc differ
diff --git a/densepose/utils/__pycache__/logger.cpython-39.pyc b/densepose/utils/__pycache__/logger.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65cb7a180e42d6be4da39daf7b4ab254e6c6969b
Binary files /dev/null and b/densepose/utils/__pycache__/logger.cpython-39.pyc differ
diff --git a/densepose/utils/__pycache__/transform.cpython-310.pyc b/densepose/utils/__pycache__/transform.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea791fdec0bb6b9feabc276dfcfccfc5f125f3d5
Binary files /dev/null and b/densepose/utils/__pycache__/transform.cpython-310.pyc differ
diff --git a/densepose/utils/__pycache__/transform.cpython-39.pyc b/densepose/utils/__pycache__/transform.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9fb0e5512e324a5078423ad8ce0ef878d260391
Binary files /dev/null and b/densepose/utils/__pycache__/transform.cpython-39.pyc differ
diff --git a/densepose/utils/dbhelper.py b/densepose/utils/dbhelper.py
new file mode 100644
index 0000000000000000000000000000000000000000..772e31874b2f65da9ae8b4e03c7515d5af282586
--- /dev/null
+++ b/densepose/utils/dbhelper.py
@@ -0,0 +1,147 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Tuple
+
+
+class EntrySelector:
+    """
+    Base class for entry selectors
+    """
+
+    @staticmethod
+    def from_string(spec: str) -> "EntrySelector":
+        if spec == "*":
+            return AllEntrySelector()
+        return FieldEntrySelector(spec)
+
+
+class AllEntrySelector(EntrySelector):
+    """
+    Selector that accepts all entries
+    """
+
+    SPECIFIER = "*"
+
+    def __call__(self, entry):
+        return True
+
+
+class FieldEntrySelector(EntrySelector):
+    """
+    Selector that accepts only entries that match provided field
+    specifier(s). Only a limited set of specifiers is supported for now:
+      <specifiers>::=<specifier>[<comma><specifiers>]
+      <specifier>::=<field_name>[<type_delim><type>]<equal><value_or_range>
+      <field_name> is a valid identifier
+      <type> ::= "int" | "str"
+      <equal> ::= "="
+      <comma> ::= ","
+      <type_delim> ::= ":"
+      <value_or_range> ::= <value> | <range>
+      <range> ::= <value><range_delim><value>
+      <range_delim> ::= "-"
+      <value> is a string without spaces and special symbols
+        (e.g. <comma>, <equal>, <type_delim>, <range_delim>)
+    """
+
+    _SPEC_DELIM = ","
+    _TYPE_DELIM = ":"
+    _RANGE_DELIM = "-"
+    _EQUAL = "="
+    _ERROR_PREFIX = "Invalid field selector specifier"
+
+    class _FieldEntryValuePredicate:
+        """
+        Predicate that checks strict equality for the specified entry field
+        """
+
+        def __init__(self, name: str, typespec: Optional[str], value: str):
+            import builtins
+
+            self.name = name
+            self.type = getattr(builtins, typespec) if typespec is not None else str
+            self.value = value
+
+        def __call__(self, entry):
+            return entry[self.name] == self.type(self.value)
+
+    class _FieldEntryRangePredicate:
+        """
+        Predicate that checks whether an entry field falls into the specified range
+        """
+
+        def __init__(self, name: str, typespec: Optional[str], vmin: str, vmax: str):
+            import builtins
+
+            self.name = name
+            self.type = getattr(builtins, typespec) if typespec is not None else str
+            self.vmin = vmin
+            self.vmax = vmax
+
+        def __call__(self, entry):
+            return (entry[self.name] >= self.type(self.vmin)) and (
+                entry[self.name] <= self.type(self.vmax)
+            )
+
+    def __init__(self, spec: str):
+        self._predicates = self._parse_specifier_into_predicates(spec)
+
+    def __call__(self, entry: Dict[str, Any]):
+        for predicate in self._predicates:
+            if not predicate(entry):
+                return False
+        return True
+
+    def _parse_specifier_into_predicates(self, spec: str):
+        predicates = []
+        specs = spec.split(self._SPEC_DELIM)
+        for subspec in specs:
+            eq_idx = subspec.find(self._EQUAL)
+            if eq_idx > 0:
+                field_name_with_type = subspec[:eq_idx]
+                field_name, field_type = self._parse_field_name_type(field_name_with_type)
+                field_value_or_range = subspec[eq_idx + 1 :]
+                if self._is_range_spec(field_value_or_range):
+                    vmin, vmax = self._get_range_spec(field_value_or_range)
+                    predicate = FieldEntrySelector._FieldEntryRangePredicate(
+                        field_name, field_type, vmin, vmax
+                    )
+                else:
+                    predicate = FieldEntrySelector._FieldEntryValuePredicate(
+                        field_name, field_type, field_value_or_range
+                    )
+                predicates.append(predicate)
+            elif eq_idx == 0:
+                self._parse_error(f'"{subspec}", field name is empty!')
+            else:
+                self._parse_error(f'"{subspec}", should have format ' "<field>=<value_or_range>!")
+        return predicates
+
+    def _parse_field_name_type(self, field_name_with_type: str) -> Tuple[str, Optional[str]]:
+        type_delim_idx = field_name_with_type.find(self._TYPE_DELIM)
+        if type_delim_idx > 0:
+            field_name = field_name_with_type[:type_delim_idx]
+            field_type = field_name_with_type[type_delim_idx + 1 :]
+        elif type_delim_idx == 0:
+            self._parse_error(f'"{field_name_with_type}", field name is empty!')
+        else:
+            field_name = field_name_with_type
+            field_type = None
+        # pyre-fixme[61]: `field_name` may not be initialized here.
+        # pyre-fixme[61]: `field_type` may not be initialized here.
+        return field_name, field_type
+
+    def _is_range_spec(self, field_value_or_range):
+        delim_idx = field_value_or_range.find(self._RANGE_DELIM)
+        return delim_idx > 0
+
+    def _get_range_spec(self, field_value_or_range):
+        if self._is_range_spec(field_value_or_range):
+            delim_idx = field_value_or_range.find(self._RANGE_DELIM)
+            vmin = field_value_or_range[:delim_idx]
+            vmax = field_value_or_range[delim_idx + 1 :]
+            return vmin, vmax
+        else:
+            self._parse_error('"field_value_or_range", range of values expected!')
+
+    def _parse_error(self, msg):
+        raise ValueError(f"{self._ERROR_PREFIX}: {msg}")
diff --git a/densepose/utils/logger.py b/densepose/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..70cd3cb0eb0fc7495b1a4b50a05725a0e5b1baba
--- /dev/null
+++ b/densepose/utils/logger.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+
+
+def verbosity_to_level(verbosity) -> int:
+    if verbosity is not None:
+        if verbosity == 0:
+            return logging.WARNING
+        elif verbosity == 1:
+            return logging.INFO
+        elif verbosity >= 2:
+            return logging.DEBUG
+    return logging.WARNING
diff --git a/densepose/utils/transform.py b/densepose/utils/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dc4ae7be878302ec39b7f235e3ae1b7a3ca29ee
--- /dev/null
+++ b/densepose/utils/transform.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from detectron2.data import MetadataCatalog
+from detectron2.utils.file_io import PathManager
+
+from densepose import DensePoseTransformData
+
+
+def load_for_dataset(dataset_name):
+    path = MetadataCatalog.get(dataset_name).densepose_transform_src
+    densepose_transform_data_fpath = PathManager.get_local_path(path)
+    return DensePoseTransformData.load(densepose_transform_data_fpath)
+
+
+def load_from_cfg(cfg):
+    return load_for_dataset(cfg.DATASETS.TEST[0])
diff --git a/densepose/vis/__init__.py b/densepose/vis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/densepose/vis/__pycache__/__init__.cpython-310.pyc b/densepose/vis/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88715911e731d172974dee8667016788c02a34d6
Binary files /dev/null and b/densepose/vis/__pycache__/__init__.cpython-310.pyc differ
diff --git a/densepose/vis/__pycache__/__init__.cpython-39.pyc b/densepose/vis/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..572c11366a1179549163e9194db683d859fecf61
Binary files /dev/null and b/densepose/vis/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/base.cpython-310.pyc b/densepose/vis/__pycache__/base.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6846932d9032fbe22c6b0b2317e4f2a6bf68aff
Binary files /dev/null and b/densepose/vis/__pycache__/base.cpython-310.pyc differ
diff --git a/densepose/vis/__pycache__/base.cpython-39.pyc b/densepose/vis/__pycache__/base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31c4a94cbb5a6113751d10b0363329dbccad6cc2
Binary files /dev/null and b/densepose/vis/__pycache__/base.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/bounding_box.cpython-310.pyc b/densepose/vis/__pycache__/bounding_box.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a729cc6a0ee19912b3f5d7bb326c12bf985aeed
Binary files /dev/null and b/densepose/vis/__pycache__/bounding_box.cpython-310.pyc differ
diff --git a/densepose/vis/__pycache__/bounding_box.cpython-39.pyc b/densepose/vis/__pycache__/bounding_box.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..858d011135b39220a3c0aa8f538edaba2335d705
Binary files /dev/null and b/densepose/vis/__pycache__/bounding_box.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/densepose_outputs_vertex.cpython-310.pyc b/densepose/vis/__pycache__/densepose_outputs_vertex.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0eedeadfb8fc45410b30ebb59f61d14f60b521a6
Binary files /dev/null and b/densepose/vis/__pycache__/densepose_outputs_vertex.cpython-310.pyc differ
diff --git a/densepose/vis/__pycache__/densepose_outputs_vertex.cpython-39.pyc b/densepose/vis/__pycache__/densepose_outputs_vertex.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d8b29c3019c841845a963bf79ca42e154e26ef7
Binary files /dev/null and b/densepose/vis/__pycache__/densepose_outputs_vertex.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/densepose_results.cpython-310.pyc b/densepose/vis/__pycache__/densepose_results.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0792387612fae1434dad65f5047b3090c7dc77ff
Binary files /dev/null and b/densepose/vis/__pycache__/densepose_results.cpython-310.pyc differ
diff --git a/densepose/vis/__pycache__/densepose_results.cpython-39.pyc b/densepose/vis/__pycache__/densepose_results.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..392193eedb9560facc9ed33e7b5f4e76c5bd22a8
Binary files /dev/null and b/densepose/vis/__pycache__/densepose_results.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/densepose_results_textures.cpython-310.pyc b/densepose/vis/__pycache__/densepose_results_textures.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12570813d345f55eaa04d106b6df0ef7ba93f0da
Binary files /dev/null and b/densepose/vis/__pycache__/densepose_results_textures.cpython-310.pyc differ
diff --git a/densepose/vis/__pycache__/densepose_results_textures.cpython-39.pyc b/densepose/vis/__pycache__/densepose_results_textures.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c504c01b4bc3b87030559b37755d90eb9195cd46
Binary files /dev/null and b/densepose/vis/__pycache__/densepose_results_textures.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/extractor.cpython-310.pyc b/densepose/vis/__pycache__/extractor.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3da3c25550e36a6097b5901bfaf29e2f75b570b7
Binary files /dev/null and b/densepose/vis/__pycache__/extractor.cpython-310.pyc differ
diff --git a/densepose/vis/__pycache__/extractor.cpython-39.pyc b/densepose/vis/__pycache__/extractor.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e15202201e2cb5e45d183922bb0ea31f82202cb
Binary files /dev/null and b/densepose/vis/__pycache__/extractor.cpython-39.pyc differ
diff --git a/densepose/vis/base.py b/densepose/vis/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5c7e4ad19b6df600d8dcecacde6a3488c83a3e1
--- /dev/null
+++ b/densepose/vis/base.py
@@ -0,0 +1,192 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+import cv2
+import torch
+
+Image = np.ndarray
+Boxes = torch.Tensor
+
+
+class MatrixVisualizer:
+    """
+    Base visualizer for matrix data
+    """
+
+    def __init__(
+        self,
+        inplace=True,
+        cmap=cv2.COLORMAP_PARULA,
+        val_scale=1.0,
+        alpha=0.7,
+        interp_method_matrix=cv2.INTER_LINEAR,
+        interp_method_mask=cv2.INTER_NEAREST,
+    ):
+        self.inplace = inplace
+        self.cmap = cmap
+        self.val_scale = val_scale
+        self.alpha = alpha
+        self.interp_method_matrix = interp_method_matrix
+        self.interp_method_mask = interp_method_mask
+
+    def visualize(self, image_bgr, mask, matrix, bbox_xywh):
+        self._check_image(image_bgr)
+        self._check_mask_matrix(mask, matrix)
+        if self.inplace:
+            image_target_bgr = image_bgr
+        else:
+            image_target_bgr = image_bgr 
+            image_target_bgr *= 0
+        x, y, w, h = [int(v) for v in bbox_xywh]
+        if w <= 0 or h <= 0:
+            return image_bgr
+        mask, matrix = self._resize(mask, matrix, w, h)
+        mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3])
+        matrix_scaled = matrix.astype(np.float32) * self.val_scale
+        _EPSILON = 1e-6
+        if np.any(matrix_scaled > 255 + _EPSILON):
+            logger = logging.getLogger(__name__)
+            logger.warning(
+                f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]"
+            )
+        matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8)
+        matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap)
+        matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg]
+        image_target_bgr[y : y + h, x : x + w, :] = (
+            image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha
+        )
+        return image_target_bgr.astype(np.uint8)
+
+    def _resize(self, mask, matrix, w, h):
+        if (w != mask.shape[1]) or (h != mask.shape[0]):
+            mask = cv2.resize(mask, (w, h), self.interp_method_mask)
+        if (w != matrix.shape[1]) or (h != matrix.shape[0]):
+            matrix = cv2.resize(matrix, (w, h), self.interp_method_matrix)
+        return mask, matrix
+
+    def _check_image(self, image_rgb):
+        assert len(image_rgb.shape) == 3
+        assert image_rgb.shape[2] == 3
+        assert image_rgb.dtype == np.uint8
+
+    def _check_mask_matrix(self, mask, matrix):
+        assert len(matrix.shape) == 2
+        assert len(mask.shape) == 2
+        assert mask.dtype == np.uint8
+
+
+class RectangleVisualizer:
+
+    _COLOR_GREEN = (18, 127, 15)
+
+    def __init__(self, color=_COLOR_GREEN, thickness=1):
+        self.color = color
+        self.thickness = thickness
+
+    def visualize(self, image_bgr, bbox_xywh, color=None, thickness=None):
+        x, y, w, h = bbox_xywh
+        color = color or self.color
+        thickness = thickness or self.thickness
+        cv2.rectangle(image_bgr, (int(x), int(y)), (int(x + w), int(y + h)), color, thickness)
+        return image_bgr
+
+
+class PointsVisualizer:
+
+    _COLOR_GREEN = (18, 127, 15)
+
+    def __init__(self, color_bgr=_COLOR_GREEN, r=5):
+        self.color_bgr = color_bgr
+        self.r = r
+
+    def visualize(self, image_bgr, pts_xy, colors_bgr=None, rs=None):
+        for j, pt_xy in enumerate(pts_xy):
+            x, y = pt_xy
+            color_bgr = colors_bgr[j] if colors_bgr is not None else self.color_bgr
+            r = rs[j] if rs is not None else self.r
+            cv2.circle(image_bgr, (x, y), r, color_bgr, -1)
+        return image_bgr
+
+
+class TextVisualizer:
+
+    _COLOR_GRAY = (218, 227, 218)
+    _COLOR_WHITE = (255, 255, 255)
+
+    def __init__(
+        self,
+        font_face=cv2.FONT_HERSHEY_SIMPLEX,
+        font_color_bgr=_COLOR_GRAY,
+        font_scale=0.35,
+        font_line_type=cv2.LINE_AA,
+        font_line_thickness=1,
+        fill_color_bgr=_COLOR_WHITE,
+        fill_color_transparency=1.0,
+        frame_color_bgr=_COLOR_WHITE,
+        frame_color_transparency=1.0,
+        frame_thickness=1,
+    ):
+        self.font_face = font_face
+        self.font_color_bgr = font_color_bgr
+        self.font_scale = font_scale
+        self.font_line_type = font_line_type
+        self.font_line_thickness = font_line_thickness
+        self.fill_color_bgr = fill_color_bgr
+        self.fill_color_transparency = fill_color_transparency
+        self.frame_color_bgr = frame_color_bgr
+        self.frame_color_transparency = frame_color_transparency
+        self.frame_thickness = frame_thickness
+
+    def visualize(self, image_bgr, txt, topleft_xy):
+        txt_w, txt_h = self.get_text_size_wh(txt)
+        topleft_xy = tuple(map(int, topleft_xy))
+        x, y = topleft_xy
+        if self.frame_color_transparency < 1.0:
+            t = self.frame_thickness
+            image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :] = (
+                image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :]
+                * self.frame_color_transparency
+                + np.array(self.frame_color_bgr) * (1.0 - self.frame_color_transparency)
+            ).astype(float)
+        if self.fill_color_transparency < 1.0:
+            image_bgr[y : y + txt_h, x : x + txt_w, :] = (
+                image_bgr[y : y + txt_h, x : x + txt_w, :] * self.fill_color_transparency
+                + np.array(self.fill_color_bgr) * (1.0 - self.fill_color_transparency)
+            ).astype(float)
+        cv2.putText(
+            image_bgr,
+            txt,
+            topleft_xy,
+            self.font_face,
+            self.font_scale,
+            self.font_color_bgr,
+            self.font_line_thickness,
+            self.font_line_type,
+        )
+        return image_bgr
+
+    def get_text_size_wh(self, txt):
+        ((txt_w, txt_h), _) = cv2.getTextSize(
+            txt, self.font_face, self.font_scale, self.font_line_thickness
+        )
+        return txt_w, txt_h
+
+
+class CompoundVisualizer:
+    def __init__(self, visualizers):
+        self.visualizers = visualizers
+
+    def visualize(self, image_bgr, data):
+        assert len(data) == len(
+            self.visualizers
+        ), "The number of datas {} should match the number of visualizers" " {}".format(
+            len(data), len(self.visualizers)
+        )
+        image = image_bgr
+        for i, visualizer in enumerate(self.visualizers):
+            image = visualizer.visualize(image, data[i])
+        return image
+
+    def __str__(self):
+        visualizer_str = ", ".join([str(v) for v in self.visualizers])
+        return "Compound Visualizer [{}]".format(visualizer_str)
diff --git a/densepose/vis/bounding_box.py b/densepose/vis/bounding_box.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a9d37f40b1b64832a53d8c2cc8ce3bf90b30a3b
--- /dev/null
+++ b/densepose/vis/bounding_box.py
@@ -0,0 +1,37 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .base import RectangleVisualizer, TextVisualizer
+
+
+class BoundingBoxVisualizer:
+    def __init__(self):
+        self.rectangle_visualizer = RectangleVisualizer()
+
+    def visualize(self, image_bgr, boxes_xywh):
+        for bbox_xywh in boxes_xywh:
+            image_bgr = self.rectangle_visualizer.visualize(image_bgr, bbox_xywh)
+        return image_bgr
+
+
+class ScoredBoundingBoxVisualizer:
+    def __init__(self, bbox_visualizer_params=None, score_visualizer_params=None, **kwargs):
+        if bbox_visualizer_params is None:
+            bbox_visualizer_params = {}
+        if score_visualizer_params is None:
+            score_visualizer_params = {}
+        self.visualizer_bbox = RectangleVisualizer(**bbox_visualizer_params)
+        self.visualizer_score = TextVisualizer(**score_visualizer_params)
+
+    def visualize(self, image_bgr, scored_bboxes):
+        boxes_xywh, box_scores = scored_bboxes
+        assert len(boxes_xywh) == len(
+            box_scores
+        ), "Number of bounding boxes {} should be equal to the number of scores {}".format(
+            len(boxes_xywh), len(box_scores)
+        )
+        for i, box_xywh in enumerate(boxes_xywh):
+            score_i = box_scores[i]
+            image_bgr = self.visualizer_bbox.visualize(image_bgr, box_xywh)
+            score_txt = "{0:6.4f}".format(score_i)
+            topleft_xy = box_xywh[0], box_xywh[1]
+            image_bgr = self.visualizer_score.visualize(image_bgr, score_txt, topleft_xy)
+        return image_bgr
diff --git a/densepose/vis/densepose_data_points.py b/densepose/vis/densepose_data_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..17e67cbf96022e09363cf1deac21814e4544f570
--- /dev/null
+++ b/densepose/vis/densepose_data_points.py
@@ -0,0 +1,106 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from typing import Iterable, Optional, Tuple
+import cv2
+
+from densepose.structures import DensePoseDataRelative
+
+from .base import Boxes, Image, MatrixVisualizer, PointsVisualizer
+
+
+class DensePoseDataCoarseSegmentationVisualizer:
+    """
+    Visualizer for ground truth segmentation
+    """
+
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace,
+            cmap=cmap,
+            val_scale=255.0 / DensePoseDataRelative.N_BODY_PARTS,
+            alpha=alpha,
+        )
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
+    ) -> Image:
+        if bbox_densepose_datas is None:
+            return image_bgr
+        for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
+            matrix = densepose_data.segm.numpy()
+            mask = np.zeros(matrix.shape, dtype=np.uint8)
+            mask[matrix > 0] = 1
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh.numpy())
+        return image_bgr
+
+
+class DensePoseDataPointsVisualizer:
+    def __init__(self, densepose_data_to_value_fn=None, cmap=cv2.COLORMAP_PARULA, **kwargs):
+        self.points_visualizer = PointsVisualizer()
+        self.densepose_data_to_value_fn = densepose_data_to_value_fn
+        self.cmap = cmap
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
+    ) -> Image:
+        if bbox_densepose_datas is None:
+            return image_bgr
+        for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
+            x0, y0, w, h = bbox_xywh.numpy()
+            x = densepose_data.x.numpy() * w / 255.0 + x0
+            y = densepose_data.y.numpy() * h / 255.0 + y0
+            pts_xy = zip(x, y)
+            if self.densepose_data_to_value_fn is None:
+                image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy)
+            else:
+                v = self.densepose_data_to_value_fn(densepose_data)
+                img_colors_bgr = cv2.applyColorMap(v, self.cmap)
+                colors_bgr = [
+                    [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
+                ]
+                image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy, colors_bgr)
+        return image_bgr
+
+
+def _densepose_data_u_for_cmap(densepose_data):
+    u = np.clip(densepose_data.u.numpy(), 0, 1) * 255.0
+    return u.astype(np.uint8)
+
+
+def _densepose_data_v_for_cmap(densepose_data):
+    v = np.clip(densepose_data.v.numpy(), 0, 1) * 255.0
+    return v.astype(np.uint8)
+
+
+def _densepose_data_i_for_cmap(densepose_data):
+    i = (
+        np.clip(densepose_data.i.numpy(), 0.0, DensePoseDataRelative.N_PART_LABELS)
+        * 255.0
+        / DensePoseDataRelative.N_PART_LABELS
+    )
+    return i.astype(np.uint8)
+
+
+class DensePoseDataPointsUVisualizer(DensePoseDataPointsVisualizer):
+    def __init__(self, **kwargs):
+        super(DensePoseDataPointsUVisualizer, self).__init__(
+            densepose_data_to_value_fn=_densepose_data_u_for_cmap, **kwargs
+        )
+
+
+class DensePoseDataPointsVVisualizer(DensePoseDataPointsVisualizer):
+    def __init__(self, **kwargs):
+        super(DensePoseDataPointsVVisualizer, self).__init__(
+            densepose_data_to_value_fn=_densepose_data_v_for_cmap, **kwargs
+        )
+
+
+class DensePoseDataPointsIVisualizer(DensePoseDataPointsVisualizer):
+    def __init__(self, **kwargs):
+        super(DensePoseDataPointsIVisualizer, self).__init__(
+            densepose_data_to_value_fn=_densepose_data_i_for_cmap, **kwargs
+        )
diff --git a/densepose/vis/densepose_outputs_iuv.py b/densepose/vis/densepose_outputs_iuv.py
new file mode 100644
index 0000000000000000000000000000000000000000..454627cbe9fd5507cae8cbb9aab7c7bc0ffcfab7
--- /dev/null
+++ b/densepose/vis/densepose_outputs_iuv.py
@@ -0,0 +1,101 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from typing import Optional, Tuple
+import cv2
+
+from densepose.structures import DensePoseDataRelative
+
+from ..structures import DensePoseChartPredictorOutput
+from .base import Boxes, Image, MatrixVisualizer
+
+
+class DensePoseOutputsVisualizer:
+    def __init__(
+        self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, to_visualize=None, **kwargs
+    ):
+        assert to_visualize in "IUV", "can only visualize IUV"
+        self.to_visualize = to_visualize
+
+        if self.to_visualize == "I":
+            val_scale = 255.0 / DensePoseDataRelative.N_PART_LABELS
+        else:
+            val_scale = 1.0
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
+        )
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        dp_output_with_bboxes: Tuple[Optional[DensePoseChartPredictorOutput], Optional[Boxes]],
+    ) -> Image:
+        densepose_output, bboxes_xywh = dp_output_with_bboxes
+        if densepose_output is None or bboxes_xywh is None:
+            return image_bgr
+
+        assert isinstance(
+            densepose_output, DensePoseChartPredictorOutput
+        ), "DensePoseChartPredictorOutput expected, {} encountered".format(type(densepose_output))
+
+        S = densepose_output.coarse_segm
+        I = densepose_output.fine_segm  # noqa
+        U = densepose_output.u
+        V = densepose_output.v
+        N = S.size(0)
+        assert N == I.size(
+            0
+        ), "densepose outputs S {} and I {}" " should have equal first dim size".format(
+            S.size(), I.size()
+        )
+        assert N == U.size(
+            0
+        ), "densepose outputs S {} and U {}" " should have equal first dim size".format(
+            S.size(), U.size()
+        )
+        assert N == V.size(
+            0
+        ), "densepose outputs S {} and V {}" " should have equal first dim size".format(
+            S.size(), V.size()
+        )
+        assert N == len(
+            bboxes_xywh
+        ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
+            len(bboxes_xywh), N
+        )
+        for n in range(N):
+            Sn = S[n].argmax(dim=0)
+            In = I[n].argmax(dim=0) * (Sn > 0).long()
+            segmentation = In.cpu().numpy().astype(np.uint8)
+            mask = np.zeros(segmentation.shape, dtype=np.uint8)
+            mask[segmentation > 0] = 1
+            bbox_xywh = bboxes_xywh[n]
+
+            if self.to_visualize == "I":
+                vis = segmentation
+            elif self.to_visualize in "UV":
+                U_or_Vn = {"U": U, "V": V}[self.to_visualize][n].cpu().numpy().astype(np.float32)
+                vis = np.zeros(segmentation.shape, dtype=np.float32)
+                for partId in range(U_or_Vn.shape[0]):
+                    vis[segmentation == partId] = (
+                        U_or_Vn[partId][segmentation == partId].clip(0, 1) * 255
+                    )
+
+            # pyre-fixme[61]: `vis` may not be initialized here.
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, vis, bbox_xywh)
+
+        return image_bgr
+
+
+class DensePoseOutputsUVisualizer(DensePoseOutputsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+        super().__init__(inplace=inplace, cmap=cmap, alpha=alpha, to_visualize="U", **kwargs)
+
+
+class DensePoseOutputsVVisualizer(DensePoseOutputsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+        super().__init__(inplace=inplace, cmap=cmap, alpha=alpha, to_visualize="V", **kwargs)
+
+
+class DensePoseOutputsFineSegmentationVisualizer(DensePoseOutputsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+        super().__init__(inplace=inplace, cmap=cmap, alpha=alpha, to_visualize="I", **kwargs)
diff --git a/densepose/vis/densepose_outputs_vertex.py b/densepose/vis/densepose_outputs_vertex.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e10bf82cfea92f488480d423d3ced5d61d04c49
--- /dev/null
+++ b/densepose/vis/densepose_outputs_vertex.py
@@ -0,0 +1,229 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import json
+import numpy as np
+from functools import lru_cache
+from typing import Dict, List, Optional, Tuple
+import cv2
+import torch
+
+from detectron2.utils.file_io import PathManager
+
+from densepose.modeling import build_densepose_embedder
+from densepose.modeling.cse.utils import get_closest_vertices_mask_from_ES
+
+from ..data.utils import get_class_to_mesh_name_mapping
+from ..structures import DensePoseEmbeddingPredictorOutput
+from ..structures.mesh import create_mesh
+from .base import Boxes, Image, MatrixVisualizer
+from .densepose_results_textures import get_texture_atlas
+
+
+@lru_cache()
+def get_xyz_vertex_embedding(mesh_name: str, device: torch.device):
+    if mesh_name == "smpl_27554":
+        embed_path = PathManager.get_local_path(
+            "https://dl.fbaipublicfiles.com/densepose/data/cse/mds_d=256.npy"
+        )
+        embed_map, _ = np.load(embed_path, allow_pickle=True)
+        embed_map = torch.tensor(embed_map).float()[:, 0]
+        embed_map -= embed_map.min()
+        embed_map /= embed_map.max()
+    else:
+        mesh = create_mesh(mesh_name, device)
+        embed_map = mesh.vertices.sum(dim=1)
+        embed_map -= embed_map.min()
+        embed_map /= embed_map.max()
+        embed_map = embed_map**2
+    return embed_map
+
+
+class DensePoseOutputsVertexVisualizer:
+    def __init__(
+        self,
+        cfg,
+        inplace=True,
+        cmap=cv2.COLORMAP_JET,
+        alpha=0.7,
+        device="cuda",
+        default_class=0,
+        **kwargs,
+    ):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha
+        )
+        self.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg)
+        self.embedder = build_densepose_embedder(cfg)
+        self.device = torch.device(device)
+        self.default_class = default_class
+
+        self.mesh_vertex_embeddings = {
+            mesh_name: self.embedder(mesh_name).to(self.device)
+            for mesh_name in self.class_to_mesh_name.values()
+            if self.embedder.has_embeddings(mesh_name)
+        }
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        outputs_boxes_xywh_classes: Tuple[
+            Optional[DensePoseEmbeddingPredictorOutput], Optional[Boxes], Optional[List[int]]
+        ],
+    ) -> Image:
+        if outputs_boxes_xywh_classes[0] is None:
+            return image_bgr
+
+        S, E, N, bboxes_xywh, pred_classes = self.extract_and_check_outputs_and_boxes(
+            outputs_boxes_xywh_classes
+        )
+
+        for n in range(N):
+            x, y, w, h = bboxes_xywh[n].int().tolist()
+            mesh_name = self.class_to_mesh_name[pred_classes[n]]
+            closest_vertices, mask = get_closest_vertices_mask_from_ES(
+                E[[n]],
+                S[[n]],
+                h,
+                w,
+                self.mesh_vertex_embeddings[mesh_name],
+                self.device,
+            )
+            embed_map = get_xyz_vertex_embedding(mesh_name, self.device)
+            vis = (embed_map[closest_vertices].clip(0, 1) * 255.0).cpu().numpy()
+            mask_numpy = mask.cpu().numpy().astype(dtype=np.uint8)
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask_numpy, vis, [x, y, w, h])
+
+        return image_bgr
+
+    def extract_and_check_outputs_and_boxes(self, outputs_boxes_xywh_classes):
+
+        densepose_output, bboxes_xywh, pred_classes = outputs_boxes_xywh_classes
+
+        if pred_classes is None:
+            pred_classes = [self.default_class] * len(bboxes_xywh)
+
+        assert isinstance(
+            densepose_output, DensePoseEmbeddingPredictorOutput
+        ), "DensePoseEmbeddingPredictorOutput expected, {} encountered".format(
+            type(densepose_output)
+        )
+
+        S = densepose_output.coarse_segm
+        E = densepose_output.embedding
+        N = S.size(0)
+        assert N == E.size(
+            0
+        ), "CSE coarse_segm {} and embeddings {}" " should have equal first dim size".format(
+            S.size(), E.size()
+        )
+        assert N == len(
+            bboxes_xywh
+        ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
+            len(bboxes_xywh), N
+        )
+        assert N == len(pred_classes), (
+            "number of predicted classes {}"
+            " should be equal to first dim size of outputs {}".format(len(bboxes_xywh), N)
+        )
+
+        return S, E, N, bboxes_xywh, pred_classes
+
+
+def get_texture_atlases(json_str: Optional[str]) -> Optional[Dict[str, Optional[np.ndarray]]]:
+    """
+    json_str is a JSON string representing a mesh_name -> texture_atlas_path dictionary
+    """
+    if json_str is None:
+        return None
+
+    paths = json.loads(json_str)
+    return {mesh_name: get_texture_atlas(path) for mesh_name, path in paths.items()}
+
+
+class DensePoseOutputsTextureVisualizer(DensePoseOutputsVertexVisualizer):
+    def __init__(
+        self,
+        cfg,
+        texture_atlases_dict,
+        device="cuda",
+        default_class=0,
+        **kwargs,
+    ):
+        self.embedder = build_densepose_embedder(cfg)
+
+        self.texture_image_dict = {}
+        self.alpha_dict = {}
+
+        for mesh_name in texture_atlases_dict.keys():
+            if texture_atlases_dict[mesh_name].shape[-1] == 4:  # Image with alpha channel
+                self.alpha_dict[mesh_name] = texture_atlases_dict[mesh_name][:, :, -1] / 255.0
+                self.texture_image_dict[mesh_name] = texture_atlases_dict[mesh_name][:, :, :3]
+            else:
+                self.alpha_dict[mesh_name] = texture_atlases_dict[mesh_name].sum(axis=-1) > 0
+                self.texture_image_dict[mesh_name] = texture_atlases_dict[mesh_name]
+
+        self.device = torch.device(device)
+        self.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg)
+        self.default_class = default_class
+
+        self.mesh_vertex_embeddings = {
+            mesh_name: self.embedder(mesh_name).to(self.device)
+            for mesh_name in self.class_to_mesh_name.values()
+        }
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        outputs_boxes_xywh_classes: Tuple[
+            Optional[DensePoseEmbeddingPredictorOutput], Optional[Boxes], Optional[List[int]]
+        ],
+    ) -> Image:
+        image_target_bgr = image_bgr.copy()
+        if outputs_boxes_xywh_classes[0] is None:
+            return image_target_bgr
+
+        S, E, N, bboxes_xywh, pred_classes = self.extract_and_check_outputs_and_boxes(
+            outputs_boxes_xywh_classes
+        )
+
+        meshes = {
+            p: create_mesh(self.class_to_mesh_name[p], self.device) for p in np.unique(pred_classes)
+        }
+
+        for n in range(N):
+            x, y, w, h = bboxes_xywh[n].int().cpu().numpy()
+            mesh_name = self.class_to_mesh_name[pred_classes[n]]
+            closest_vertices, mask = get_closest_vertices_mask_from_ES(
+                E[[n]],
+                S[[n]],
+                h,
+                w,
+                self.mesh_vertex_embeddings[mesh_name],
+                self.device,
+            )
+            uv_array = meshes[pred_classes[n]].texcoords[closest_vertices].permute((2, 0, 1))
+            uv_array = uv_array.cpu().numpy().clip(0, 1)
+            textured_image = self.generate_image_with_texture(
+                image_target_bgr[y : y + h, x : x + w],
+                uv_array,
+                mask.cpu().numpy(),
+                self.class_to_mesh_name[pred_classes[n]],
+            )
+            if textured_image is None:
+                continue
+            image_target_bgr[y : y + h, x : x + w] = textured_image
+
+        return image_target_bgr
+
+    def generate_image_with_texture(self, bbox_image_bgr, uv_array, mask, mesh_name):
+        alpha = self.alpha_dict.get(mesh_name)
+        texture_image = self.texture_image_dict.get(mesh_name)
+        if alpha is None or texture_image is None:
+            return None
+        U, V = uv_array
+        x_index = (U * texture_image.shape[1]).astype(int)
+        y_index = (V * texture_image.shape[0]).astype(int)
+        local_texture = texture_image[y_index, x_index][mask]
+        local_alpha = np.expand_dims(alpha[y_index, x_index][mask], -1)
+        output_image = bbox_image_bgr.copy()
+        output_image[mask] = output_image[mask] * (1 - local_alpha) + local_texture * local_alpha
+        return output_image.astype(np.uint8)
diff --git a/densepose/vis/densepose_results.py b/densepose/vis/densepose_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..a660d26eec6c873bd5cf9b35d0c8387075789afc
--- /dev/null
+++ b/densepose/vis/densepose_results.py
@@ -0,0 +1,355 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+from typing import List, Optional, Tuple
+import cv2
+import torch
+
+from densepose.structures import DensePoseDataRelative
+
+from ..structures import DensePoseChartResult
+from .base import Boxes, Image, MatrixVisualizer
+
+
+class DensePoseResultsVisualizer:
+    def visualize(
+        self,
+        image_bgr: Image,
+        results_and_boxes_xywh: Tuple[Optional[List[DensePoseChartResult]], Optional[Boxes]],
+    ) -> Image:
+        densepose_result, boxes_xywh = results_and_boxes_xywh
+        if densepose_result is None or boxes_xywh is None:
+            return image_bgr
+
+        boxes_xywh = boxes_xywh.cpu().numpy()
+        context = self.create_visualization_context(image_bgr)
+        for i, result in enumerate(densepose_result):
+            iuv_array = torch.cat(
+                (result.labels[None].type(torch.float32), result.uv * 255.0)
+            ).type(torch.uint8)
+            self.visualize_iuv_arr(context, iuv_array.cpu().numpy(), boxes_xywh[i])
+        image_bgr = self.context_to_image_bgr(context)
+        return image_bgr
+
+    def create_visualization_context(self, image_bgr: Image):
+        return image_bgr
+
+    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None:
+        pass
+
+    def context_to_image_bgr(self, context):
+        return context
+
+    def get_image_bgr_from_context(self, context):
+        return context
+
+
+class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer):
+    def __init__(
+        self,
+        data_extractor,
+        segm_extractor,
+        inplace=True,
+        cmap=cv2.COLORMAP_PARULA,
+        alpha=0.7,
+        val_scale=1.0,
+        **kwargs,
+    ):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
+        )
+        self.data_extractor = data_extractor
+        self.segm_extractor = segm_extractor
+
+    def context_to_image_bgr(self, context):
+        return context
+
+    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None:
+        image_bgr = self.get_image_bgr_from_context(context)
+        matrix = self.data_extractor(iuv_arr)
+        segm = self.segm_extractor(iuv_arr)
+        mask = np.zeros(matrix.shape, dtype=np.uint8)
+        mask[segm > 0] = 1
+        image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)
+
+
+def _extract_i_from_iuvarr(iuv_arr):
+    return iuv_arr[0, :, :]
+
+
+def _extract_u_from_iuvarr(iuv_arr):
+    return iuv_arr[1, :, :]
+
+
+def _extract_v_from_iuvarr(iuv_arr):
+    return iuv_arr[2, :, :]
+
+
+class DensePoseResultsMplContourVisualizer(DensePoseResultsVisualizer):
+    def __init__(self, levels=10, **kwargs):
+        self.levels = levels
+        self.plot_args = kwargs
+
+    def create_visualization_context(self, image_bgr: Image):
+        import matplotlib.pyplot as plt
+        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
+
+        context = {}
+        context["image_bgr"] = image_bgr
+        dpi = 100
+        height_inches = float(image_bgr.shape[0]) / dpi
+        width_inches = float(image_bgr.shape[1]) / dpi
+        fig = plt.figure(figsize=(width_inches, height_inches), dpi=dpi)
+        plt.axes([0, 0, 1, 1])
+        plt.axis("off")
+        context["fig"] = fig
+        canvas = FigureCanvas(fig)
+        context["canvas"] = canvas
+        extent = (0, image_bgr.shape[1], image_bgr.shape[0], 0)
+        plt.imshow(image_bgr[:, :, ::-1], extent=extent)
+        return context
+
+    def context_to_image_bgr(self, context):
+        fig = context["fig"]
+        w, h = map(int, fig.get_size_inches() * fig.get_dpi())
+        canvas = context["canvas"]
+        canvas.draw()
+        image_1d = np.fromstring(canvas.tostring_rgb(), dtype="uint8")
+        image_rgb = image_1d.reshape(h, w, 3)
+        image_bgr = image_rgb[:, :, ::-1].copy()
+        return image_bgr
+
+    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> None:
+        import matplotlib.pyplot as plt
+
+        u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
+        v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
+        extent = (
+            bbox_xywh[0],
+            bbox_xywh[0] + bbox_xywh[2],
+            bbox_xywh[1],
+            bbox_xywh[1] + bbox_xywh[3],
+        )
+        plt.contour(u, self.levels, extent=extent, **self.plot_args)
+        plt.contour(v, self.levels, extent=extent, **self.plot_args)
+
+
+class DensePoseResultsCustomContourVisualizer(DensePoseResultsVisualizer):
+    """
+    Contour visualization using marching squares
+    """
+
+    def __init__(self, levels=10, **kwargs):
+        # TODO: colormap is hardcoded
+        cmap = cv2.COLORMAP_PARULA
+        if isinstance(levels, int):
+            self.levels = np.linspace(0, 1, levels)
+        else:
+            self.levels = levels
+        if "linewidths" in kwargs:
+            self.linewidths = kwargs["linewidths"]
+        else:
+            self.linewidths = [1] * len(self.levels)
+        self.plot_args = kwargs
+        img_colors_bgr = cv2.applyColorMap((self.levels * 255).astype(np.uint8), cmap)
+        self.level_colors_bgr = [
+            [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
+        ]
+
+    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> None:
+        image_bgr = self.get_image_bgr_from_context(context)
+        segm = _extract_i_from_iuvarr(iuv_arr)
+        u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
+        v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
+        self._contours(image_bgr, u, segm, bbox_xywh)
+        self._contours(image_bgr, v, segm, bbox_xywh)
+
+    def _contours(self, image_bgr, arr, segm, bbox_xywh):
+        for part_idx in range(1, DensePoseDataRelative.N_PART_LABELS + 1):
+            mask = segm == part_idx
+            if not np.any(mask):
+                continue
+            arr_min = np.amin(arr[mask])
+            arr_max = np.amax(arr[mask])
+            I, J = np.nonzero(mask)
+            i0 = np.amin(I)
+            i1 = np.amax(I) + 1
+            j0 = np.amin(J)
+            j1 = np.amax(J) + 1
+            if (j1 == j0 + 1) or (i1 == i0 + 1):
+                continue
+            Nw = arr.shape[1] - 1
+            Nh = arr.shape[0] - 1
+            for level_idx, level in enumerate(self.levels):
+                if (level < arr_min) or (level > arr_max):
+                    continue
+                vp = arr[i0:i1, j0:j1] >= level
+                bin_codes = vp[:-1, :-1] + vp[1:, :-1] * 2 + vp[1:, 1:] * 4 + vp[:-1, 1:] * 8
+                mp = mask[i0:i1, j0:j1]
+                bin_mask_codes = mp[:-1, :-1] + mp[1:, :-1] * 2 + mp[1:, 1:] * 4 + mp[:-1, 1:] * 8
+                it = np.nditer(bin_codes, flags=["multi_index"])
+                color_bgr = self.level_colors_bgr[level_idx]
+                linewidth = self.linewidths[level_idx]
+                while not it.finished:
+                    if (it[0] != 0) and (it[0] != 15):
+                        i, j = it.multi_index
+                        if bin_mask_codes[i, j] != 0:
+                            self._draw_line(
+                                image_bgr,
+                                arr,
+                                mask,
+                                level,
+                                color_bgr,
+                                linewidth,
+                                it[0],
+                                it.multi_index,
+                                bbox_xywh,
+                                Nw,
+                                Nh,
+                                (i0, j0),
+                            )
+                    it.iternext()
+
+    def _draw_line(
+        self,
+        image_bgr,
+        arr,
+        mask,
+        v,
+        color_bgr,
+        linewidth,
+        bin_code,
+        multi_idx,
+        bbox_xywh,
+        Nw,
+        Nh,
+        offset,
+    ):
+        lines = self._bin_code_2_lines(arr, v, bin_code, multi_idx, Nw, Nh, offset)
+        x0, y0, w, h = bbox_xywh
+        x1 = x0 + w
+        y1 = y0 + h
+        for line in lines:
+            x0r, y0r = line[0]
+            x1r, y1r = line[1]
+            pt0 = (int(x0 + x0r * (x1 - x0)), int(y0 + y0r * (y1 - y0)))
+            pt1 = (int(x0 + x1r * (x1 - x0)), int(y0 + y1r * (y1 - y0)))
+            cv2.line(image_bgr, pt0, pt1, color_bgr, linewidth)
+
+    def _bin_code_2_lines(self, arr, v, bin_code, multi_idx, Nw, Nh, offset):
+        i0, j0 = offset
+        i, j = multi_idx
+        i += i0
+        j += j0
+        v0, v1, v2, v3 = arr[i, j], arr[i + 1, j], arr[i + 1, j + 1], arr[i, j + 1]
+        x0i = float(j) / Nw
+        y0j = float(i) / Nh
+        He = 1.0 / Nh
+        We = 1.0 / Nw
+        if (bin_code == 1) or (bin_code == 14):
+            a = (v - v0) / (v1 - v0)
+            b = (v - v0) / (v3 - v0)
+            pt1 = (x0i, y0j + a * He)
+            pt2 = (x0i + b * We, y0j)
+            return [(pt1, pt2)]
+        elif (bin_code == 2) or (bin_code == 13):
+            a = (v - v0) / (v1 - v0)
+            b = (v - v1) / (v2 - v1)
+            pt1 = (x0i, y0j + a * He)
+            pt2 = (x0i + b * We, y0j + He)
+            return [(pt1, pt2)]
+        elif (bin_code == 3) or (bin_code == 12):
+            a = (v - v0) / (v3 - v0)
+            b = (v - v1) / (v2 - v1)
+            pt1 = (x0i + a * We, y0j)
+            pt2 = (x0i + b * We, y0j + He)
+            return [(pt1, pt2)]
+        elif (bin_code == 4) or (bin_code == 11):
+            a = (v - v1) / (v2 - v1)
+            b = (v - v3) / (v2 - v3)
+            pt1 = (x0i + a * We, y0j + He)
+            pt2 = (x0i + We, y0j + b * He)
+            return [(pt1, pt2)]
+        elif (bin_code == 6) or (bin_code == 9):
+            a = (v - v0) / (v1 - v0)
+            b = (v - v3) / (v2 - v3)
+            pt1 = (x0i, y0j + a * He)
+            pt2 = (x0i + We, y0j + b * He)
+            return [(pt1, pt2)]
+        elif (bin_code == 7) or (bin_code == 8):
+            a = (v - v0) / (v3 - v0)
+            b = (v - v3) / (v2 - v3)
+            pt1 = (x0i + a * We, y0j)
+            pt2 = (x0i + We, y0j + b * He)
+            return [(pt1, pt2)]
+        elif bin_code == 5:
+            a1 = (v - v0) / (v1 - v0)
+            b1 = (v - v1) / (v2 - v1)
+            pt11 = (x0i, y0j + a1 * He)
+            pt12 = (x0i + b1 * We, y0j + He)
+            a2 = (v - v0) / (v3 - v0)
+            b2 = (v - v3) / (v2 - v3)
+            pt21 = (x0i + a2 * We, y0j)
+            pt22 = (x0i + We, y0j + b2 * He)
+            return [(pt11, pt12), (pt21, pt22)]
+        elif bin_code == 10:
+            a1 = (v - v0) / (v3 - v0)
+            b1 = (v - v0) / (v1 - v0)
+            pt11 = (x0i + a1 * We, y0j)
+            pt12 = (x0i, y0j + b1 * He)
+            a2 = (v - v1) / (v2 - v1)
+            b2 = (v - v3) / (v2 - v3)
+            pt21 = (x0i + a2 * We, y0j + He)
+            pt22 = (x0i + We, y0j + b2 * He)
+            return [(pt11, pt12), (pt21, pt22)]
+        return []
+
+
+try:
+    import matplotlib
+
+    matplotlib.use("Agg")
+    DensePoseResultsContourVisualizer = DensePoseResultsMplContourVisualizer
+except ModuleNotFoundError:
+    logger = logging.getLogger(__name__)
+    logger.warning("Could not import matplotlib, using custom contour visualizer")
+    DensePoseResultsContourVisualizer = DensePoseResultsCustomContourVisualizer
+
+
+class DensePoseResultsFineSegmentationVisualizer(DensePoseMaskedColormapResultsVisualizer):
+    def __init__(self, inplace=False, cmap=cv2.COLORMAP_PARULA, alpha=1, **kwargs):
+        super(DensePoseResultsFineSegmentationVisualizer, self).__init__(
+            _extract_i_from_iuvarr,
+            _extract_i_from_iuvarr,
+            inplace,
+            cmap,
+            alpha,
+            val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS,
+            **kwargs,
+        )
+
+
+class DensePoseResultsUVisualizer(DensePoseMaskedColormapResultsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+        super(DensePoseResultsUVisualizer, self).__init__(
+            _extract_u_from_iuvarr,
+            _extract_i_from_iuvarr,
+            inplace,
+            cmap,
+            alpha,
+            val_scale=1.0,
+            **kwargs,
+        )
+
+
+class DensePoseResultsVVisualizer(DensePoseMaskedColormapResultsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+        super(DensePoseResultsVVisualizer, self).__init__(
+            _extract_v_from_iuvarr,
+            _extract_i_from_iuvarr,
+            inplace,
+            cmap,
+            alpha,
+            val_scale=1.0,
+            **kwargs,
+        )
diff --git a/densepose/vis/densepose_results_textures.py b/densepose/vis/densepose_results_textures.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b02f2bdbaa8bb1b70bc0f690a568ac4f8f1c91a
--- /dev/null
+++ b/densepose/vis/densepose_results_textures.py
@@ -0,0 +1,91 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from typing import List, Optional, Tuple
+import torch
+
+from detectron2.data.detection_utils import read_image
+
+from ..structures import DensePoseChartResult
+from .base import Boxes, Image
+from .densepose_results import DensePoseResultsVisualizer
+
+
+def get_texture_atlas(path: Optional[str]) -> Optional[np.ndarray]:
+    if path is None:
+        return None
+
+    # Reading images like that downsamples 16-bit images to 8-bit
+    # If 16-bit images are needed, we can replace that by cv2.imread with the
+    # cv2.IMREAD_UNCHANGED flag (with cv2 we also need it to keep alpha channels)
+    # The rest of the pipeline would need to be adapted to 16-bit images too
+    bgr_image = read_image(path)
+    rgb_image = np.copy(bgr_image)  # Convert BGR -> RGB
+    rgb_image[:, :, :3] = rgb_image[:, :, 2::-1]  # Works with alpha channel
+    return rgb_image
+
+
+class DensePoseResultsVisualizerWithTexture(DensePoseResultsVisualizer):
+    """
+    texture_atlas: An image, size 6N * 4N, with N * N squares for each of the 24 body parts.
+            It must follow the grid found at https://github.com/facebookresearch/DensePose/blob/master/DensePoseData/demo_data/texture_atlas_200.png  # noqa
+            For each body part, U is proportional to the x coordinate, and (1 - V) to y
+    """
+
+    def __init__(self, texture_atlas, **kwargs):
+        self.texture_atlas = texture_atlas
+        self.body_part_size = texture_atlas.shape[0] // 6
+        assert self.body_part_size == texture_atlas.shape[1] // 4
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        results_and_boxes_xywh: Tuple[Optional[List[DensePoseChartResult]], Optional[Boxes]],
+    ) -> Image:
+        densepose_result, boxes_xywh = results_and_boxes_xywh
+        if densepose_result is None or boxes_xywh is None:
+            return image_bgr
+
+        boxes_xywh = boxes_xywh.int().cpu().numpy()
+        texture_image, alpha = self.get_texture()
+        for i, result in enumerate(densepose_result):
+            iuv_array = torch.cat((result.labels[None], result.uv.clamp(0, 1)))
+            x, y, w, h = boxes_xywh[i]
+            bbox_image = image_bgr[y : y + h, x : x + w]
+            image_bgr[y : y + h, x : x + w] = self.generate_image_with_texture(
+                texture_image, alpha, bbox_image, iuv_array.cpu().numpy()
+            )
+        return image_bgr
+
+    def get_texture(self):
+        N = self.body_part_size
+        texture_image = np.zeros([24, N, N, self.texture_atlas.shape[-1]])
+        for i in range(4):
+            for j in range(6):
+                texture_image[(6 * i + j), :, :, :] = self.texture_atlas[
+                    N * j : N * (j + 1), N * i : N * (i + 1), :
+                ]
+
+        if texture_image.shape[-1] == 4:  # Image with alpha channel
+            alpha = texture_image[:, :, :, -1] / 255.0
+            texture_image = texture_image[:, :, :, :3]
+        else:
+            alpha = texture_image.sum(axis=-1) > 0
+
+        return texture_image, alpha
+
+    def generate_image_with_texture(self, texture_image, alpha, bbox_image_bgr, iuv_array):
+
+        I, U, V = iuv_array
+        generated_image_bgr = bbox_image_bgr.copy()
+
+        for PartInd in range(1, 25):
+            x, y = np.where(I == PartInd)
+            x_index = (U[x, y] * (self.body_part_size - 1)).astype(int)
+            y_index = ((1 - V[x, y]) * (self.body_part_size - 1)).astype(int)
+            part_alpha = np.expand_dims(alpha[PartInd - 1, y_index, x_index], -1)
+            generated_image_bgr[I == PartInd] = (
+                generated_image_bgr[I == PartInd] * (1 - part_alpha)
+                + texture_image[PartInd - 1, y_index, x_index] * part_alpha
+            )
+
+        return generated_image_bgr.astype(np.uint8)
diff --git a/densepose/vis/extractor.py b/densepose/vis/extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..543efac9272746b8cc130534d5d09251404c8d97
--- /dev/null
+++ b/densepose/vis/extractor.py
@@ -0,0 +1,199 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from typing import List, Optional, Sequence, Tuple
+import torch
+
+from detectron2.layers.nms import batched_nms
+from detectron2.structures.instances import Instances
+
+from densepose.converters import ToChartResultConverterWithConfidences
+from densepose.structures import (
+    DensePoseChartResultWithConfidences,
+    DensePoseEmbeddingPredictorOutput,
+)
+from densepose.vis.bounding_box import BoundingBoxVisualizer, ScoredBoundingBoxVisualizer
+from densepose.vis.densepose_outputs_vertex import DensePoseOutputsVertexVisualizer
+from densepose.vis.densepose_results import DensePoseResultsVisualizer
+
+from .base import CompoundVisualizer
+
+Scores = Sequence[float]
+DensePoseChartResultsWithConfidences = List[DensePoseChartResultWithConfidences]
+
+
+def extract_scores_from_instances(instances: Instances, select=None):
+    if instances.has("scores"):
+        return instances.scores if select is None else instances.scores[select]
+    return None
+
+
+def extract_boxes_xywh_from_instances(instances: Instances, select=None):
+    if instances.has("pred_boxes"):
+        boxes_xywh = instances.pred_boxes.tensor.clone()
+        boxes_xywh[:, 2] -= boxes_xywh[:, 0]
+        boxes_xywh[:, 3] -= boxes_xywh[:, 1]
+        return boxes_xywh if select is None else boxes_xywh[select]
+    return None
+
+
+def create_extractor(visualizer: object):
+    """
+    Create an extractor for the provided visualizer
+    """
+    if isinstance(visualizer, CompoundVisualizer):
+        extractors = [create_extractor(v) for v in visualizer.visualizers]
+        return CompoundExtractor(extractors)
+    elif isinstance(visualizer, DensePoseResultsVisualizer):
+        return DensePoseResultExtractor()
+    elif isinstance(visualizer, ScoredBoundingBoxVisualizer):
+        return CompoundExtractor([extract_boxes_xywh_from_instances, extract_scores_from_instances])
+    elif isinstance(visualizer, BoundingBoxVisualizer):
+        return extract_boxes_xywh_from_instances
+    elif isinstance(visualizer, DensePoseOutputsVertexVisualizer):
+        return DensePoseOutputsExtractor()
+    else:
+        logger = logging.getLogger(__name__)
+        logger.error(f"Could not create extractor for {visualizer}")
+        return None
+
+
+class BoundingBoxExtractor:
+    """
+    Extracts bounding boxes from instances
+    """
+
+    def __call__(self, instances: Instances):
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        return boxes_xywh
+
+
+class ScoredBoundingBoxExtractor:
+    """
+    Extracts bounding boxes from instances
+    """
+
+    def __call__(self, instances: Instances, select=None):
+        scores = extract_scores_from_instances(instances)
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        if (scores is None) or (boxes_xywh is None):
+            return (boxes_xywh, scores)
+        if select is not None:
+            scores = scores[select]
+            boxes_xywh = boxes_xywh[select]
+        return (boxes_xywh, scores)
+
+
+class DensePoseResultExtractor:
+    """
+    Extracts DensePose chart result with confidences from instances
+    """
+
+    def __call__(
+        self, instances: Instances, select=None
+    ) -> Tuple[Optional[DensePoseChartResultsWithConfidences], Optional[torch.Tensor]]:
+        if instances.has("pred_densepose") and instances.has("pred_boxes"):
+            dpout = instances.pred_densepose
+            boxes_xyxy = instances.pred_boxes
+            boxes_xywh = extract_boxes_xywh_from_instances(instances)
+            if select is not None:
+                dpout = dpout[select]
+                boxes_xyxy = boxes_xyxy[select]
+            converter = ToChartResultConverterWithConfidences()
+            results = [converter.convert(dpout[i], boxes_xyxy[[i]]) for i in range(len(dpout))]
+            return results, boxes_xywh
+        else:
+            return None, None
+
+
+class DensePoseOutputsExtractor:
+    """
+    Extracts DensePose result from instances
+    """
+
+    def __call__(
+        self,
+        instances: Instances,
+        select=None,
+    ) -> Tuple[
+        Optional[DensePoseEmbeddingPredictorOutput], Optional[torch.Tensor], Optional[List[int]]
+    ]:
+        if not (instances.has("pred_densepose") and instances.has("pred_boxes")):
+            return None, None, None
+
+        dpout = instances.pred_densepose
+        boxes_xyxy = instances.pred_boxes
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+
+        if instances.has("pred_classes"):
+            classes = instances.pred_classes.tolist()
+        else:
+            classes = None
+
+        if select is not None:
+            dpout = dpout[select]
+            boxes_xyxy = boxes_xyxy[select]
+            if classes is not None:
+                classes = classes[select]
+
+        return dpout, boxes_xywh, classes
+
+
+class CompoundExtractor:
+    """
+    Extracts data for CompoundVisualizer
+    """
+
+    def __init__(self, extractors):
+        self.extractors = extractors
+
+    def __call__(self, instances: Instances, select=None):
+        datas = []
+        for extractor in self.extractors:
+            data = extractor(instances, select)
+            datas.append(data)
+        return datas
+
+
+class NmsFilteredExtractor:
+    """
+    Extracts data in the format accepted by NmsFilteredVisualizer
+    """
+
+    def __init__(self, extractor, iou_threshold):
+        self.extractor = extractor
+        self.iou_threshold = iou_threshold
+
+    def __call__(self, instances: Instances, select=None):
+        scores = extract_scores_from_instances(instances)
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        if boxes_xywh is None:
+            return None
+        select_local_idx = batched_nms(
+            boxes_xywh,
+            scores,
+            torch.zeros(len(scores), dtype=torch.int32),
+            iou_threshold=self.iou_threshold,
+        ).squeeze()
+        select_local = torch.zeros(len(boxes_xywh), dtype=torch.bool, device=boxes_xywh.device)
+        select_local[select_local_idx] = True
+        select = select_local if select is None else (select & select_local)
+        return self.extractor(instances, select=select)
+
+
+class ScoreThresholdedExtractor:
+    """
+    Extracts data in the format accepted by ScoreThresholdedVisualizer
+    """
+
+    def __init__(self, extractor, min_score):
+        self.extractor = extractor
+        self.min_score = min_score
+
+    def __call__(self, instances: Instances, select=None):
+        scores = extract_scores_from_instances(instances)
+        if scores is None:
+            return None
+        select_local = scores > self.min_score
+        select = select_local if select is None else (select & select_local)
+        data = self.extractor(instances, select=select)
+        return data
diff --git a/detectron2/_C.cpython-39-x86_64-linux-gnu.so b/detectron2/_C.cpython-39-x86_64-linux-gnu.so
new file mode 100755
index 0000000000000000000000000000000000000000..55ca5abd74a0890de6f14969bbe64e5707050ceb
Binary files /dev/null and b/detectron2/_C.cpython-39-x86_64-linux-gnu.so differ
diff --git a/detectron2/__init__.py b/detectron2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdd994b49294485c27610772f97f177741f5518f
--- /dev/null
+++ b/detectron2/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .utils.env import setup_environment
+
+setup_environment()
+
+
+# This line will be programatically read/write by setup.py.
+# Leave them at the bottom of this file and don't touch them.
+__version__ = "0.6"
diff --git a/detectron2/checkpoint/__init__.py b/detectron2/checkpoint/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..99da0469ae7e169d8970e4b642fed3f870076860
--- /dev/null
+++ b/detectron2/checkpoint/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+# File:
+
+
+from . import catalog as _UNUSED  # register the handler
+from .detection_checkpoint import DetectionCheckpointer
+from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer
+
+__all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"]
diff --git a/detectron2/checkpoint/__pycache__/__init__.cpython-310.pyc b/detectron2/checkpoint/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f986df4e591afca8050ac4e9aab8373c0efd1e8
Binary files /dev/null and b/detectron2/checkpoint/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/checkpoint/__pycache__/__init__.cpython-39.pyc b/detectron2/checkpoint/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11c5dd7b12338563f348934d668016511e27dbd5
Binary files /dev/null and b/detectron2/checkpoint/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/checkpoint/__pycache__/c2_model_loading.cpython-310.pyc b/detectron2/checkpoint/__pycache__/c2_model_loading.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf98c5519e210445967db636e2244055c38cfda5
Binary files /dev/null and b/detectron2/checkpoint/__pycache__/c2_model_loading.cpython-310.pyc differ
diff --git a/detectron2/checkpoint/__pycache__/c2_model_loading.cpython-39.pyc b/detectron2/checkpoint/__pycache__/c2_model_loading.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce464e7b10f405b4623aa8f4e7305d86d5d1bb17
Binary files /dev/null and b/detectron2/checkpoint/__pycache__/c2_model_loading.cpython-39.pyc differ
diff --git a/detectron2/checkpoint/__pycache__/catalog.cpython-310.pyc b/detectron2/checkpoint/__pycache__/catalog.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23c48961f17e0f4a92e89e5146bc0480f2bb0b77
Binary files /dev/null and b/detectron2/checkpoint/__pycache__/catalog.cpython-310.pyc differ
diff --git a/detectron2/checkpoint/__pycache__/catalog.cpython-39.pyc b/detectron2/checkpoint/__pycache__/catalog.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f693734458ef75027a9c01dc682fe5de1436bf63
Binary files /dev/null and b/detectron2/checkpoint/__pycache__/catalog.cpython-39.pyc differ
diff --git a/detectron2/checkpoint/__pycache__/detection_checkpoint.cpython-310.pyc b/detectron2/checkpoint/__pycache__/detection_checkpoint.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd8d1d3239f37d14c4a03f06a09012738145b111
Binary files /dev/null and b/detectron2/checkpoint/__pycache__/detection_checkpoint.cpython-310.pyc differ
diff --git a/detectron2/checkpoint/__pycache__/detection_checkpoint.cpython-39.pyc b/detectron2/checkpoint/__pycache__/detection_checkpoint.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10cfb8a0f9dd4ae433e7cc9483479ed4fbd692b3
Binary files /dev/null and b/detectron2/checkpoint/__pycache__/detection_checkpoint.cpython-39.pyc differ
diff --git a/detectron2/checkpoint/c2_model_loading.py b/detectron2/checkpoint/c2_model_loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cf8b77a93f76a3a6149f121222918acbb9e3994
--- /dev/null
+++ b/detectron2/checkpoint/c2_model_loading.py
@@ -0,0 +1,407 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import re
+from typing import Dict, List
+import torch
+from tabulate import tabulate
+
+
+def convert_basic_c2_names(original_keys):
+    """
+    Apply some basic name conversion to names in C2 weights.
+    It only deals with typical backbone models.
+
+    Args:
+        original_keys (list[str]):
+    Returns:
+        list[str]: The same number of strings matching those in original_keys.
+    """
+    layer_keys = copy.deepcopy(original_keys)
+    layer_keys = [
+        {"pred_b": "linear_b", "pred_w": "linear_w"}.get(k, k) for k in layer_keys
+    ]  # some hard-coded mappings
+
+    layer_keys = [k.replace("_", ".") for k in layer_keys]
+    layer_keys = [re.sub("\\.b$", ".bias", k) for k in layer_keys]
+    layer_keys = [re.sub("\\.w$", ".weight", k) for k in layer_keys]
+    # Uniform both bn and gn names to "norm"
+    layer_keys = [re.sub("bn\\.s$", "norm.weight", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.bias$", "norm.bias", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.rm", "norm.running_mean", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.running.mean$", "norm.running_mean", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.riv$", "norm.running_var", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.running.var$", "norm.running_var", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.gamma$", "norm.weight", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.beta$", "norm.bias", k) for k in layer_keys]
+    layer_keys = [re.sub("gn\\.s$", "norm.weight", k) for k in layer_keys]
+    layer_keys = [re.sub("gn\\.bias$", "norm.bias", k) for k in layer_keys]
+
+    # stem
+    layer_keys = [re.sub("^res\\.conv1\\.norm\\.", "conv1.norm.", k) for k in layer_keys]
+    # to avoid mis-matching with "conv1" in other components (e.g. detection head)
+    layer_keys = [re.sub("^conv1\\.", "stem.conv1.", k) for k in layer_keys]
+
+    # layer1-4 is used by torchvision, however we follow the C2 naming strategy (res2-5)
+    # layer_keys = [re.sub("^res2.", "layer1.", k) for k in layer_keys]
+    # layer_keys = [re.sub("^res3.", "layer2.", k) for k in layer_keys]
+    # layer_keys = [re.sub("^res4.", "layer3.", k) for k in layer_keys]
+    # layer_keys = [re.sub("^res5.", "layer4.", k) for k in layer_keys]
+
+    # blocks
+    layer_keys = [k.replace(".branch1.", ".shortcut.") for k in layer_keys]
+    layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys]
+    layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys]
+    layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys]
+
+    # DensePose substitutions
+    layer_keys = [re.sub("^body.conv.fcn", "body_conv_fcn", k) for k in layer_keys]
+    layer_keys = [k.replace("AnnIndex.lowres", "ann_index_lowres") for k in layer_keys]
+    layer_keys = [k.replace("Index.UV.lowres", "index_uv_lowres") for k in layer_keys]
+    layer_keys = [k.replace("U.lowres", "u_lowres") for k in layer_keys]
+    layer_keys = [k.replace("V.lowres", "v_lowres") for k in layer_keys]
+    return layer_keys
+
+
+def convert_c2_detectron_names(weights):
+    """
+    Map Caffe2 Detectron weight names to Detectron2 names.
+
+    Args:
+        weights (dict): name -> tensor
+
+    Returns:
+        dict: detectron2 names -> tensor
+        dict: detectron2 names -> C2 names
+    """
+    logger = logging.getLogger(__name__)
+    logger.info("Renaming Caffe2 weights ......")
+    original_keys = sorted(weights.keys())
+    layer_keys = copy.deepcopy(original_keys)
+
+    layer_keys = convert_basic_c2_names(layer_keys)
+
+    # --------------------------------------------------------------------------
+    # RPN hidden representation conv
+    # --------------------------------------------------------------------------
+    # FPN case
+    # In the C2 model, the RPN hidden layer conv is defined for FPN level 2 and then
+    # shared for all other levels, hence the appearance of "fpn2"
+    layer_keys = [
+        k.replace("conv.rpn.fpn2", "proposal_generator.rpn_head.conv") for k in layer_keys
+    ]
+    # Non-FPN case
+    layer_keys = [k.replace("conv.rpn", "proposal_generator.rpn_head.conv") for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # RPN box transformation conv
+    # --------------------------------------------------------------------------
+    # FPN case (see note above about "fpn2")
+    layer_keys = [
+        k.replace("rpn.bbox.pred.fpn2", "proposal_generator.rpn_head.anchor_deltas")
+        for k in layer_keys
+    ]
+    layer_keys = [
+        k.replace("rpn.cls.logits.fpn2", "proposal_generator.rpn_head.objectness_logits")
+        for k in layer_keys
+    ]
+    # Non-FPN case
+    layer_keys = [
+        k.replace("rpn.bbox.pred", "proposal_generator.rpn_head.anchor_deltas") for k in layer_keys
+    ]
+    layer_keys = [
+        k.replace("rpn.cls.logits", "proposal_generator.rpn_head.objectness_logits")
+        for k in layer_keys
+    ]
+
+    # --------------------------------------------------------------------------
+    # Fast R-CNN box head
+    # --------------------------------------------------------------------------
+    layer_keys = [re.sub("^bbox\\.pred", "bbox_pred", k) for k in layer_keys]
+    layer_keys = [re.sub("^cls\\.score", "cls_score", k) for k in layer_keys]
+    layer_keys = [re.sub("^fc6\\.", "box_head.fc1.", k) for k in layer_keys]
+    layer_keys = [re.sub("^fc7\\.", "box_head.fc2.", k) for k in layer_keys]
+    # 4conv1fc head tensor names: head_conv1_w, head_conv1_gn_s
+    layer_keys = [re.sub("^head\\.conv", "box_head.conv", k) for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # FPN lateral and output convolutions
+    # --------------------------------------------------------------------------
+    def fpn_map(name):
+        """
+        Look for keys with the following patterns:
+        1) Starts with "fpn.inner."
+           Example: "fpn.inner.res2.2.sum.lateral.weight"
+           Meaning: These are lateral pathway convolutions
+        2) Starts with "fpn.res"
+           Example: "fpn.res2.2.sum.weight"
+           Meaning: These are FPN output convolutions
+        """
+        splits = name.split(".")
+        norm = ".norm" if "norm" in splits else ""
+        if name.startswith("fpn.inner."):
+            # splits example: ['fpn', 'inner', 'res2', '2', 'sum', 'lateral', 'weight']
+            stage = int(splits[2][len("res") :])
+            return "fpn_lateral{}{}.{}".format(stage, norm, splits[-1])
+        elif name.startswith("fpn.res"):
+            # splits example: ['fpn', 'res2', '2', 'sum', 'weight']
+            stage = int(splits[1][len("res") :])
+            return "fpn_output{}{}.{}".format(stage, norm, splits[-1])
+        return name
+
+    layer_keys = [fpn_map(k) for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # Mask R-CNN mask head
+    # --------------------------------------------------------------------------
+    # roi_heads.StandardROIHeads case
+    layer_keys = [k.replace(".[mask].fcn", "mask_head.mask_fcn") for k in layer_keys]
+    layer_keys = [re.sub("^\\.mask\\.fcn", "mask_head.mask_fcn", k) for k in layer_keys]
+    layer_keys = [k.replace("mask.fcn.logits", "mask_head.predictor") for k in layer_keys]
+    # roi_heads.Res5ROIHeads case
+    layer_keys = [k.replace("conv5.mask", "mask_head.deconv") for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # Keypoint R-CNN head
+    # --------------------------------------------------------------------------
+    # interestingly, the keypoint head convs have blob names that are simply "conv_fcnX"
+    layer_keys = [k.replace("conv.fcn", "roi_heads.keypoint_head.conv_fcn") for k in layer_keys]
+    layer_keys = [
+        k.replace("kps.score.lowres", "roi_heads.keypoint_head.score_lowres") for k in layer_keys
+    ]
+    layer_keys = [k.replace("kps.score.", "roi_heads.keypoint_head.score.") for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # Done with replacements
+    # --------------------------------------------------------------------------
+    assert len(set(layer_keys)) == len(layer_keys)
+    assert len(original_keys) == len(layer_keys)
+
+    new_weights = {}
+    new_keys_to_original_keys = {}
+    for orig, renamed in zip(original_keys, layer_keys):
+        new_keys_to_original_keys[renamed] = orig
+        if renamed.startswith("bbox_pred.") or renamed.startswith("mask_head.predictor."):
+            # remove the meaningless prediction weight for background class
+            new_start_idx = 4 if renamed.startswith("bbox_pred.") else 1
+            new_weights[renamed] = weights[orig][new_start_idx:]
+            logger.info(
+                "Remove prediction weight for background class in {}. The shape changes from "
+                "{} to {}.".format(
+                    renamed, tuple(weights[orig].shape), tuple(new_weights[renamed].shape)
+                )
+            )
+        elif renamed.startswith("cls_score."):
+            # move weights of bg class from original index 0 to last index
+            logger.info(
+                "Move classification weights for background class in {} from index 0 to "
+                "index {}.".format(renamed, weights[orig].shape[0] - 1)
+            )
+            new_weights[renamed] = torch.cat([weights[orig][1:], weights[orig][:1]])
+        else:
+            new_weights[renamed] = weights[orig]
+
+    return new_weights, new_keys_to_original_keys
+
+
+# Note the current matching is not symmetric.
+# it assumes model_state_dict will have longer names.
+def align_and_update_state_dicts(model_state_dict, ckpt_state_dict, c2_conversion=True):
+    """
+    Match names between the two state-dict, and returns a new chkpt_state_dict with names
+    converted to match model_state_dict with heuristics. The returned dict can be later
+    loaded with fvcore checkpointer.
+    If `c2_conversion==True`, `ckpt_state_dict` is assumed to be a Caffe2
+    model and will be renamed at first.
+
+    Strategy: suppose that the models that we will create will have prefixes appended
+    to each of its keys, for example due to an extra level of nesting that the original
+    pre-trained weights from ImageNet won't contain. For example, model.state_dict()
+    might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains
+    res2.conv1.weight. We thus want to match both parameters together.
+    For that, we look for each model weight, look among all loaded keys if there is one
+    that is a suffix of the current weight name, and use it if that's the case.
+    If multiple matches exist, take the one with longest size
+    of the corresponding name. For example, for the same model as before, the pretrained
+    weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case,
+    we want to match backbone[0].body.conv1.weight to conv1.weight, and
+    backbone[0].body.res2.conv1.weight to res2.conv1.weight.
+    """
+    model_keys = sorted(model_state_dict.keys())
+    if c2_conversion:
+        ckpt_state_dict, original_keys = convert_c2_detectron_names(ckpt_state_dict)
+        # original_keys: the name in the original dict (before renaming)
+    else:
+        original_keys = {x: x for x in ckpt_state_dict.keys()}
+    ckpt_keys = sorted(ckpt_state_dict.keys())
+
+    def match(a, b):
+        # Matched ckpt_key should be a complete (starts with '.') suffix.
+        # For example, roi_heads.mesh_head.whatever_conv1 does not match conv1,
+        # but matches whatever_conv1 or mesh_head.whatever_conv1.
+        return a == b or a.endswith("." + b)
+
+    # get a matrix of string matches, where each (i, j) entry correspond to the size of the
+    # ckpt_key string, if it matches
+    match_matrix = [len(j) if match(i, j) else 0 for i in model_keys for j in ckpt_keys]
+    match_matrix = torch.as_tensor(match_matrix).view(len(model_keys), len(ckpt_keys))
+    # use the matched one with longest size in case of multiple matches
+    max_match_size, idxs = match_matrix.max(1)
+    # remove indices that correspond to no-match
+    idxs[max_match_size == 0] = -1
+
+    logger = logging.getLogger(__name__)
+    # matched_pairs (matched checkpoint key --> matched model key)
+    matched_keys = {}
+    result_state_dict = {}
+    for idx_model, idx_ckpt in enumerate(idxs.tolist()):
+        if idx_ckpt == -1:
+            continue
+        key_model = model_keys[idx_model]
+        key_ckpt = ckpt_keys[idx_ckpt]
+        value_ckpt = ckpt_state_dict[key_ckpt]
+        shape_in_model = model_state_dict[key_model].shape
+
+        if shape_in_model != value_ckpt.shape:
+            logger.warning(
+                "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format(
+                    key_ckpt, value_ckpt.shape, key_model, shape_in_model
+                )
+            )
+            logger.warning(
+                "{} will not be loaded. Please double check and see if this is desired.".format(
+                    key_ckpt
+                )
+            )
+            continue
+
+        assert key_model not in result_state_dict
+        result_state_dict[key_model] = value_ckpt
+        if key_ckpt in matched_keys:  # already added to matched_keys
+            logger.error(
+                "Ambiguity found for {} in checkpoint!"
+                "It matches at least two keys in the model ({} and {}).".format(
+                    key_ckpt, key_model, matched_keys[key_ckpt]
+                )
+            )
+            raise ValueError("Cannot match one checkpoint key to multiple keys in the model.")
+
+        matched_keys[key_ckpt] = key_model
+
+    # logging:
+    matched_model_keys = sorted(matched_keys.values())
+    if len(matched_model_keys) == 0:
+        logger.warning("No weights in checkpoint matched with model.")
+        return ckpt_state_dict
+    common_prefix = _longest_common_prefix(matched_model_keys)
+    rev_matched_keys = {v: k for k, v in matched_keys.items()}
+    original_keys = {k: original_keys[rev_matched_keys[k]] for k in matched_model_keys}
+
+    model_key_groups = _group_keys_by_module(matched_model_keys, original_keys)
+    table = []
+    memo = set()
+    for key_model in matched_model_keys:
+        if key_model in memo:
+            continue
+        if key_model in model_key_groups:
+            group = model_key_groups[key_model]
+            memo |= set(group)
+            shapes = [tuple(model_state_dict[k].shape) for k in group]
+            table.append(
+                (
+                    _longest_common_prefix([k[len(common_prefix) :] for k in group]) + "*",
+                    _group_str([original_keys[k] for k in group]),
+                    " ".join([str(x).replace(" ", "") for x in shapes]),
+                )
+            )
+        else:
+            key_checkpoint = original_keys[key_model]
+            shape = str(tuple(model_state_dict[key_model].shape))
+            table.append((key_model[len(common_prefix) :], key_checkpoint, shape))
+    submodule_str = common_prefix[:-1] if common_prefix else "model"
+    logger.info(
+        f"Following weights matched with submodule {submodule_str} - Total num: {len(table)}"
+    )
+
+    unmatched_ckpt_keys = [k for k in ckpt_keys if k not in set(matched_keys.keys())]
+    for k in unmatched_ckpt_keys:
+        result_state_dict[k] = ckpt_state_dict[k]
+    return result_state_dict
+
+
+def _group_keys_by_module(keys: List[str], original_names: Dict[str, str]):
+    """
+    Params in the same submodule are grouped together.
+
+    Args:
+        keys: names of all parameters
+        original_names: mapping from parameter name to their name in the checkpoint
+
+    Returns:
+        dict[name -> all other names in the same group]
+    """
+
+    def _submodule_name(key):
+        pos = key.rfind(".")
+        if pos < 0:
+            return None
+        prefix = key[: pos + 1]
+        return prefix
+
+    all_submodules = [_submodule_name(k) for k in keys]
+    all_submodules = [x for x in all_submodules if x]
+    all_submodules = sorted(all_submodules, key=len)
+
+    ret = {}
+    for prefix in all_submodules:
+        group = [k for k in keys if k.startswith(prefix)]
+        if len(group) <= 1:
+            continue
+        original_name_lcp = _longest_common_prefix_str([original_names[k] for k in group])
+        if len(original_name_lcp) == 0:
+            # don't group weights if original names don't share prefix
+            continue
+
+        for k in group:
+            if k in ret:
+                continue
+            ret[k] = group
+    return ret
+
+
+def _longest_common_prefix(names: List[str]) -> str:
+    """
+    ["abc.zfg", "abc.zef"] -> "abc."
+    """
+    names = [n.split(".") for n in names]
+    m1, m2 = min(names), max(names)
+    ret = [a for a, b in zip(m1, m2) if a == b]
+    ret = ".".join(ret) + "." if len(ret) else ""
+    return ret
+
+
+def _longest_common_prefix_str(names: List[str]) -> str:
+    m1, m2 = min(names), max(names)
+    lcp = []
+    for a, b in zip(m1, m2):
+        if a == b:
+            lcp.append(a)
+        else:
+            break
+    lcp = "".join(lcp)
+    return lcp
+
+
+def _group_str(names: List[str]) -> str:
+    """
+    Turn "common1", "common2", "common3" into "common{1,2,3}"
+    """
+    lcp = _longest_common_prefix_str(names)
+    rest = [x[len(lcp) :] for x in names]
+    rest = "{" + ",".join(rest) + "}"
+    ret = lcp + rest
+
+    # add some simplification for BN specifically
+    ret = ret.replace("bn_{beta,running_mean,running_var,gamma}", "bn_*")
+    ret = ret.replace("bn_beta,bn_running_mean,bn_running_var,bn_gamma", "bn_*")
+    return ret
diff --git a/detectron2/checkpoint/catalog.py b/detectron2/checkpoint/catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..c954fde210ba9b8124239c989f0a97e3ffcffcfe
--- /dev/null
+++ b/detectron2/checkpoint/catalog.py
@@ -0,0 +1,115 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+
+from detectron2.utils.file_io import PathHandler, PathManager
+
+
+class ModelCatalog:
+    """
+    Store mappings from names to third-party models.
+    """
+
+    S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron"
+
+    # MSRA models have STRIDE_IN_1X1=True. False otherwise.
+    # NOTE: all BN models here have fused BN into an affine layer.
+    # As a result, you should only load them to a model with "FrozenBN".
+    # Loading them to a model with regular BN or SyncBN is wrong.
+    # Even when loaded to FrozenBN, it is still different from affine by an epsilon,
+    # which should be negligible for training.
+    # NOTE: all models here uses PIXEL_STD=[1,1,1]
+    # NOTE: Most of the BN models here are no longer used. We use the
+    # re-converted pre-trained models under detectron2 model zoo instead.
+    C2_IMAGENET_MODELS = {
+        "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
+        "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
+        "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
+        "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
+        "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
+        "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
+        "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl",
+    }
+
+    C2_DETECTRON_PATH_FORMAT = (
+        "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl"  # noqa B950
+    )
+
+    C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival"
+    C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival"
+
+    # format: {model_name} -> part of the url
+    C2_DETECTRON_MODELS = {
+        "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW",  # noqa B950
+        "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I",  # noqa B950
+        "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7",  # noqa B950
+        "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ",  # noqa B950
+        "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB",  # noqa B950
+        "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC",  # noqa B950
+        "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT",  # noqa B950
+        "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI",  # noqa B950
+        "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q",  # noqa B950
+        "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao",  # noqa B950
+        "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L",  # noqa B950
+        "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179",  # noqa B950
+        "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2",  # noqa B950
+    }
+
+    @staticmethod
+    def get(name):
+        if name.startswith("Caffe2Detectron/COCO"):
+            return ModelCatalog._get_c2_detectron_baseline(name)
+        if name.startswith("ImageNetPretrained/"):
+            return ModelCatalog._get_c2_imagenet_pretrained(name)
+        raise RuntimeError("model not present in the catalog: {}".format(name))
+
+    @staticmethod
+    def _get_c2_imagenet_pretrained(name):
+        prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX
+        name = name[len("ImageNetPretrained/") :]
+        name = ModelCatalog.C2_IMAGENET_MODELS[name]
+        url = "/".join([prefix, name])
+        return url
+
+    @staticmethod
+    def _get_c2_detectron_baseline(name):
+        name = name[len("Caffe2Detectron/COCO/") :]
+        url = ModelCatalog.C2_DETECTRON_MODELS[name]
+        if "keypoint_rcnn" in name:
+            dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS
+        else:
+            dataset = ModelCatalog.C2_DATASET_COCO
+
+        if "35998355/rpn_R-50-C4_1x" in name:
+            # this one model is somehow different from others ..
+            type = "rpn"
+        else:
+            type = "generalized_rcnn"
+
+        # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`.
+        url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format(
+            prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset
+        )
+        return url
+
+
+class ModelCatalogHandler(PathHandler):
+    """
+    Resolve URL like catalog://.
+    """
+
+    PREFIX = "catalog://"
+
+    def _get_supported_prefixes(self):
+        return [self.PREFIX]
+
+    def _get_local_path(self, path, **kwargs):
+        logger = logging.getLogger(__name__)
+        catalog_path = ModelCatalog.get(path[len(self.PREFIX) :])
+        logger.info("Catalog entry {} points to {}".format(path, catalog_path))
+        return PathManager.get_local_path(catalog_path, **kwargs)
+
+    def _open(self, path, mode="r", **kwargs):
+        return PathManager.open(self._get_local_path(path), mode, **kwargs)
+
+
+PathManager.register_handler(ModelCatalogHandler())
diff --git a/detectron2/checkpoint/detection_checkpoint.py b/detectron2/checkpoint/detection_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..cecb1fc2cfe46283b47096bcbcb2be3181431bf2
--- /dev/null
+++ b/detectron2/checkpoint/detection_checkpoint.py
@@ -0,0 +1,143 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import os
+import pickle
+from urllib.parse import parse_qs, urlparse
+import torch
+from fvcore.common.checkpoint import Checkpointer
+from torch.nn.parallel import DistributedDataParallel
+
+import detectron2.utils.comm as comm
+from detectron2.utils.file_io import PathManager
+
+from .c2_model_loading import align_and_update_state_dicts
+
+
+class DetectionCheckpointer(Checkpointer):
+    """
+    Same as :class:`Checkpointer`, but is able to:
+    1. handle models in detectron & detectron2 model zoo, and apply conversions for legacy models.
+    2. correctly load checkpoints that are only available on the master worker
+    """
+
+    def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
+        is_main_process = comm.is_main_process()
+        super().__init__(
+            model,
+            save_dir,
+            save_to_disk=is_main_process if save_to_disk is None else save_to_disk,
+            **checkpointables,
+        )
+        self.path_manager = PathManager
+        self._parsed_url_during_load = None
+
+    def load(self, path, *args, **kwargs):
+        assert self._parsed_url_during_load is None
+        need_sync = False
+        logger = logging.getLogger(__name__)
+        logger.info("[DetectionCheckpointer] Loading from {} ...".format(path))
+
+        if path and isinstance(self.model, DistributedDataParallel):
+            path = self.path_manager.get_local_path(path)
+            has_file = os.path.isfile(path)
+            all_has_file = comm.all_gather(has_file)
+            if not all_has_file[0]:
+                raise OSError(f"File {path} not found on main worker.")
+            if not all(all_has_file):
+                logger.warning(
+                    f"Not all workers can read checkpoint {path}. "
+                    "Training may fail to fully resume."
+                )
+                # TODO: broadcast the checkpoint file contents from main
+                # worker, and load from it instead.
+                need_sync = True
+            if not has_file:
+                path = None  # don't load if not readable
+
+        if path:
+            parsed_url = urlparse(path)
+            self._parsed_url_during_load = parsed_url
+            path = parsed_url._replace(query="").geturl()  # remove query from filename
+            path = self.path_manager.get_local_path(path)
+        ret = super().load(path, *args, **kwargs)
+
+        if need_sync:
+            logger.info("Broadcasting model states from main worker ...")
+            self.model._sync_params_and_buffers()
+        self._parsed_url_during_load = None  # reset to None
+        return ret
+
+    def _load_file(self, filename):
+        if filename.endswith(".pkl"):
+            with PathManager.open(filename, "rb") as f:
+                data = pickle.load(f, encoding="latin1")
+            if "model" in data and "__author__" in data:
+                # file is in Detectron2 model zoo format
+                self.logger.info("Reading a file from '{}'".format(data["__author__"]))
+                return data
+            else:
+                # assume file is from Caffe2 / Detectron1 model zoo
+                if "blobs" in data:
+                    # Detection models have "blobs", but ImageNet models don't
+                    data = data["blobs"]
+                data = {k: v for k, v in data.items() if not k.endswith("_momentum")}
+                return {"model": data, "__author__": "Caffe2", "matching_heuristics": True}
+        elif filename.endswith(".pyth"):
+            # assume file is from pycls; no one else seems to use the ".pyth" extension
+            with PathManager.open(filename, "rb") as f:
+                data = torch.load(f)
+            assert (
+                "model_state" in data
+            ), f"Cannot load .pyth file {filename}; pycls checkpoints must contain 'model_state'."
+            model_state = {
+                k: v
+                for k, v in data["model_state"].items()
+                if not k.endswith("num_batches_tracked")
+            }
+            return {"model": model_state, "__author__": "pycls", "matching_heuristics": True}
+
+        loaded = self._torch_load(filename)
+        if "model" not in loaded:
+            loaded = {"model": loaded}
+        assert self._parsed_url_during_load is not None, "`_load_file` must be called inside `load`"
+        parsed_url = self._parsed_url_during_load
+        queries = parse_qs(parsed_url.query)
+        if queries.pop("matching_heuristics", "False") == ["True"]:
+            loaded["matching_heuristics"] = True
+        if len(queries) > 0:
+            raise ValueError(
+                f"Unsupported query remaining: f{queries}, orginal filename: {parsed_url.geturl()}"
+            )
+        return loaded
+
+    def _torch_load(self, f):
+        return super()._load_file(f)
+
+    def _load_model(self, checkpoint):
+        if checkpoint.get("matching_heuristics", False):
+            self._convert_ndarray_to_tensor(checkpoint["model"])
+            # convert weights by name-matching heuristics
+            checkpoint["model"] = align_and_update_state_dicts(
+                self.model.state_dict(),
+                checkpoint["model"],
+                c2_conversion=checkpoint.get("__author__", None) == "Caffe2",
+            )
+        # for non-caffe2 models, use standard ways to load it
+        incompatible = super()._load_model(checkpoint)
+
+        model_buffers = dict(self.model.named_buffers(recurse=False))
+        for k in ["pixel_mean", "pixel_std"]:
+            # Ignore missing key message about pixel_mean/std.
+            # Though they may be missing in old checkpoints, they will be correctly
+            # initialized from config anyway.
+            if k in model_buffers:
+                try:
+                    incompatible.missing_keys.remove(k)
+                except ValueError:
+                    pass
+        for k in incompatible.unexpected_keys[:]:
+            # Ignore unexpected keys about cell anchors. They exist in old checkpoints
+            # but now they are non-persistent buffers and will not be in new checkpoints.
+            if "anchor_generator.cell_anchors" in k:
+                incompatible.unexpected_keys.remove(k)
+        return incompatible
diff --git a/detectron2/config/__init__.py b/detectron2/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e648e632d55c70f160d49630378d202fbde4e45
--- /dev/null
+++ b/detectron2/config/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .compat import downgrade_config, upgrade_config
+from .config import CfgNode, get_cfg, global_cfg, set_global_cfg, configurable
+from .instantiate import instantiate
+from .lazy import LazyCall, LazyConfig
+
+__all__ = [
+    "CfgNode",
+    "get_cfg",
+    "global_cfg",
+    "set_global_cfg",
+    "downgrade_config",
+    "upgrade_config",
+    "configurable",
+    "instantiate",
+    "LazyCall",
+    "LazyConfig",
+]
+
+
+from detectron2.utils.env import fixup_module_metadata
+
+fixup_module_metadata(__name__, globals(), __all__)
+del fixup_module_metadata
diff --git a/detectron2/config/__pycache__/__init__.cpython-310.pyc b/detectron2/config/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a39b6eda7fe4e4cac5fe87834b648469f3b331d8
Binary files /dev/null and b/detectron2/config/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/config/__pycache__/__init__.cpython-39.pyc b/detectron2/config/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cad270940e484f0df216ad3783cea238f78f0983
Binary files /dev/null and b/detectron2/config/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/config/__pycache__/compat.cpython-310.pyc b/detectron2/config/__pycache__/compat.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6558a350fe58a7bfcbca9c852256f9084c924a54
Binary files /dev/null and b/detectron2/config/__pycache__/compat.cpython-310.pyc differ
diff --git a/detectron2/config/__pycache__/compat.cpython-39.pyc b/detectron2/config/__pycache__/compat.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..225bf398fa9977c782cbdc8a5e547d2486bb9110
Binary files /dev/null and b/detectron2/config/__pycache__/compat.cpython-39.pyc differ
diff --git a/detectron2/config/__pycache__/config.cpython-310.pyc b/detectron2/config/__pycache__/config.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7ad82e63e83eabb9294736b72396a4440ffeb1d
Binary files /dev/null and b/detectron2/config/__pycache__/config.cpython-310.pyc differ
diff --git a/detectron2/config/__pycache__/config.cpython-39.pyc b/detectron2/config/__pycache__/config.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16326488d28763155a7c200aa98735091c0b828b
Binary files /dev/null and b/detectron2/config/__pycache__/config.cpython-39.pyc differ
diff --git a/detectron2/config/__pycache__/defaults.cpython-310.pyc b/detectron2/config/__pycache__/defaults.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d01081c3cf7dde40612aec8080ce04c8ad8d06b
Binary files /dev/null and b/detectron2/config/__pycache__/defaults.cpython-310.pyc differ
diff --git a/detectron2/config/__pycache__/defaults.cpython-39.pyc b/detectron2/config/__pycache__/defaults.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3df6f73447d7ce9383f606d06a4a7beea5b7905c
Binary files /dev/null and b/detectron2/config/__pycache__/defaults.cpython-39.pyc differ
diff --git a/detectron2/config/__pycache__/instantiate.cpython-310.pyc b/detectron2/config/__pycache__/instantiate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33d2dc76a7e8126942139c5fa0d9ccd41fba0a83
Binary files /dev/null and b/detectron2/config/__pycache__/instantiate.cpython-310.pyc differ
diff --git a/detectron2/config/__pycache__/instantiate.cpython-39.pyc b/detectron2/config/__pycache__/instantiate.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f980ab7f529b8ef657fdfb3688daff163177dae5
Binary files /dev/null and b/detectron2/config/__pycache__/instantiate.cpython-39.pyc differ
diff --git a/detectron2/config/__pycache__/lazy.cpython-310.pyc b/detectron2/config/__pycache__/lazy.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a95d843ba48763b1ee6e25ec6e35343550eeeab
Binary files /dev/null and b/detectron2/config/__pycache__/lazy.cpython-310.pyc differ
diff --git a/detectron2/config/__pycache__/lazy.cpython-39.pyc b/detectron2/config/__pycache__/lazy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8924d36dbe86439e1e658af8f62eb91a1277b9e3
Binary files /dev/null and b/detectron2/config/__pycache__/lazy.cpython-39.pyc differ
diff --git a/detectron2/config/compat.py b/detectron2/config/compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..11a08c439bf14defd880e37a938fab8a08e68eeb
--- /dev/null
+++ b/detectron2/config/compat.py
@@ -0,0 +1,229 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+Backward compatibility of configs.
+
+Instructions to bump version:
++ It's not needed to bump version if new keys are added.
+  It's only needed when backward-incompatible changes happen
+  (i.e., some existing keys disappear, or the meaning of a key changes)
++ To bump version, do the following:
+    1. Increment _C.VERSION in defaults.py
+    2. Add a converter in this file.
+
+      Each ConverterVX has a function "upgrade" which in-place upgrades config from X-1 to X,
+      and a function "downgrade" which in-place downgrades config from X to X-1
+
+      In each function, VERSION is left unchanged.
+
+      Each converter assumes that its input has the relevant keys
+      (i.e., the input is not a partial config).
+    3. Run the tests (test_config.py) to make sure the upgrade & downgrade
+       functions are consistent.
+"""
+
+import logging
+from typing import List, Optional, Tuple
+
+from .config import CfgNode as CN
+from .defaults import _C
+
+__all__ = ["upgrade_config", "downgrade_config"]
+
+
+def upgrade_config(cfg: CN, to_version: Optional[int] = None) -> CN:
+    """
+    Upgrade a config from its current version to a newer version.
+
+    Args:
+        cfg (CfgNode):
+        to_version (int): defaults to the latest version.
+    """
+    cfg = cfg.clone()
+    if to_version is None:
+        to_version = _C.VERSION
+
+    assert cfg.VERSION <= to_version, "Cannot upgrade from v{} to v{}!".format(
+        cfg.VERSION, to_version
+    )
+    for k in range(cfg.VERSION, to_version):
+        converter = globals()["ConverterV" + str(k + 1)]
+        converter.upgrade(cfg)
+        cfg.VERSION = k + 1
+    return cfg
+
+
+def downgrade_config(cfg: CN, to_version: int) -> CN:
+    """
+    Downgrade a config from its current version to an older version.
+
+    Args:
+        cfg (CfgNode):
+        to_version (int):
+
+    Note:
+        A general downgrade of arbitrary configs is not always possible due to the
+        different functionalities in different versions.
+        The purpose of downgrade is only to recover the defaults in old versions,
+        allowing it to load an old partial yaml config.
+        Therefore, the implementation only needs to fill in the default values
+        in the old version when a general downgrade is not possible.
+    """
+    cfg = cfg.clone()
+    assert cfg.VERSION >= to_version, "Cannot downgrade from v{} to v{}!".format(
+        cfg.VERSION, to_version
+    )
+    for k in range(cfg.VERSION, to_version, -1):
+        converter = globals()["ConverterV" + str(k)]
+        converter.downgrade(cfg)
+        cfg.VERSION = k - 1
+    return cfg
+
+
+def guess_version(cfg: CN, filename: str) -> int:
+    """
+    Guess the version of a partial config where the VERSION field is not specified.
+    Returns the version, or the latest if cannot make a guess.
+
+    This makes it easier for users to migrate.
+    """
+    logger = logging.getLogger(__name__)
+
+    def _has(name: str) -> bool:
+        cur = cfg
+        for n in name.split("."):
+            if n not in cur:
+                return False
+            cur = cur[n]
+        return True
+
+    # Most users' partial configs have "MODEL.WEIGHT", so guess on it
+    ret = None
+    if _has("MODEL.WEIGHT") or _has("TEST.AUG_ON"):
+        ret = 1
+
+    if ret is not None:
+        logger.warning("Config '{}' has no VERSION. Assuming it to be v{}.".format(filename, ret))
+    else:
+        ret = _C.VERSION
+        logger.warning(
+            "Config '{}' has no VERSION. Assuming it to be compatible with latest v{}.".format(
+                filename, ret
+            )
+        )
+    return ret
+
+
+def _rename(cfg: CN, old: str, new: str) -> None:
+    old_keys = old.split(".")
+    new_keys = new.split(".")
+
+    def _set(key_seq: List[str], val: str) -> None:
+        cur = cfg
+        for k in key_seq[:-1]:
+            if k not in cur:
+                cur[k] = CN()
+            cur = cur[k]
+        cur[key_seq[-1]] = val
+
+    def _get(key_seq: List[str]) -> CN:
+        cur = cfg
+        for k in key_seq:
+            cur = cur[k]
+        return cur
+
+    def _del(key_seq: List[str]) -> None:
+        cur = cfg
+        for k in key_seq[:-1]:
+            cur = cur[k]
+        del cur[key_seq[-1]]
+        if len(cur) == 0 and len(key_seq) > 1:
+            _del(key_seq[:-1])
+
+    _set(new_keys, _get(old_keys))
+    _del(old_keys)
+
+
+class _RenameConverter:
+    """
+    A converter that handles simple rename.
+    """
+
+    RENAME: List[Tuple[str, str]] = []  # list of tuples of (old name, new name)
+
+    @classmethod
+    def upgrade(cls, cfg: CN) -> None:
+        for old, new in cls.RENAME:
+            _rename(cfg, old, new)
+
+    @classmethod
+    def downgrade(cls, cfg: CN) -> None:
+        for old, new in cls.RENAME[::-1]:
+            _rename(cfg, new, old)
+
+
+class ConverterV1(_RenameConverter):
+    RENAME = [("MODEL.RPN_HEAD.NAME", "MODEL.RPN.HEAD_NAME")]
+
+
+class ConverterV2(_RenameConverter):
+    """
+    A large bulk of rename, before public release.
+    """
+
+    RENAME = [
+        ("MODEL.WEIGHT", "MODEL.WEIGHTS"),
+        ("MODEL.PANOPTIC_FPN.SEMANTIC_LOSS_SCALE", "MODEL.SEM_SEG_HEAD.LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.RPN_LOSS_SCALE", "MODEL.RPN.LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.INSTANCE_LOSS_SCALE", "MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.COMBINE_ON", "MODEL.PANOPTIC_FPN.COMBINE.ENABLED"),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_OVERLAP_THRESHOLD",
+            "MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH",
+        ),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_STUFF_AREA_LIMIT",
+            "MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT",
+        ),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_INSTANCES_CONFIDENCE_THRESHOLD",
+            "MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH",
+        ),
+        ("MODEL.ROI_HEADS.SCORE_THRESH", "MODEL.ROI_HEADS.SCORE_THRESH_TEST"),
+        ("MODEL.ROI_HEADS.NMS", "MODEL.ROI_HEADS.NMS_THRESH_TEST"),
+        ("MODEL.RETINANET.INFERENCE_SCORE_THRESHOLD", "MODEL.RETINANET.SCORE_THRESH_TEST"),
+        ("MODEL.RETINANET.INFERENCE_TOPK_CANDIDATES", "MODEL.RETINANET.TOPK_CANDIDATES_TEST"),
+        ("MODEL.RETINANET.INFERENCE_NMS_THRESHOLD", "MODEL.RETINANET.NMS_THRESH_TEST"),
+        ("TEST.DETECTIONS_PER_IMG", "TEST.DETECTIONS_PER_IMAGE"),
+        ("TEST.AUG_ON", "TEST.AUG.ENABLED"),
+        ("TEST.AUG_MIN_SIZES", "TEST.AUG.MIN_SIZES"),
+        ("TEST.AUG_MAX_SIZE", "TEST.AUG.MAX_SIZE"),
+        ("TEST.AUG_FLIP", "TEST.AUG.FLIP"),
+    ]
+
+    @classmethod
+    def upgrade(cls, cfg: CN) -> None:
+        super().upgrade(cfg)
+
+        if cfg.MODEL.META_ARCHITECTURE == "RetinaNet":
+            _rename(
+                cfg, "MODEL.RETINANET.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS"
+            )
+            _rename(cfg, "MODEL.RETINANET.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+            del cfg["MODEL"]["RPN"]["ANCHOR_SIZES"]
+            del cfg["MODEL"]["RPN"]["ANCHOR_ASPECT_RATIOS"]
+        else:
+            _rename(cfg, "MODEL.RPN.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS")
+            _rename(cfg, "MODEL.RPN.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+            del cfg["MODEL"]["RETINANET"]["ANCHOR_SIZES"]
+            del cfg["MODEL"]["RETINANET"]["ANCHOR_ASPECT_RATIOS"]
+        del cfg["MODEL"]["RETINANET"]["ANCHOR_STRIDES"]
+
+    @classmethod
+    def downgrade(cls, cfg: CN) -> None:
+        super().downgrade(cfg)
+
+        _rename(cfg, "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS", "MODEL.RPN.ANCHOR_ASPECT_RATIOS")
+        _rename(cfg, "MODEL.ANCHOR_GENERATOR.SIZES", "MODEL.RPN.ANCHOR_SIZES")
+        cfg.MODEL.RETINANET.ANCHOR_ASPECT_RATIOS = cfg.MODEL.RPN.ANCHOR_ASPECT_RATIOS
+        cfg.MODEL.RETINANET.ANCHOR_SIZES = cfg.MODEL.RPN.ANCHOR_SIZES
+        cfg.MODEL.RETINANET.ANCHOR_STRIDES = []  # this is not used anywhere in any version
diff --git a/detectron2/config/config.py b/detectron2/config/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..49a55b1bc87509e2bb24b902ae12c21d5aaeda81
--- /dev/null
+++ b/detectron2/config/config.py
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import functools
+import inspect
+import logging
+from fvcore.common.config import CfgNode as _CfgNode
+
+from detectron2.utils.file_io import PathManager
+
+
+class CfgNode(_CfgNode):
+    """
+    The same as `fvcore.common.config.CfgNode`, but different in:
+
+    1. Use unsafe yaml loading by default.
+       Note that this may lead to arbitrary code execution: you must not
+       load a config file from untrusted sources before manually inspecting
+       the content of the file.
+    2. Support config versioning.
+       When attempting to merge an old config, it will convert the old config automatically.
+
+    .. automethod:: clone
+    .. automethod:: freeze
+    .. automethod:: defrost
+    .. automethod:: is_frozen
+    .. automethod:: load_yaml_with_base
+    .. automethod:: merge_from_list
+    .. automethod:: merge_from_other_cfg
+    """
+
+    @classmethod
+    def _open_cfg(cls, filename):
+        return PathManager.open(filename, "r")
+
+    # Note that the default value of allow_unsafe is changed to True
+    def merge_from_file(self, cfg_filename: str, allow_unsafe: bool = True) -> None:
+        """
+        Load content from the given config file and merge it into self.
+
+        Args:
+            cfg_filename: config filename
+            allow_unsafe: allow unsafe yaml syntax
+        """
+        assert PathManager.isfile(cfg_filename), f"Config file '{cfg_filename}' does not exist!"
+        loaded_cfg = self.load_yaml_with_base(cfg_filename, allow_unsafe=allow_unsafe)
+        loaded_cfg = type(self)(loaded_cfg)
+
+        # defaults.py needs to import CfgNode
+        from .defaults import _C
+
+        latest_ver = _C.VERSION
+        assert (
+            latest_ver == self.VERSION
+        ), "CfgNode.merge_from_file is only allowed on a config object of latest version!"
+
+        logger = logging.getLogger(__name__)
+
+        loaded_ver = loaded_cfg.get("VERSION", None)
+        if loaded_ver is None:
+            from .compat import guess_version
+
+            loaded_ver = guess_version(loaded_cfg, cfg_filename)
+        assert loaded_ver <= self.VERSION, "Cannot merge a v{} config into a v{} config.".format(
+            loaded_ver, self.VERSION
+        )
+
+        if loaded_ver == self.VERSION:
+            self.merge_from_other_cfg(loaded_cfg)
+        else:
+            # compat.py needs to import CfgNode
+            from .compat import upgrade_config, downgrade_config
+
+            logger.warning(
+                "Loading an old v{} config file '{}' by automatically upgrading to v{}. "
+                "See docs/CHANGELOG.md for instructions to update your files.".format(
+                    loaded_ver, cfg_filename, self.VERSION
+                )
+            )
+            # To convert, first obtain a full config at an old version
+            old_self = downgrade_config(self, to_version=loaded_ver)
+            old_self.merge_from_other_cfg(loaded_cfg)
+            new_config = upgrade_config(old_self)
+            self.clear()
+            self.update(new_config)
+
+    def dump(self, *args, **kwargs):
+        """
+        Returns:
+            str: a yaml string representation of the config
+        """
+        # to make it show up in docs
+        return super().dump(*args, **kwargs)
+
+
+global_cfg = CfgNode()
+
+
+def get_cfg() -> CfgNode:
+    """
+    Get a copy of the default config.
+
+    Returns:
+        a detectron2 CfgNode instance.
+    """
+    from .defaults import _C
+
+    return _C.clone()
+
+
+def set_global_cfg(cfg: CfgNode) -> None:
+    """
+    Let the global config point to the given cfg.
+
+    Assume that the given "cfg" has the key "KEY", after calling
+    `set_global_cfg(cfg)`, the key can be accessed by:
+    ::
+        from detectron2.config import global_cfg
+        print(global_cfg.KEY)
+
+    By using a hacky global config, you can access these configs anywhere,
+    without having to pass the config object or the values deep into the code.
+    This is a hacky feature introduced for quick prototyping / research exploration.
+    """
+    global global_cfg
+    global_cfg.clear()
+    global_cfg.update(cfg)
+
+
+def configurable(init_func=None, *, from_config=None):
+    """
+    Decorate a function or a class's __init__ method so that it can be called
+    with a :class:`CfgNode` object using a :func:`from_config` function that translates
+    :class:`CfgNode` to arguments.
+
+    Examples:
+    ::
+        # Usage 1: Decorator on __init__:
+        class A:
+            @configurable
+            def __init__(self, a, b=2, c=3):
+                pass
+
+            @classmethod
+            def from_config(cls, cfg):   # 'cfg' must be the first argument
+                # Returns kwargs to be passed to __init__
+                return {"a": cfg.A, "b": cfg.B}
+
+        a1 = A(a=1, b=2)  # regular construction
+        a2 = A(cfg)       # construct with a cfg
+        a3 = A(cfg, b=3, c=4)  # construct with extra overwrite
+
+        # Usage 2: Decorator on any function. Needs an extra from_config argument:
+        @configurable(from_config=lambda cfg: {"a: cfg.A, "b": cfg.B})
+        def a_func(a, b=2, c=3):
+            pass
+
+        a1 = a_func(a=1, b=2)  # regular call
+        a2 = a_func(cfg)       # call with a cfg
+        a3 = a_func(cfg, b=3, c=4)  # call with extra overwrite
+
+    Args:
+        init_func (callable): a class's ``__init__`` method in usage 1. The
+            class must have a ``from_config`` classmethod which takes `cfg` as
+            the first argument.
+        from_config (callable): the from_config function in usage 2. It must take `cfg`
+            as its first argument.
+    """
+
+    if init_func is not None:
+        assert (
+            inspect.isfunction(init_func)
+            and from_config is None
+            and init_func.__name__ == "__init__"
+        ), "Incorrect use of @configurable. Check API documentation for examples."
+
+        @functools.wraps(init_func)
+        def wrapped(self, *args, **kwargs):
+            try:
+                from_config_func = type(self).from_config
+            except AttributeError as e:
+                raise AttributeError(
+                    "Class with @configurable must have a 'from_config' classmethod."
+                ) from e
+            if not inspect.ismethod(from_config_func):
+                raise TypeError("Class with @configurable must have a 'from_config' classmethod.")
+
+            if _called_with_cfg(*args, **kwargs):
+                explicit_args = _get_args_from_config(from_config_func, *args, **kwargs)
+                init_func(self, **explicit_args)
+            else:
+                init_func(self, *args, **kwargs)
+
+        return wrapped
+
+    else:
+        if from_config is None:
+            return configurable  # @configurable() is made equivalent to @configurable
+        assert inspect.isfunction(
+            from_config
+        ), "from_config argument of configurable must be a function!"
+
+        def wrapper(orig_func):
+            @functools.wraps(orig_func)
+            def wrapped(*args, **kwargs):
+                if _called_with_cfg(*args, **kwargs):
+                    explicit_args = _get_args_from_config(from_config, *args, **kwargs)
+                    return orig_func(**explicit_args)
+                else:
+                    return orig_func(*args, **kwargs)
+
+            wrapped.from_config = from_config
+            return wrapped
+
+        return wrapper
+
+
+def _get_args_from_config(from_config_func, *args, **kwargs):
+    """
+    Use `from_config` to obtain explicit arguments.
+
+    Returns:
+        dict: arguments to be used for cls.__init__
+    """
+    signature = inspect.signature(from_config_func)
+    if list(signature.parameters.keys())[0] != "cfg":
+        if inspect.isfunction(from_config_func):
+            name = from_config_func.__name__
+        else:
+            name = f"{from_config_func.__self__}.from_config"
+        raise TypeError(f"{name} must take 'cfg' as the first argument!")
+    support_var_arg = any(
+        param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD]
+        for param in signature.parameters.values()
+    )
+    if support_var_arg:  # forward all arguments to from_config, if from_config accepts them
+        ret = from_config_func(*args, **kwargs)
+    else:
+        # forward supported arguments to from_config
+        supported_arg_names = set(signature.parameters.keys())
+        extra_kwargs = {}
+        for name in list(kwargs.keys()):
+            if name not in supported_arg_names:
+                extra_kwargs[name] = kwargs.pop(name)
+        ret = from_config_func(*args, **kwargs)
+        # forward the other arguments to __init__
+        ret.update(extra_kwargs)
+    return ret
+
+
+def _called_with_cfg(*args, **kwargs):
+    """
+    Returns:
+        bool: whether the arguments contain CfgNode and should be considered
+            forwarded to from_config.
+    """
+    from omegaconf import DictConfig
+
+    if len(args) and isinstance(args[0], (_CfgNode, DictConfig)):
+        return True
+    if isinstance(kwargs.pop("cfg", None), (_CfgNode, DictConfig)):
+        return True
+    # `from_config`'s first argument is forced to be "cfg".
+    # So the above check covers all cases.
+    return False
diff --git a/detectron2/config/defaults.py b/detectron2/config/defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd2a5f6b2de4af2caa1f65c64ab93a5e3ac21780
--- /dev/null
+++ b/detectron2/config/defaults.py
@@ -0,0 +1,650 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .config import CfgNode as CN
+
+# NOTE: given the new config system
+# (https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html),
+# we will stop adding new functionalities to default CfgNode.
+
+# -----------------------------------------------------------------------------
+# Convention about Training / Test specific parameters
+# -----------------------------------------------------------------------------
+# Whenever an argument can be either used for training or for testing, the
+# corresponding name will be post-fixed by a _TRAIN for a training parameter,
+# or _TEST for a test-specific parameter.
+# For example, the number of images during training will be
+# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
+# IMAGES_PER_BATCH_TEST
+
+# -----------------------------------------------------------------------------
+# Config definition
+# -----------------------------------------------------------------------------
+
+_C = CN()
+
+# The version number, to upgrade from old configs to new ones if any
+# changes happen. It's recommended to keep a VERSION in your config file.
+_C.VERSION = 2
+
+_C.MODEL = CN()
+_C.MODEL.LOAD_PROPOSALS = False
+_C.MODEL.MASK_ON = False
+_C.MODEL.KEYPOINT_ON = False
+_C.MODEL.DEVICE = "cuda"
+_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
+
+# Path (a file path, or URL like detectron2://.., https://..) to a checkpoint file
+# to be loaded to the model. You can find available models in the model zoo.
+_C.MODEL.WEIGHTS = ""
+
+# Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR).
+# To train on images of different number of channels, just set different mean & std.
+# Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
+_C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675]
+# When using pre-trained models in Detectron1 or any MSRA models,
+# std has been absorbed into its conv1 weights, so the std needs to be set 1.
+# Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
+_C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0]
+
+
+# -----------------------------------------------------------------------------
+# INPUT
+# -----------------------------------------------------------------------------
+_C.INPUT = CN()
+# By default, {MIN,MAX}_SIZE options are used in transforms.ResizeShortestEdge.
+# Please refer to ResizeShortestEdge for detailed definition.
+# Size of the smallest side of the image during training
+_C.INPUT.MIN_SIZE_TRAIN = (800,)
+# Sample size of smallest side by choice or random selection from range give by
+# INPUT.MIN_SIZE_TRAIN
+_C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice"
+# Maximum size of the side of the image during training
+_C.INPUT.MAX_SIZE_TRAIN = 1333
+# Size of the smallest side of the image during testing. Set to zero to disable resize in testing.
+_C.INPUT.MIN_SIZE_TEST = 800
+# Maximum size of the side of the image during testing
+_C.INPUT.MAX_SIZE_TEST = 1333
+# Mode for flipping images used in data augmentation during training
+# choose one of ["horizontal, "vertical", "none"]
+_C.INPUT.RANDOM_FLIP = "horizontal"
+
+# `True` if cropping is used for data augmentation during training
+_C.INPUT.CROP = CN({"ENABLED": False})
+# Cropping type. See documentation of `detectron2.data.transforms.RandomCrop` for explanation.
+_C.INPUT.CROP.TYPE = "relative_range"
+# Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of
+# pixels if CROP.TYPE is "absolute"
+_C.INPUT.CROP.SIZE = [0.9, 0.9]
+
+
+# Whether the model needs RGB, YUV, HSV etc.
+# Should be one of the modes defined here, as we use PIL to read the image:
+# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
+# with BGR being the one exception. One can set image format to BGR, we will
+# internally use RGB for conversion and flip the channels over
+_C.INPUT.FORMAT = "BGR"
+# The ground truth mask format that the model will use.
+# Mask R-CNN supports either "polygon" or "bitmask" as ground truth.
+_C.INPUT.MASK_FORMAT = "polygon"  # alternative: "bitmask"
+
+
+# -----------------------------------------------------------------------------
+# Dataset
+# -----------------------------------------------------------------------------
+_C.DATASETS = CN()
+# List of the dataset names for training. Must be registered in DatasetCatalog
+# Samples from these datasets will be merged and used as one dataset.
+_C.DATASETS.TRAIN = ()
+# List of the pre-computed proposal files for training, which must be consistent
+# with datasets listed in DATASETS.TRAIN.
+_C.DATASETS.PROPOSAL_FILES_TRAIN = ()
+# Number of top scoring precomputed proposals to keep for training
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000
+# List of the dataset names for testing. Must be registered in DatasetCatalog
+_C.DATASETS.TEST = ()
+# List of the pre-computed proposal files for test, which must be consistent
+# with datasets listed in DATASETS.TEST.
+_C.DATASETS.PROPOSAL_FILES_TEST = ()
+# Number of top scoring precomputed proposals to keep for test
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000
+
+# -----------------------------------------------------------------------------
+# DataLoader
+# -----------------------------------------------------------------------------
+_C.DATALOADER = CN()
+# Number of data loading threads
+_C.DATALOADER.NUM_WORKERS = 4
+# If True, each batch should contain only images for which the aspect ratio
+# is compatible. This groups portrait images together, and landscape images
+# are not batched with portrait images.
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True
+# Options: TrainingSampler, RepeatFactorTrainingSampler
+_C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler"
+# Repeat threshold for RepeatFactorTrainingSampler
+_C.DATALOADER.REPEAT_THRESHOLD = 0.0
+# Tf True, when working on datasets that have instance annotations, the
+# training dataloader will filter out images without associated annotations
+_C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True
+
+# ---------------------------------------------------------------------------- #
+# Backbone options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.BACKBONE = CN()
+
+_C.MODEL.BACKBONE.NAME = "build_resnet_backbone"
+# Freeze the first several stages so they are not trained.
+# There are 5 stages in ResNet. The first is a convolution, and the following
+# stages are each group of residual blocks.
+_C.MODEL.BACKBONE.FREEZE_AT = 2
+
+
+# ---------------------------------------------------------------------------- #
+# FPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FPN = CN()
+# Names of the input feature maps to be used by FPN
+# They must have contiguous power of 2 strides
+# e.g., ["res2", "res3", "res4", "res5"]
+_C.MODEL.FPN.IN_FEATURES = []
+_C.MODEL.FPN.OUT_CHANNELS = 256
+
+# Options: "" (no norm), "GN"
+_C.MODEL.FPN.NORM = ""
+
+# Types for fusing the FPN top-down and lateral features. Can be either "sum" or "avg"
+_C.MODEL.FPN.FUSE_TYPE = "sum"
+
+
+# ---------------------------------------------------------------------------- #
+# Proposal generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.PROPOSAL_GENERATOR = CN()
+# Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals"
+_C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
+# Proposal height and width both need to be greater than MIN_SIZE
+# (a the scale used during training or inference)
+_C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0
+
+
+# ---------------------------------------------------------------------------- #
+# Anchor generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ANCHOR_GENERATOR = CN()
+# The generator can be any name in the ANCHOR_GENERATOR registry
+_C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
+# Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input.
+# Format: list[list[float]]. SIZES[i] specifies the list of sizes to use for
+# IN_FEATURES[i]; len(SIZES) must be equal to len(IN_FEATURES) or 1.
+# When len(SIZES) == 1, SIZES[0] is used for all IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]]
+# Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect
+# ratios are generated by an anchor generator.
+# Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W)
+# to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true,
+# or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used
+# for all IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]]
+# Anchor angles.
+# list[list[float]], the angle in degrees, for each input feature map.
+# ANGLES[i] specifies the list of angles for IN_FEATURES[i].
+_C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]]
+# Relative offset between the center of the first anchor and the top-left corner of the image
+# Value has to be in [0, 1). Recommend to use 0.5, which means half stride.
+# The value is not expected to affect model accuracy.
+_C.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0
+
+# ---------------------------------------------------------------------------- #
+# RPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RPN = CN()
+_C.MODEL.RPN.HEAD_NAME = "StandardRPNHead"  # used by RPN_HEAD_REGISTRY
+
+# Names of the input feature maps to be used by RPN
+# e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN
+_C.MODEL.RPN.IN_FEATURES = ["res4"]
+# Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels
+# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
+_C.MODEL.RPN.BOUNDARY_THRESH = -1
+# IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD]
+# Minimum overlap required between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
+# ==> positive RPN example: 1)
+# Maximum overlap allowed between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
+# ==> negative RPN example: 0)
+# Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD)
+# are ignored (-1)
+_C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7]
+_C.MODEL.RPN.IOU_LABELS = [0, -1, 1]
+# Number of regions per image used to train RPN
+_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
+# Target fraction of foreground (positive) examples per RPN minibatch
+_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
+# Options are: "smooth_l1", "giou", "diou", "ciou"
+_C.MODEL.RPN.BBOX_REG_LOSS_TYPE = "smooth_l1"
+_C.MODEL.RPN.BBOX_REG_LOSS_WEIGHT = 1.0
+# Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets
+_C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.RPN.SMOOTH_L1_BETA = 0.0
+_C.MODEL.RPN.LOSS_WEIGHT = 1.0
+# Number of top scoring RPN proposals to keep before applying NMS
+# When FPN is used, this is *per FPN level* (not total)
+_C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000
+_C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000
+# Number of top scoring RPN proposals to keep after applying NMS
+# When FPN is used, this limit is applied per level and then again to the union
+# of proposals from all levels
+# NOTE: When FPN is used, the meaning of this config is different from Detectron1.
+# It means per-batch topk in Detectron1, but per-image topk here.
+# See the "find_top_rpn_proposals" function for details.
+_C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000
+_C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000
+# NMS threshold used on RPN proposals
+_C.MODEL.RPN.NMS_THRESH = 0.7
+# Set this to -1 to use the same number of output channels as input channels.
+_C.MODEL.RPN.CONV_DIMS = [-1]
+
+# ---------------------------------------------------------------------------- #
+# ROI HEADS options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_HEADS = CN()
+_C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads"
+# Number of foreground classes
+_C.MODEL.ROI_HEADS.NUM_CLASSES = 80
+# Names of the input feature maps to be used by ROI heads
+# Currently all heads (box, mask, ...) use the same input feature map list
+# e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN
+_C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"]
+# IOU overlap ratios [IOU_THRESHOLD]
+# Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD)
+# Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD)
+_C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5]
+_C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1]
+# RoI minibatch size *per image* (number of regions of interest [ROIs]) during training
+# Total number of RoIs per training minibatch =
+#   ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH
+# E.g., a common configuration is: 512 * 16 = 8192
+_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
+# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
+_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
+
+# Only used on test mode
+
+# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
+# balance obtaining high recall with not having too many low precision
+# detections that will slow down inference post processing steps (like NMS)
+# A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down
+# inference.
+_C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05
+# Overlap threshold used for non-maximum suppression (suppress boxes with
+# IoU >= this threshold)
+_C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5
+# If True, augment proposals with ground-truth boxes before sampling proposals to
+# train ROI heads.
+_C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True
+
+# ---------------------------------------------------------------------------- #
+# Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_HEAD = CN()
+# C4 don't use head name option
+# Options for non-C4 models: FastRCNNConvFCHead,
+_C.MODEL.ROI_BOX_HEAD.NAME = ""
+# Options are: "smooth_l1", "giou", "diou", "ciou"
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE = "smooth_l1"
+# The final scaling coefficient on the box regression loss, used to balance the magnitude of its
+# gradients with other losses in the model. See also `MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT`.
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT = 1.0
+# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
+# These are empirically chosen to approximately lead to unit variance targets
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0
+_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+_C.MODEL.ROI_BOX_HEAD.NUM_FC = 0
+# Hidden layer dimension for FC layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024
+_C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0
+# Channel dimension for Conv layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_BOX_HEAD.NORM = ""
+# Whether to use class agnostic for bbox regression
+_C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False
+# If true, RoI heads use bounding boxes predicted by the box head rather than proposal boxes.
+_C.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES = False
+
+# Federated loss can be used to improve the training of LVIS
+_C.MODEL.ROI_BOX_HEAD.USE_FED_LOSS = False
+# Sigmoid cross entrophy is used with federated loss
+_C.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE = False
+# The power value applied to image_count when calcualting frequency weight
+_C.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER = 0.5
+# Number of classes to keep in total
+_C.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES = 50
+
+# ---------------------------------------------------------------------------- #
+# Cascaded Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_CASCADE_HEAD = CN()
+# The number of cascade stages is implicitly defined by the length of the following two configs.
+_C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = (
+    (10.0, 10.0, 5.0, 5.0),
+    (20.0, 20.0, 10.0, 10.0),
+    (30.0, 30.0, 15.0, 15.0),
+)
+_C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7)
+
+
+# ---------------------------------------------------------------------------- #
+# Mask Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_MASK_HEAD = CN()
+_C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead"
+_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0  # The number of convs in the mask head
+_C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_MASK_HEAD.NORM = ""
+# Whether to use class agnostic for mask prediction
+_C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+
+# ---------------------------------------------------------------------------- #
+# Keypoint Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_KEYPOINT_HEAD = CN()
+_C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead"
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8))
+_C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17  # 17 is the number of keypoints in COCO.
+
+# Images with too few (or no) keypoints are excluded from training.
+_C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1
+# Normalize by the total number of visible keypoints in the minibatch if True.
+# Otherwise, normalize by the total number of keypoints that could ever exist
+# in the minibatch.
+# The keypoint softmax loss is only calculated on visible keypoints.
+# Since the number of visible keypoints can vary significantly between
+# minibatches, this has the effect of up-weighting the importance of
+# minibatches with few visible keypoints. (Imagine the extreme case of
+# only one visible keypoint versus N: in the case of N, each one
+# contributes 1/N to the gradient compared to the single keypoint
+# determining the gradient direction). Instead, we can normalize the
+# loss by the total number of keypoints, if it were the case that all
+# keypoints were visible in a full minibatch. (Returning to the example,
+# this means that the one visible keypoint contributes as much as each
+# of the N keypoints.)
+_C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True
+# Multi-task loss weight to use for keypoints
+# Recommended values:
+#   - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True
+#   - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False
+_C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+# ---------------------------------------------------------------------------- #
+# Semantic Segmentation Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.SEM_SEG_HEAD = CN()
+_C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead"
+_C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"]
+# Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for
+# the correposnding pixel.
+_C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255
+# Number of classes in the semantic segmentation head
+_C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54
+# Number of channels in the 3x3 convs inside semantic-FPN heads.
+_C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128
+# Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride.
+_C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4
+# Normalization method for the convolution layers. Options: "" (no norm), "GN".
+_C.MODEL.SEM_SEG_HEAD.NORM = "GN"
+_C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0
+
+_C.MODEL.PANOPTIC_FPN = CN()
+# Scaling of all losses from instance detection / segmentation head.
+_C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0
+
+# options when combining instance & semantic segmentation outputs
+_C.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True})  # "COMBINE.ENABLED" is deprecated & not used
+_C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5
+_C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096
+_C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5
+
+
+# ---------------------------------------------------------------------------- #
+# RetinaNet Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RETINANET = CN()
+
+# This is the number of foreground classes.
+_C.MODEL.RETINANET.NUM_CLASSES = 80
+
+_C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
+
+# Convolutions to use in the cls and bbox tower
+# NOTE: this doesn't include the last conv for logits
+_C.MODEL.RETINANET.NUM_CONVS = 4
+
+# IoU overlap ratio [bg, fg] for labeling anchors.
+# Anchors with < bg are labeled negative (0)
+# Anchors  with >= bg and < fg are ignored (-1)
+# Anchors with >= fg are labeled positive (1)
+_C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5]
+_C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1]
+
+# Prior prob for rare case (i.e. foreground) at the beginning of training.
+# This is used to set the bias for the logits layer of the classifier subnet.
+# This improves training stability in the case of heavy class imbalance.
+_C.MODEL.RETINANET.PRIOR_PROB = 0.01
+
+# Inference cls score threshold, only anchors with score > INFERENCE_TH are
+# considered for inference (to improve speed)
+_C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
+# Select topk candidates before NMS
+_C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
+_C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
+
+# Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets
+_C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+
+# Loss parameters
+_C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
+_C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
+_C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1
+# Options are: "smooth_l1", "giou", "diou", "ciou"
+_C.MODEL.RETINANET.BBOX_REG_LOSS_TYPE = "smooth_l1"
+
+# One of BN, SyncBN, FrozenBN, GN
+# Only supports GN until unshared norm is implemented
+_C.MODEL.RETINANET.NORM = ""
+
+
+# ---------------------------------------------------------------------------- #
+# ResNe[X]t options (ResNets = {ResNet, ResNeXt}
+# Note that parts of a resnet may be used for both the backbone and the head
+# These options apply to both
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RESNETS = CN()
+
+_C.MODEL.RESNETS.DEPTH = 50
+_C.MODEL.RESNETS.OUT_FEATURES = ["res4"]  # res4 for C4 backbone, res2..5 for FPN backbone
+
+# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
+_C.MODEL.RESNETS.NUM_GROUPS = 1
+
+# Options: FrozenBN, GN, "SyncBN", "BN"
+_C.MODEL.RESNETS.NORM = "FrozenBN"
+
+# Baseline width of each group.
+# Scaling this parameters will scale the width of all bottleneck layers.
+_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
+
+# Place the stride 2 conv on the 1x1 filter
+# Use True only for the original MSRA ResNet; use False for C2 and Torch models
+_C.MODEL.RESNETS.STRIDE_IN_1X1 = True
+
+# Apply dilation in stage "res5"
+_C.MODEL.RESNETS.RES5_DILATION = 1
+
+# Output width of res2. Scaling this parameters will scale the width of all 1x1 convs in ResNet
+# For R18 and R34, this needs to be set to 64
+_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
+_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
+
+# Apply Deformable Convolution in stages
+# Specify if apply deform_conv on Res2, Res3, Res4, Res5
+_C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False]
+# Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168);
+# Use False for DeformableV1.
+_C.MODEL.RESNETS.DEFORM_MODULATED = False
+# Number of groups in deformable conv.
+_C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1
+
+
+# ---------------------------------------------------------------------------- #
+# Solver
+# ---------------------------------------------------------------------------- #
+_C.SOLVER = CN()
+
+# Options: WarmupMultiStepLR, WarmupCosineLR.
+# See detectron2/solver/build.py for definition.
+_C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"
+
+_C.SOLVER.MAX_ITER = 40000
+
+_C.SOLVER.BASE_LR = 0.001
+# The end lr, only used by WarmupCosineLR
+_C.SOLVER.BASE_LR_END = 0.0
+
+_C.SOLVER.MOMENTUM = 0.9
+
+_C.SOLVER.NESTEROV = False
+
+_C.SOLVER.WEIGHT_DECAY = 0.0001
+# The weight decay that's applied to parameters of normalization layers
+# (typically the affine transformation)
+_C.SOLVER.WEIGHT_DECAY_NORM = 0.0
+
+_C.SOLVER.GAMMA = 0.1
+# The iteration number to decrease learning rate by GAMMA.
+_C.SOLVER.STEPS = (30000,)
+# Number of decays in WarmupStepWithFixedGammaLR schedule
+_C.SOLVER.NUM_DECAYS = 3
+
+_C.SOLVER.WARMUP_FACTOR = 1.0 / 1000
+_C.SOLVER.WARMUP_ITERS = 1000
+_C.SOLVER.WARMUP_METHOD = "linear"
+# Whether to rescale the interval for the learning schedule after warmup
+_C.SOLVER.RESCALE_INTERVAL = False
+
+# Save a checkpoint after every this number of iterations
+_C.SOLVER.CHECKPOINT_PERIOD = 5000
+
+# Number of images per batch across all machines. This is also the number
+# of training images per step (i.e. per iteration). If we use 16 GPUs
+# and IMS_PER_BATCH = 32, each GPU will see 2 images per batch.
+# May be adjusted automatically if REFERENCE_WORLD_SIZE is set.
+_C.SOLVER.IMS_PER_BATCH = 16
+
+# The reference number of workers (GPUs) this config is meant to train with.
+# It takes no effect when set to 0.
+# With a non-zero value, it will be used by DefaultTrainer to compute a desired
+# per-worker batch size, and then scale the other related configs (total batch size,
+# learning rate, etc) to match the per-worker batch size.
+# See documentation of `DefaultTrainer.auto_scale_workers` for details:
+_C.SOLVER.REFERENCE_WORLD_SIZE = 0
+
+# Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for
+# biases. This is not useful (at least for recent models). You should avoid
+# changing these and they exist only to reproduce Detectron v1 training if
+# desired.
+_C.SOLVER.BIAS_LR_FACTOR = 1.0
+_C.SOLVER.WEIGHT_DECAY_BIAS = None  # None means following WEIGHT_DECAY
+
+# Gradient clipping
+_C.SOLVER.CLIP_GRADIENTS = CN({"ENABLED": False})
+# Type of gradient clipping, currently 2 values are supported:
+# - "value": the absolute values of elements of each gradients are clipped
+# - "norm": the norm of the gradient for each parameter is clipped thus
+#   affecting all elements in the parameter
+_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value"
+# Maximum absolute value used for clipping gradients
+_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
+# Floating point number p for L-p norm to be used with the "norm"
+# gradient clipping type; for L-inf, please specify .inf
+_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
+
+# Enable automatic mixed precision for training
+# Note that this does not change model's inference behavior.
+# To use AMP in inference, run inference under autocast()
+_C.SOLVER.AMP = CN({"ENABLED": False})
+
+# ---------------------------------------------------------------------------- #
+# Specific test options
+# ---------------------------------------------------------------------------- #
+_C.TEST = CN()
+# For end-to-end tests to verify the expected accuracy.
+# Each item is [task, metric, value, tolerance]
+# e.g.: [['bbox', 'AP', 38.5, 0.2]]
+_C.TEST.EXPECTED_RESULTS = []
+# The period (in terms of steps) to evaluate the model during training.
+# Set to 0 to disable.
+_C.TEST.EVAL_PERIOD = 0
+# The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval
+# When empty, it will use the defaults in COCO.
+# Otherwise it should be a list[float] with the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
+_C.TEST.KEYPOINT_OKS_SIGMAS = []
+# Maximum number of detections to return per image during inference (100 is
+# based on the limit established for the COCO dataset).
+_C.TEST.DETECTIONS_PER_IMAGE = 100
+
+_C.TEST.AUG = CN({"ENABLED": False})
+_C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
+_C.TEST.AUG.MAX_SIZE = 4000
+_C.TEST.AUG.FLIP = True
+
+_C.TEST.PRECISE_BN = CN({"ENABLED": False})
+_C.TEST.PRECISE_BN.NUM_ITER = 200
+
+# ---------------------------------------------------------------------------- #
+# Misc options
+# ---------------------------------------------------------------------------- #
+# Directory where output files are written
+_C.OUTPUT_DIR = "./output"
+# Set seed to negative to fully randomize everything.
+# Set seed to positive to use a fixed seed. Note that a fixed seed increases
+# reproducibility but does not guarantee fully deterministic behavior.
+# Disabling all parallelism further increases reproducibility.
+_C.SEED = -1
+# Benchmark different cudnn algorithms.
+# If input images have very different sizes, this option will have large overhead
+# for about 10k iterations. It usually hurts total time, but can benefit for certain models.
+# If input images have the same or similar sizes, benchmark is often helpful.
+_C.CUDNN_BENCHMARK = False
+# The period (in terms of steps) for minibatch visualization at train time.
+# Set to 0 to disable.
+_C.VIS_PERIOD = 0
+
+# global config is for quick hack purposes.
+# You can set them in command line or config files,
+# and access it with:
+#
+# from detectron2.config import global_cfg
+# print(global_cfg.HACK)
+#
+# Do not commit any configs into it.
+_C.GLOBAL = CN()
+_C.GLOBAL.HACK = 1.0
diff --git a/detectron2/config/instantiate.py b/detectron2/config/instantiate.py
new file mode 100644
index 0000000000000000000000000000000000000000..05ee2c7d21c9bf3e56a0a8e98447d2587b4b8fed
--- /dev/null
+++ b/detectron2/config/instantiate.py
@@ -0,0 +1,88 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import collections.abc as abc
+import dataclasses
+import logging
+from typing import Any
+
+from detectron2.utils.registry import _convert_target_to_string, locate
+
+__all__ = ["dump_dataclass", "instantiate"]
+
+
+def dump_dataclass(obj: Any):
+    """
+    Dump a dataclass recursively into a dict that can be later instantiated.
+
+    Args:
+        obj: a dataclass object
+
+    Returns:
+        dict
+    """
+    assert dataclasses.is_dataclass(obj) and not isinstance(
+        obj, type
+    ), "dump_dataclass() requires an instance of a dataclass."
+    ret = {"_target_": _convert_target_to_string(type(obj))}
+    for f in dataclasses.fields(obj):
+        v = getattr(obj, f.name)
+        if dataclasses.is_dataclass(v):
+            v = dump_dataclass(v)
+        if isinstance(v, (list, tuple)):
+            v = [dump_dataclass(x) if dataclasses.is_dataclass(x) else x for x in v]
+        ret[f.name] = v
+    return ret
+
+
+def instantiate(cfg):
+    """
+    Recursively instantiate objects defined in dictionaries by
+    "_target_" and arguments.
+
+    Args:
+        cfg: a dict-like object with "_target_" that defines the caller, and
+            other keys that define the arguments
+
+    Returns:
+        object instantiated by cfg
+    """
+    from omegaconf import ListConfig, DictConfig, OmegaConf
+
+    if isinstance(cfg, ListConfig):
+        lst = [instantiate(x) for x in cfg]
+        return ListConfig(lst, flags={"allow_objects": True})
+    if isinstance(cfg, list):
+        # Specialize for list, because many classes take
+        # list[objects] as arguments, such as ResNet, DatasetMapper
+        return [instantiate(x) for x in cfg]
+
+    # If input is a DictConfig backed by dataclasses (i.e. omegaconf's structured config),
+    # instantiate it to the actual dataclass.
+    if isinstance(cfg, DictConfig) and dataclasses.is_dataclass(cfg._metadata.object_type):
+        return OmegaConf.to_object(cfg)
+
+    if isinstance(cfg, abc.Mapping) and "_target_" in cfg:
+        # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all,
+        # but faster: https://github.com/facebookresearch/hydra/issues/1200
+        cfg = {k: instantiate(v) for k, v in cfg.items()}
+        cls = cfg.pop("_target_")
+        cls = instantiate(cls)
+
+        if isinstance(cls, str):
+            cls_name = cls
+            cls = locate(cls_name)
+            assert cls is not None, cls_name
+        else:
+            try:
+                cls_name = cls.__module__ + "." + cls.__qualname__
+            except Exception:
+                # target could be anything, so the above could fail
+                cls_name = str(cls)
+        assert callable(cls), f"_target_ {cls} does not define a callable object"
+        try:
+            return cls(**cfg)
+        except TypeError:
+            logger = logging.getLogger(__name__)
+            logger.error(f"Error when instantiating {cls_name}!")
+            raise
+    return cfg  # return as-is if don't know what to do
diff --git a/detectron2/config/lazy.py b/detectron2/config/lazy.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea93e865acce31de07af476f95454d62128a9d1c
--- /dev/null
+++ b/detectron2/config/lazy.py
@@ -0,0 +1,436 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import ast
+import builtins
+import collections.abc as abc
+import importlib
+import inspect
+import logging
+import os
+import uuid
+from contextlib import contextmanager
+from copy import deepcopy
+from dataclasses import is_dataclass
+from typing import List, Tuple, Union
+import cloudpickle
+import yaml
+from omegaconf import DictConfig, ListConfig, OmegaConf, SCMode
+
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.registry import _convert_target_to_string
+
+__all__ = ["LazyCall", "LazyConfig"]
+
+
+class LazyCall:
+    """
+    Wrap a callable so that when it's called, the call will not be executed,
+    but returns a dict that describes the call.
+
+    LazyCall object has to be called with only keyword arguments. Positional
+    arguments are not yet supported.
+
+    Examples:
+    ::
+        from detectron2.config import instantiate, LazyCall
+
+        layer_cfg = LazyCall(nn.Conv2d)(in_channels=32, out_channels=32)
+        layer_cfg.out_channels = 64   # can edit it afterwards
+        layer = instantiate(layer_cfg)
+    """
+
+    def __init__(self, target):
+        if not (callable(target) or isinstance(target, (str, abc.Mapping))):
+            raise TypeError(
+                f"target of LazyCall must be a callable or defines a callable! Got {target}"
+            )
+        self._target = target
+
+    def __call__(self, **kwargs):
+        if is_dataclass(self._target):
+            # omegaconf object cannot hold dataclass type
+            # https://github.com/omry/omegaconf/issues/784
+            target = _convert_target_to_string(self._target)
+        else:
+            target = self._target
+        kwargs["_target_"] = target
+
+        return DictConfig(content=kwargs, flags={"allow_objects": True})
+
+
+def _visit_dict_config(cfg, func):
+    """
+    Apply func recursively to all DictConfig in cfg.
+    """
+    if isinstance(cfg, DictConfig):
+        func(cfg)
+        for v in cfg.values():
+            _visit_dict_config(v, func)
+    elif isinstance(cfg, ListConfig):
+        for v in cfg:
+            _visit_dict_config(v, func)
+
+
+def _validate_py_syntax(filename):
+    # see also https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py
+    with PathManager.open(filename, "r") as f:
+        content = f.read()
+    try:
+        ast.parse(content)
+    except SyntaxError as e:
+        raise SyntaxError(f"Config file {filename} has syntax error!") from e
+
+
+def _cast_to_config(obj):
+    # if given a dict, return DictConfig instead
+    if isinstance(obj, dict):
+        return DictConfig(obj, flags={"allow_objects": True})
+    return obj
+
+
+_CFG_PACKAGE_NAME = "detectron2._cfg_loader"
+"""
+A namespace to put all imported config into.
+"""
+
+
+def _random_package_name(filename):
+    # generate a random package name when loading config files
+    return _CFG_PACKAGE_NAME + str(uuid.uuid4())[:4] + "." + os.path.basename(filename)
+
+
+@contextmanager
+def _patch_import():
+    """
+    Enhance relative import statements in config files, so that they:
+    1. locate files purely based on relative location, regardless of packages.
+       e.g. you can import file without having __init__
+    2. do not cache modules globally; modifications of module states has no side effect
+    3. support other storage system through PathManager, so config files can be in the cloud
+    4. imported dict are turned into omegaconf.DictConfig automatically
+    """
+    old_import = builtins.__import__
+
+    def find_relative_file(original_file, relative_import_path, level):
+        # NOTE: "from . import x" is not handled. Because then it's unclear
+        # if such import should produce `x` as a python module or DictConfig.
+        # This can be discussed further if needed.
+        relative_import_err = """
+Relative import of directories is not allowed within config files.
+Within a config file, relative import can only import other config files.
+""".replace(
+            "\n", " "
+        )
+        if not len(relative_import_path):
+            raise ImportError(relative_import_err)
+
+        cur_file = os.path.dirname(original_file)
+        for _ in range(level - 1):
+            cur_file = os.path.dirname(cur_file)
+        cur_name = relative_import_path.lstrip(".")
+        for part in cur_name.split("."):
+            cur_file = os.path.join(cur_file, part)
+        if not cur_file.endswith(".py"):
+            cur_file += ".py"
+        if not PathManager.isfile(cur_file):
+            cur_file_no_suffix = cur_file[: -len(".py")]
+            if PathManager.isdir(cur_file_no_suffix):
+                raise ImportError(f"Cannot import from {cur_file_no_suffix}." + relative_import_err)
+            else:
+                raise ImportError(
+                    f"Cannot import name {relative_import_path} from "
+                    f"{original_file}: {cur_file} does not exist."
+                )
+        return cur_file
+
+    def new_import(name, globals=None, locals=None, fromlist=(), level=0):
+        if (
+            # Only deal with relative imports inside config files
+            level != 0
+            and globals is not None
+            and (globals.get("__package__", "") or "").startswith(_CFG_PACKAGE_NAME)
+        ):
+            cur_file = find_relative_file(globals["__file__"], name, level)
+            _validate_py_syntax(cur_file)
+            spec = importlib.machinery.ModuleSpec(
+                _random_package_name(cur_file), None, origin=cur_file
+            )
+            module = importlib.util.module_from_spec(spec)
+            module.__file__ = cur_file
+            with PathManager.open(cur_file) as f:
+                content = f.read()
+            exec(compile(content, cur_file, "exec"), module.__dict__)
+            for name in fromlist:  # turn imported dict into DictConfig automatically
+                val = _cast_to_config(module.__dict__[name])
+                module.__dict__[name] = val
+            return module
+        return old_import(name, globals, locals, fromlist=fromlist, level=level)
+
+    builtins.__import__ = new_import
+    yield new_import
+    builtins.__import__ = old_import
+
+
+class LazyConfig:
+    """
+    Provide methods to save, load, and overrides an omegaconf config object
+    which may contain definition of lazily-constructed objects.
+    """
+
+    @staticmethod
+    def load_rel(filename: str, keys: Union[None, str, Tuple[str, ...]] = None):
+        """
+        Similar to :meth:`load()`, but load path relative to the caller's
+        source file.
+
+        This has the same functionality as a relative import, except that this method
+        accepts filename as a string, so more characters are allowed in the filename.
+        """
+        caller_frame = inspect.stack()[1]
+        caller_fname = caller_frame[0].f_code.co_filename
+        assert caller_fname != "<string>", "load_rel Unable to find caller"
+        caller_dir = os.path.dirname(caller_fname)
+        filename = os.path.join(caller_dir, filename)
+        return LazyConfig.load(filename, keys)
+
+    @staticmethod
+    def load(filename: str, keys: Union[None, str, Tuple[str, ...]] = None):
+        """
+        Load a config file.
+
+        Args:
+            filename: absolute path or relative path w.r.t. the current working directory
+            keys: keys to load and return. If not given, return all keys
+                (whose values are config objects) in a dict.
+        """
+        has_keys = keys is not None
+        filename = filename.replace("/./", "/")  # redundant
+        if os.path.splitext(filename)[1] not in [".py", ".yaml", ".yml"]:
+            raise ValueError(f"Config file {filename} has to be a python or yaml file.")
+        if filename.endswith(".py"):
+            _validate_py_syntax(filename)
+
+            with _patch_import():
+                # Record the filename
+                module_namespace = {
+                    "__file__": filename,
+                    "__package__": _random_package_name(filename),
+                }
+                with PathManager.open(filename) as f:
+                    content = f.read()
+                # Compile first with filename to:
+                # 1. make filename appears in stacktrace
+                # 2. make load_rel able to find its parent's (possibly remote) location
+                exec(compile(content, filename, "exec"), module_namespace)
+
+            ret = module_namespace
+        else:
+            with PathManager.open(filename) as f:
+                obj = yaml.unsafe_load(f)
+            ret = OmegaConf.create(obj, flags={"allow_objects": True})
+
+        if has_keys:
+            if isinstance(keys, str):
+                return _cast_to_config(ret[keys])
+            else:
+                return tuple(_cast_to_config(ret[a]) for a in keys)
+        else:
+            if filename.endswith(".py"):
+                # when not specified, only load those that are config objects
+                ret = DictConfig(
+                    {
+                        name: _cast_to_config(value)
+                        for name, value in ret.items()
+                        if isinstance(value, (DictConfig, ListConfig, dict))
+                        and not name.startswith("_")
+                    },
+                    flags={"allow_objects": True},
+                )
+            return ret
+
+    @staticmethod
+    def save(cfg, filename: str):
+        """
+        Save a config object to a yaml file.
+        Note that when the config dictionary contains complex objects (e.g. lambda),
+        it can't be saved to yaml. In that case we will print an error and
+        attempt to save to a pkl file instead.
+
+        Args:
+            cfg: an omegaconf config object
+            filename: yaml file name to save the config file
+        """
+        logger = logging.getLogger(__name__)
+        try:
+            cfg = deepcopy(cfg)
+        except Exception:
+            pass
+        else:
+            # if it's deep-copyable, then...
+            def _replace_type_by_name(x):
+                if "_target_" in x and callable(x._target_):
+                    try:
+                        x._target_ = _convert_target_to_string(x._target_)
+                    except AttributeError:
+                        pass
+
+            # not necessary, but makes yaml looks nicer
+            _visit_dict_config(cfg, _replace_type_by_name)
+
+        save_pkl = False
+        try:
+            dict = OmegaConf.to_container(
+                cfg,
+                # Do not resolve interpolation when saving, i.e. do not turn ${a} into
+                # actual values when saving.
+                resolve=False,
+                # Save structures (dataclasses) in a format that can be instantiated later.
+                # Without this option, the type information of the dataclass will be erased.
+                structured_config_mode=SCMode.INSTANTIATE,
+            )
+            dumped = yaml.dump(dict, default_flow_style=None, allow_unicode=True, width=9999)
+            with PathManager.open(filename, "w") as f:
+                f.write(dumped)
+
+            try:
+                _ = yaml.unsafe_load(dumped)  # test that it is loadable
+            except Exception:
+                logger.warning(
+                    "The config contains objects that cannot serialize to a valid yaml. "
+                    f"{filename} is human-readable but cannot be loaded."
+                )
+                save_pkl = True
+        except Exception:
+            logger.exception("Unable to serialize the config to yaml. Error:")
+            save_pkl = True
+
+        if save_pkl:
+            new_filename = filename + ".pkl"
+            try:
+                # retry by pickle
+                with PathManager.open(new_filename, "wb") as f:
+                    cloudpickle.dump(cfg, f)
+                logger.warning(f"Config is saved using cloudpickle at {new_filename}.")
+            except Exception:
+                pass
+
+    @staticmethod
+    def apply_overrides(cfg, overrides: List[str]):
+        """
+        In-place override contents of cfg.
+
+        Args:
+            cfg: an omegaconf config object
+            overrides: list of strings in the format of "a=b" to override configs.
+                See https://hydra.cc/docs/next/advanced/override_grammar/basic/
+                for syntax.
+
+        Returns:
+            the cfg object
+        """
+
+        def safe_update(cfg, key, value):
+            parts = key.split(".")
+            for idx in range(1, len(parts)):
+                prefix = ".".join(parts[:idx])
+                v = OmegaConf.select(cfg, prefix, default=None)
+                if v is None:
+                    break
+                if not OmegaConf.is_config(v):
+                    raise KeyError(
+                        f"Trying to update key {key}, but {prefix} "
+                        f"is not a config, but has type {type(v)}."
+                    )
+            OmegaConf.update(cfg, key, value, merge=True)
+
+        try:
+            from hydra.core.override_parser.overrides_parser import OverridesParser
+
+            has_hydra = True
+        except ImportError:
+            has_hydra = False
+
+        if has_hydra:
+            parser = OverridesParser.create()
+            overrides = parser.parse_overrides(overrides)
+            for o in overrides:
+                key = o.key_or_group
+                value = o.value()
+                if o.is_delete():
+                    # TODO support this
+                    raise NotImplementedError("deletion is not yet a supported override")
+                safe_update(cfg, key, value)
+        else:
+            # Fallback. Does not support all the features and error checking like hydra.
+            for o in overrides:
+                key, value = o.split("=")
+                try:
+                    value = eval(value, {})
+                except NameError:
+                    pass
+                safe_update(cfg, key, value)
+        return cfg
+
+    @staticmethod
+    def to_py(cfg, prefix: str = "cfg."):
+        """
+        Try to convert a config object into Python-like psuedo code.
+
+        Note that perfect conversion is not always possible. So the returned
+        results are mainly meant to be human-readable, and not meant to be executed.
+
+        Args:
+            cfg: an omegaconf config object
+            prefix: root name for the resulting code (default: "cfg.")
+
+
+        Returns:
+            str of formatted Python code
+        """
+        import black
+
+        cfg = OmegaConf.to_container(cfg, resolve=True)
+
+        def _to_str(obj, prefix=None, inside_call=False):
+            if prefix is None:
+                prefix = []
+            if isinstance(obj, abc.Mapping) and "_target_" in obj:
+                # Dict representing a function call
+                target = _convert_target_to_string(obj.pop("_target_"))
+                args = []
+                for k, v in sorted(obj.items()):
+                    args.append(f"{k}={_to_str(v, inside_call=True)}")
+                args = ", ".join(args)
+                call = f"{target}({args})"
+                return "".join(prefix) + call
+            elif isinstance(obj, abc.Mapping) and not inside_call:
+                # Dict that is not inside a call is a list of top-level config objects that we
+                # render as one object per line with dot separated prefixes
+                key_list = []
+                for k, v in sorted(obj.items()):
+                    if isinstance(v, abc.Mapping) and "_target_" not in v:
+                        key_list.append(_to_str(v, prefix=prefix + [k + "."]))
+                    else:
+                        key = "".join(prefix) + k
+                        key_list.append(f"{key}={_to_str(v)}")
+                return "\n".join(key_list)
+            elif isinstance(obj, abc.Mapping):
+                # Dict that is inside a call is rendered as a regular dict
+                return (
+                    "{"
+                    + ",".join(
+                        f"{repr(k)}: {_to_str(v, inside_call=inside_call)}"
+                        for k, v in sorted(obj.items())
+                    )
+                    + "}"
+                )
+            elif isinstance(obj, list):
+                return "[" + ",".join(_to_str(x, inside_call=inside_call) for x in obj) + "]"
+            else:
+                return repr(obj)
+
+        py_str = _to_str(cfg, prefix=[prefix])
+        try:
+            return black.format_str(py_str, mode=black.Mode())
+        except black.InvalidInput:
+            return py_str
diff --git a/detectron2/data/__init__.py b/detectron2/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..259f669b78bd05815cb8d3351fd6c5fc9a1b85a1
--- /dev/null
+++ b/detectron2/data/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from . import transforms  # isort:skip
+
+from .build import (
+    build_batch_data_loader,
+    build_detection_test_loader,
+    build_detection_train_loader,
+    get_detection_dataset_dicts,
+    load_proposals_into_dataset,
+    print_instances_class_histogram,
+)
+from .catalog import DatasetCatalog, MetadataCatalog, Metadata
+from .common import DatasetFromList, MapDataset, ToIterableDataset
+from .dataset_mapper import DatasetMapper
+
+# ensure the builtin datasets are registered
+from . import datasets, samplers  # isort:skip
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/data/__pycache__/__init__.cpython-310.pyc b/detectron2/data/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d89609d0ec9283f0e6c6a1082c350e944d7a807e
Binary files /dev/null and b/detectron2/data/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/data/__pycache__/__init__.cpython-39.pyc b/detectron2/data/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..867921e940e4c42e11f7db821ca5748697c9a543
Binary files /dev/null and b/detectron2/data/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/data/__pycache__/build.cpython-310.pyc b/detectron2/data/__pycache__/build.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff4f2a73b28e2588779a19b4acb874f46be86dc0
Binary files /dev/null and b/detectron2/data/__pycache__/build.cpython-310.pyc differ
diff --git a/detectron2/data/__pycache__/build.cpython-39.pyc b/detectron2/data/__pycache__/build.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74891f778a4e1cc35eb816af458e93383e3e4690
Binary files /dev/null and b/detectron2/data/__pycache__/build.cpython-39.pyc differ
diff --git a/detectron2/data/__pycache__/catalog.cpython-310.pyc b/detectron2/data/__pycache__/catalog.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81c380a3ed509e3c09bd79d1f28816275913167f
Binary files /dev/null and b/detectron2/data/__pycache__/catalog.cpython-310.pyc differ
diff --git a/detectron2/data/__pycache__/catalog.cpython-39.pyc b/detectron2/data/__pycache__/catalog.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b73cd629fed8c1a68fc6b5beb07f74cb45851a2
Binary files /dev/null and b/detectron2/data/__pycache__/catalog.cpython-39.pyc differ
diff --git a/detectron2/data/__pycache__/common.cpython-310.pyc b/detectron2/data/__pycache__/common.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47839f4375528dbca9e12acc605ff03df8b388b9
Binary files /dev/null and b/detectron2/data/__pycache__/common.cpython-310.pyc differ
diff --git a/detectron2/data/__pycache__/common.cpython-39.pyc b/detectron2/data/__pycache__/common.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44edb862bd9c686a2bda3f64867da8b13883e5d9
Binary files /dev/null and b/detectron2/data/__pycache__/common.cpython-39.pyc differ
diff --git a/detectron2/data/__pycache__/dataset_mapper.cpython-310.pyc b/detectron2/data/__pycache__/dataset_mapper.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2afaa63c763d9df8ac5b298568aebef06f34e5f8
Binary files /dev/null and b/detectron2/data/__pycache__/dataset_mapper.cpython-310.pyc differ
diff --git a/detectron2/data/__pycache__/dataset_mapper.cpython-39.pyc b/detectron2/data/__pycache__/dataset_mapper.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a6eda21ce1202039dcbd19afdedc7de58e2eba2
Binary files /dev/null and b/detectron2/data/__pycache__/dataset_mapper.cpython-39.pyc differ
diff --git a/detectron2/data/__pycache__/detection_utils.cpython-310.pyc b/detectron2/data/__pycache__/detection_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..585090bbfdaf915247e08f53ad30b8023dba99e5
Binary files /dev/null and b/detectron2/data/__pycache__/detection_utils.cpython-310.pyc differ
diff --git a/detectron2/data/__pycache__/detection_utils.cpython-39.pyc b/detectron2/data/__pycache__/detection_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79dd34b05cba58ebbb30cc6fce15586d7384a880
Binary files /dev/null and b/detectron2/data/__pycache__/detection_utils.cpython-39.pyc differ
diff --git a/detectron2/data/benchmark.py b/detectron2/data/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac2f372a4b111ad40b8e720adea208608271bab6
--- /dev/null
+++ b/detectron2/data/benchmark.py
@@ -0,0 +1,225 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+from itertools import count
+from typing import List, Tuple
+import torch
+import tqdm
+from fvcore.common.timer import Timer
+
+from detectron2.utils import comm
+
+from .build import build_batch_data_loader
+from .common import DatasetFromList, MapDataset
+from .samplers import TrainingSampler
+
+logger = logging.getLogger(__name__)
+
+
+class _EmptyMapDataset(torch.utils.data.Dataset):
+    """
+    Map anything to emptiness.
+    """
+
+    def __init__(self, dataset):
+        self.ds = dataset
+
+    def __len__(self):
+        return len(self.ds)
+
+    def __getitem__(self, idx):
+        _ = self.ds[idx]
+        return [0]
+
+
+def iter_benchmark(
+    iterator, num_iter: int, warmup: int = 5, max_time_seconds: float = 60
+) -> Tuple[float, List[float]]:
+    """
+    Benchmark an iterator/iterable for `num_iter` iterations with an extra
+    `warmup` iterations of warmup.
+    End early if `max_time_seconds` time is spent on iterations.
+
+    Returns:
+        float: average time (seconds) per iteration
+        list[float]: time spent on each iteration. Sometimes useful for further analysis.
+    """
+    num_iter, warmup = int(num_iter), int(warmup)
+
+    iterator = iter(iterator)
+    for _ in range(warmup):
+        next(iterator)
+    timer = Timer()
+    all_times = []
+    for curr_iter in tqdm.trange(num_iter):
+        start = timer.seconds()
+        if start > max_time_seconds:
+            num_iter = curr_iter
+            break
+        next(iterator)
+        all_times.append(timer.seconds() - start)
+    avg = timer.seconds() / num_iter
+    return avg, all_times
+
+
+class DataLoaderBenchmark:
+    """
+    Some common benchmarks that help understand perf bottleneck of a standard dataloader
+    made of dataset, mapper and sampler.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        *,
+        mapper,
+        sampler=None,
+        total_batch_size,
+        num_workers=0,
+        max_time_seconds: int = 90,
+    ):
+        """
+        Args:
+            max_time_seconds (int): maximum time to spent for each benchmark
+            other args: same as in `build.py:build_detection_train_loader`
+        """
+        if isinstance(dataset, list):
+            dataset = DatasetFromList(dataset, copy=False, serialize=True)
+        if sampler is None:
+            sampler = TrainingSampler(len(dataset))
+
+        self.dataset = dataset
+        self.mapper = mapper
+        self.sampler = sampler
+        self.total_batch_size = total_batch_size
+        self.num_workers = num_workers
+        self.per_gpu_batch_size = self.total_batch_size // comm.get_world_size()
+
+        self.max_time_seconds = max_time_seconds
+
+    def _benchmark(self, iterator, num_iter, warmup, msg=None):
+        avg, all_times = iter_benchmark(iterator, num_iter, warmup, self.max_time_seconds)
+        if msg is not None:
+            self._log_time(msg, avg, all_times)
+        return avg, all_times
+
+    def _log_time(self, msg, avg, all_times, distributed=False):
+        percentiles = [np.percentile(all_times, k, interpolation="nearest") for k in [1, 5, 95, 99]]
+        if not distributed:
+            logger.info(
+                f"{msg}: avg={1.0/avg:.1f} it/s, "
+                f"p1={percentiles[0]:.2g}s, p5={percentiles[1]:.2g}s, "
+                f"p95={percentiles[2]:.2g}s, p99={percentiles[3]:.2g}s."
+            )
+            return
+        avg_per_gpu = comm.all_gather(avg)
+        percentiles_per_gpu = comm.all_gather(percentiles)
+        if comm.get_rank() > 0:
+            return
+        for idx, avg, percentiles in zip(count(), avg_per_gpu, percentiles_per_gpu):
+            logger.info(
+                f"GPU{idx} {msg}: avg={1.0/avg:.1f} it/s, "
+                f"p1={percentiles[0]:.2g}s, p5={percentiles[1]:.2g}s, "
+                f"p95={percentiles[2]:.2g}s, p99={percentiles[3]:.2g}s."
+            )
+
+    def benchmark_dataset(self, num_iter, warmup=5):
+        """
+        Benchmark the speed of taking raw samples from the dataset.
+        """
+
+        def loader():
+            while True:
+                for k in self.sampler:
+                    yield self.dataset[k]
+
+        self._benchmark(loader(), num_iter, warmup, "Dataset Alone")
+
+    def benchmark_mapper(self, num_iter, warmup=5):
+        """
+        Benchmark the speed of taking raw samples from the dataset and map
+        them in a single process.
+        """
+
+        def loader():
+            while True:
+                for k in self.sampler:
+                    yield self.mapper(self.dataset[k])
+
+        self._benchmark(loader(), num_iter, warmup, "Single Process Mapper (sec/sample)")
+
+    def benchmark_workers(self, num_iter, warmup=10):
+        """
+        Benchmark the dataloader by tuning num_workers to [0, 1, self.num_workers].
+        """
+        candidates = [0, 1]
+        if self.num_workers not in candidates:
+            candidates.append(self.num_workers)
+
+        dataset = MapDataset(self.dataset, self.mapper)
+        for n in candidates:
+            loader = build_batch_data_loader(
+                dataset,
+                self.sampler,
+                self.total_batch_size,
+                num_workers=n,
+            )
+            self._benchmark(
+                iter(loader),
+                num_iter * max(n, 1),
+                warmup * max(n, 1),
+                f"DataLoader ({n} workers, bs={self.per_gpu_batch_size})",
+            )
+            del loader
+
+    def benchmark_IPC(self, num_iter, warmup=10):
+        """
+        Benchmark the dataloader where each worker outputs nothing. This
+        eliminates the IPC overhead compared to the regular dataloader.
+
+        PyTorch multiprocessing's IPC only optimizes for torch tensors.
+        Large numpy arrays or other data structure may incur large IPC overhead.
+        """
+        n = self.num_workers
+        dataset = _EmptyMapDataset(MapDataset(self.dataset, self.mapper))
+        loader = build_batch_data_loader(
+            dataset, self.sampler, self.total_batch_size, num_workers=n
+        )
+        self._benchmark(
+            iter(loader),
+            num_iter * max(n, 1),
+            warmup * max(n, 1),
+            f"DataLoader ({n} workers, bs={self.per_gpu_batch_size}) w/o comm",
+        )
+
+    def benchmark_distributed(self, num_iter, warmup=10):
+        """
+        Benchmark the dataloader in each distributed worker, and log results of
+        all workers. This helps understand the final performance as well as
+        the variances among workers.
+
+        It also prints startup time (first iter) of the dataloader.
+        """
+        gpu = comm.get_world_size()
+        dataset = MapDataset(self.dataset, self.mapper)
+        n = self.num_workers
+        loader = build_batch_data_loader(
+            dataset, self.sampler, self.total_batch_size, num_workers=n
+        )
+
+        timer = Timer()
+        loader = iter(loader)
+        next(loader)
+        startup_time = timer.seconds()
+        logger.info("Dataloader startup time: {:.2f} seconds".format(startup_time))
+
+        comm.synchronize()
+
+        avg, all_times = self._benchmark(loader, num_iter * max(n, 1), warmup * max(n, 1))
+        del loader
+        self._log_time(
+            f"DataLoader ({gpu} GPUs x {n} workers, total bs={self.total_batch_size})",
+            avg,
+            all_times,
+            True,
+        )
diff --git a/detectron2/data/build.py b/detectron2/data/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..42867687e329c67f37cb7c5e938fd44150500d20
--- /dev/null
+++ b/detectron2/data/build.py
@@ -0,0 +1,678 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+import numpy as np
+import operator
+import pickle
+from collections import OrderedDict, defaultdict
+from typing import Any, Callable, Dict, List, Optional, Union
+import torch
+import torch.utils.data as torchdata
+from tabulate import tabulate
+from termcolor import colored
+
+from detectron2.config import configurable
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import get_world_size
+from detectron2.utils.env import seed_all_rng
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import _log_api_usage, log_first_n
+
+from .catalog import DatasetCatalog, MetadataCatalog
+from .common import AspectRatioGroupedDataset, DatasetFromList, MapDataset, ToIterableDataset
+from .dataset_mapper import DatasetMapper
+from .detection_utils import check_metadata_consistency
+from .samplers import (
+    InferenceSampler,
+    RandomSubsetTrainingSampler,
+    RepeatFactorTrainingSampler,
+    TrainingSampler,
+)
+
+"""
+This file contains the default logic to build a dataloader for training or testing.
+"""
+
+__all__ = [
+    "build_batch_data_loader",
+    "build_detection_train_loader",
+    "build_detection_test_loader",
+    "get_detection_dataset_dicts",
+    "load_proposals_into_dataset",
+    "print_instances_class_histogram",
+]
+
+
+def filter_images_with_only_crowd_annotations(dataset_dicts):
+    """
+    Filter out images with none annotations or only crowd annotations
+    (i.e., images without non-crowd annotations).
+    A common training-time preprocessing on COCO dataset.
+
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+
+    Returns:
+        list[dict]: the same format, but filtered.
+    """
+    num_before = len(dataset_dicts)
+
+    def valid(anns):
+        for ann in anns:
+            if ann.get("iscrowd", 0) == 0:
+                return True
+        return False
+
+    dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])]
+    num_after = len(dataset_dicts)
+    logger = logging.getLogger(__name__)
+    logger.info(
+        "Removed {} images with no usable annotations. {} images left.".format(
+            num_before - num_after, num_after
+        )
+    )
+    return dataset_dicts
+
+
+def filter_images_with_few_keypoints(dataset_dicts, min_keypoints_per_image):
+    """
+    Filter out images with too few number of keypoints.
+
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+
+    Returns:
+        list[dict]: the same format as dataset_dicts, but filtered.
+    """
+    num_before = len(dataset_dicts)
+
+    def visible_keypoints_in_image(dic):
+        # Each keypoints field has the format [x1, y1, v1, ...], where v is visibility
+        annotations = dic["annotations"]
+        return sum(
+            (np.array(ann["keypoints"][2::3]) > 0).sum()
+            for ann in annotations
+            if "keypoints" in ann
+        )
+
+    dataset_dicts = [
+        x for x in dataset_dicts if visible_keypoints_in_image(x) >= min_keypoints_per_image
+    ]
+    num_after = len(dataset_dicts)
+    logger = logging.getLogger(__name__)
+    logger.info(
+        "Removed {} images with fewer than {} keypoints.".format(
+            num_before - num_after, min_keypoints_per_image
+        )
+    )
+    return dataset_dicts
+
+
+def load_proposals_into_dataset(dataset_dicts, proposal_file):
+    """
+    Load precomputed object proposals into the dataset.
+
+    The proposal file should be a pickled dict with the following keys:
+
+    - "ids": list[int] or list[str], the image ids
+    - "boxes": list[np.ndarray], each is an Nx4 array of boxes corresponding to the image id
+    - "objectness_logits": list[np.ndarray], each is an N sized array of objectness scores
+      corresponding to the boxes.
+    - "bbox_mode": the BoxMode of the boxes array. Defaults to ``BoxMode.XYXY_ABS``.
+
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+        proposal_file (str): file path of pre-computed proposals, in pkl format.
+
+    Returns:
+        list[dict]: the same format as dataset_dicts, but added proposal field.
+    """
+    logger = logging.getLogger(__name__)
+    logger.info("Loading proposals from: {}".format(proposal_file))
+
+    with PathManager.open(proposal_file, "rb") as f:
+        proposals = pickle.load(f, encoding="latin1")
+
+    # Rename the key names in D1 proposal files
+    rename_keys = {"indexes": "ids", "scores": "objectness_logits"}
+    for key in rename_keys:
+        if key in proposals:
+            proposals[rename_keys[key]] = proposals.pop(key)
+
+    # Fetch the indexes of all proposals that are in the dataset
+    # Convert image_id to str since they could be int.
+    img_ids = set({str(record["image_id"]) for record in dataset_dicts})
+    id_to_index = {str(id): i for i, id in enumerate(proposals["ids"]) if str(id) in img_ids}
+
+    # Assuming default bbox_mode of precomputed proposals are 'XYXY_ABS'
+    bbox_mode = BoxMode(proposals["bbox_mode"]) if "bbox_mode" in proposals else BoxMode.XYXY_ABS
+
+    for record in dataset_dicts:
+        # Get the index of the proposal
+        i = id_to_index[str(record["image_id"])]
+
+        boxes = proposals["boxes"][i]
+        objectness_logits = proposals["objectness_logits"][i]
+        # Sort the proposals in descending order of the scores
+        inds = objectness_logits.argsort()[::-1]
+        record["proposal_boxes"] = boxes[inds]
+        record["proposal_objectness_logits"] = objectness_logits[inds]
+        record["proposal_bbox_mode"] = bbox_mode
+
+    return dataset_dicts
+
+
+def print_instances_class_histogram(dataset_dicts, class_names):
+    """
+    Args:
+        dataset_dicts (list[dict]): list of dataset dicts.
+        class_names (list[str]): list of class names (zero-indexed).
+    """
+    num_classes = len(class_names)
+    hist_bins = np.arange(num_classes + 1)
+    histogram = np.zeros((num_classes,), dtype=int)
+    for entry in dataset_dicts:
+        annos = entry["annotations"]
+        classes = np.asarray(
+            [x["category_id"] for x in annos if not x.get("iscrowd", 0)], dtype=int
+        )
+        if len(classes):
+            assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}"
+            assert (
+                classes.max() < num_classes
+            ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes"
+        histogram += np.histogram(classes, bins=hist_bins)[0]
+
+    N_COLS = min(6, len(class_names) * 2)
+
+    def short_name(x):
+        # make long class names shorter. useful for lvis
+        if len(x) > 13:
+            return x[:11] + ".."
+        return x
+
+    data = list(
+        itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)])
+    )
+    total_num_instances = sum(data[1::2])
+    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
+    if num_classes > 1:
+        data.extend(["total", total_num_instances])
+    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
+    table = tabulate(
+        data,
+        headers=["category", "#instances"] * (N_COLS // 2),
+        tablefmt="pipe",
+        numalign="left",
+        stralign="center",
+    )
+    log_first_n(
+        logging.INFO,
+        "Distribution of instances among all {} categories:\n".format(num_classes)
+        + colored(table, "cyan"),
+        key="message",
+    )
+
+
+def get_detection_dataset_dicts(
+    names,
+    filter_empty=True,
+    min_keypoints=0,
+    proposal_files=None,
+    check_consistency=True,
+):
+    """
+    Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.
+
+    Args:
+        names (str or list[str]): a dataset name or a list of dataset names
+        filter_empty (bool): whether to filter out images without instance annotations
+        min_keypoints (int): filter out images with fewer keypoints than
+            `min_keypoints`. Set to 0 to do nothing.
+        proposal_files (list[str]): if given, a list of object proposal files
+            that match each dataset in `names`.
+        check_consistency (bool): whether to check if datasets have consistent metadata.
+
+    Returns:
+        list[dict]: a list of dicts following the standard dataset dict format.
+    """
+    if isinstance(names, str):
+        names = [names]
+    assert len(names), names
+
+    available_datasets = DatasetCatalog.keys()
+    names_set = set(names)
+    if not names_set.issubset(available_datasets):
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            "The following dataset names are not registered in the DatasetCatalog: "
+            f"{names_set - available_datasets}. "
+            f"Available datasets are {available_datasets}"
+        )
+
+    dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names]
+
+    if isinstance(dataset_dicts[0], torchdata.Dataset):
+        if len(dataset_dicts) > 1:
+            # ConcatDataset does not work for iterable style dataset.
+            # We could support concat for iterable as well, but it's often
+            # not a good idea to concat iterables anyway.
+            return torchdata.ConcatDataset(dataset_dicts)
+        return dataset_dicts[0]
+
+    for dataset_name, dicts in zip(names, dataset_dicts):
+        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
+
+    if proposal_files is not None:
+        assert len(names) == len(proposal_files)
+        # load precomputed proposals from proposal files
+        dataset_dicts = [
+            load_proposals_into_dataset(dataset_i_dicts, proposal_file)
+            for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
+        ]
+
+    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
+
+    has_instances = "annotations" in dataset_dicts[0]
+    if filter_empty and has_instances:
+        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
+    if min_keypoints > 0 and has_instances:
+        dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)
+
+    if check_consistency and has_instances:
+        try:
+            class_names = MetadataCatalog.get(names[0]).thing_classes
+            check_metadata_consistency("thing_classes", names)
+            print_instances_class_histogram(dataset_dicts, class_names)
+        except AttributeError:  # class names are not available for this dataset
+            pass
+
+    assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
+    return dataset_dicts
+
+
+def build_batch_data_loader(
+    dataset,
+    sampler,
+    total_batch_size,
+    *,
+    aspect_ratio_grouping=False,
+    num_workers=0,
+    collate_fn=None,
+    drop_last: bool = True,
+    single_gpu_batch_size=None,
+    seed=None,
+    **kwargs,
+):
+    """
+    Build a batched dataloader. The main differences from `torch.utils.data.DataLoader` are:
+    1. support aspect ratio grouping options
+    2. use no "batch collation", because this is common for detection training
+
+    Args:
+        dataset (torch.utils.data.Dataset): a pytorch map-style or iterable dataset.
+        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices.
+            Must be provided iff. ``dataset`` is a map-style dataset.
+        total_batch_size, aspect_ratio_grouping, num_workers, collate_fn: see
+            :func:`build_detection_train_loader`.
+        single_gpu_batch_size: You can specify either `single_gpu_batch_size` or `total_batch_size`.
+            `single_gpu_batch_size` specifies the batch size that will be used for each gpu/process.
+            `total_batch_size` allows you to specify the total aggregate batch size across gpus.
+            It is an error to supply a value for both.
+        drop_last (bool): if ``True``, the dataloader will drop incomplete batches.
+
+    Returns:
+        iterable[list]. Length of each list is the batch size of the current
+            GPU. Each element in the list comes from the dataset.
+    """
+    if single_gpu_batch_size:
+        if total_batch_size:
+            raise ValueError(
+                """total_batch_size and single_gpu_batch_size are mutually incompatible.
+                Please specify only one. """
+            )
+        batch_size = single_gpu_batch_size
+    else:
+        world_size = get_world_size()
+        assert (
+            total_batch_size > 0 and total_batch_size % world_size == 0
+        ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
+            total_batch_size, world_size
+        )
+        batch_size = total_batch_size // world_size
+    logger = logging.getLogger(__name__)
+    logger.info("Making batched data loader with batch_size=%d", batch_size)
+
+    if isinstance(dataset, torchdata.IterableDataset):
+        assert sampler is None, "sampler must be None if dataset is IterableDataset"
+    else:
+        dataset = ToIterableDataset(dataset, sampler, shard_chunk_size=batch_size)
+
+    generator = None
+    if seed is not None:
+        generator = torch.Generator()
+        generator.manual_seed(seed)
+
+    if aspect_ratio_grouping:
+        assert drop_last, "Aspect ratio grouping will drop incomplete batches."
+        data_loader = torchdata.DataLoader(
+            dataset,
+            num_workers=num_workers,
+            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
+            worker_init_fn=worker_init_reset_seed,
+            generator=generator,
+            **kwargs
+        )  # yield individual mapped dict
+        data_loader = AspectRatioGroupedDataset(data_loader, batch_size)
+        if collate_fn is None:
+            return data_loader
+        return MapDataset(data_loader, collate_fn)
+    else:
+        return torchdata.DataLoader(
+            dataset,
+            batch_size=batch_size,
+            drop_last=drop_last,
+            num_workers=num_workers,
+            collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
+            worker_init_fn=worker_init_reset_seed,
+            generator=generator,
+            **kwargs
+        )
+
+
+def _get_train_datasets_repeat_factors(cfg) -> Dict[str, float]:
+    repeat_factors = cfg.DATASETS.TRAIN_REPEAT_FACTOR
+    assert all(len(tup) == 2 for tup in repeat_factors)
+    name_to_weight = defaultdict(lambda: 1, dict(repeat_factors))
+    # The sampling weights map should only contain datasets in train config
+    unrecognized = set(name_to_weight.keys()) - set(cfg.DATASETS.TRAIN)
+    assert not unrecognized, f"unrecognized datasets: {unrecognized}"
+    logger = logging.getLogger(__name__)
+    logger.info(f"Found repeat factors: {list(name_to_weight.items())}")
+
+    # pyre-fixme[7]: Expected `Dict[str, float]` but got `DefaultDict[typing.Any, int]`.
+    return name_to_weight
+
+
+def _build_weighted_sampler(cfg, enable_category_balance=False):
+    dataset_repeat_factors = _get_train_datasets_repeat_factors(cfg)
+    # OrderedDict to guarantee order of values() consistent with repeat factors
+    dataset_name_to_dicts = OrderedDict(
+        {
+            name: get_detection_dataset_dicts(
+                [name],
+                filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
+                min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
+                if cfg.MODEL.KEYPOINT_ON
+                else 0,
+                proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
+                if cfg.MODEL.LOAD_PROPOSALS
+                else None,
+            )
+            for name in cfg.DATASETS.TRAIN
+        }
+    )
+    # Repeat factor for every sample in the dataset
+    repeat_factors = [
+        [dataset_repeat_factors[dsname]] * len(dataset_name_to_dicts[dsname])
+        for dsname in cfg.DATASETS.TRAIN
+    ]
+
+    repeat_factors = list(itertools.chain.from_iterable(repeat_factors))
+
+    repeat_factors = torch.tensor(repeat_factors)
+    logger = logging.getLogger(__name__)
+    if enable_category_balance:
+        """
+        1. Calculate repeat factors using category frequency for each dataset and then merge them.
+        2. Element wise dot producting the dataset frequency repeat factors with
+            the category frequency repeat factors gives the final repeat factors.
+        """
+        category_repeat_factors = [
+            RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
+                dataset_dict, cfg.DATALOADER.REPEAT_THRESHOLD
+            )
+            for dataset_dict in dataset_name_to_dicts.values()
+        ]
+        # flatten the category repeat factors from all datasets
+        category_repeat_factors = list(itertools.chain.from_iterable(category_repeat_factors))
+        category_repeat_factors = torch.tensor(category_repeat_factors)
+        repeat_factors = torch.mul(category_repeat_factors, repeat_factors)
+        repeat_factors = repeat_factors / torch.min(repeat_factors)
+        logger.info(
+            "Using WeightedCategoryTrainingSampler with repeat_factors={}".format(
+                cfg.DATASETS.TRAIN_REPEAT_FACTOR
+            )
+        )
+    else:
+        logger.info(
+            "Using WeightedTrainingSampler with repeat_factors={}".format(
+                cfg.DATASETS.TRAIN_REPEAT_FACTOR
+            )
+        )
+
+    sampler = RepeatFactorTrainingSampler(repeat_factors)
+    return sampler
+
+
+def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
+    if dataset is None:
+        dataset = get_detection_dataset_dicts(
+            cfg.DATASETS.TRAIN,
+            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
+            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
+            if cfg.MODEL.KEYPOINT_ON
+            else 0,
+            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
+        )
+        _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])
+
+    if mapper is None:
+        mapper = DatasetMapper(cfg, True)
+
+    if sampler is None:
+        sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
+        logger = logging.getLogger(__name__)
+        if isinstance(dataset, torchdata.IterableDataset):
+            logger.info("Not using any sampler since the dataset is IterableDataset.")
+            sampler = None
+        else:
+            logger.info("Using training sampler {}".format(sampler_name))
+            if sampler_name == "TrainingSampler":
+                sampler = TrainingSampler(len(dataset))
+            elif sampler_name == "RepeatFactorTrainingSampler":
+                repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
+                    dataset, cfg.DATALOADER.REPEAT_THRESHOLD
+                )
+                sampler = RepeatFactorTrainingSampler(repeat_factors)
+            elif sampler_name == "RandomSubsetTrainingSampler":
+                sampler = RandomSubsetTrainingSampler(
+                    len(dataset), cfg.DATALOADER.RANDOM_SUBSET_RATIO
+                )
+            elif sampler_name == "WeightedTrainingSampler":
+                sampler = _build_weighted_sampler(cfg)
+            elif sampler_name == "WeightedCategoryTrainingSampler":
+                sampler = _build_weighted_sampler(cfg, enable_category_balance=True)
+            else:
+                raise ValueError("Unknown training sampler: {}".format(sampler_name))
+
+    return {
+        "dataset": dataset,
+        "sampler": sampler,
+        "mapper": mapper,
+        "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
+        "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
+        "num_workers": cfg.DATALOADER.NUM_WORKERS,
+    }
+
+
+@configurable(from_config=_train_loader_from_config)
+def build_detection_train_loader(
+    dataset,
+    *,
+    mapper,
+    sampler=None,
+    total_batch_size,
+    aspect_ratio_grouping=True,
+    num_workers=0,
+    collate_fn=None,
+    **kwargs
+):
+    """
+    Build a dataloader for object detection with some default features.
+
+    Args:
+        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
+            or a pytorch dataset (either map-style or iterable). It can be obtained
+            by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
+        mapper (callable): a callable which takes a sample (dict) from dataset and
+            returns the format to be consumed by the model.
+            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
+        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
+            indices to be applied on ``dataset``.
+            If ``dataset`` is map-style, the default sampler is a :class:`TrainingSampler`,
+            which coordinates an infinite random shuffle sequence across all workers.
+            Sampler must be None if ``dataset`` is iterable.
+        total_batch_size (int): total batch size across all workers.
+        aspect_ratio_grouping (bool): whether to group images with similar
+            aspect ratio for efficiency. When enabled, it requires each
+            element in dataset be a dict with keys "width" and "height".
+        num_workers (int): number of parallel data loading workers
+        collate_fn: a function that determines how to do batching, same as the argument of
+            `torch.utils.data.DataLoader`. Defaults to do no collation and return a list of
+            data. No collation is OK for small batch size and simple data structures.
+            If your batch size is large and each sample contains too many small tensors,
+            it's more efficient to collate them in data loader.
+
+    Returns:
+        torch.utils.data.DataLoader:
+            a dataloader. Each output from it is a ``list[mapped_element]`` of length
+            ``total_batch_size / num_workers``, where ``mapped_element`` is produced
+            by the ``mapper``.
+    """
+    if isinstance(dataset, list):
+        dataset = DatasetFromList(dataset, copy=False)
+    if mapper is not None:
+        dataset = MapDataset(dataset, mapper)
+
+    if isinstance(dataset, torchdata.IterableDataset):
+        assert sampler is None, "sampler must be None if dataset is IterableDataset"
+    else:
+        if sampler is None:
+            sampler = TrainingSampler(len(dataset))
+        assert isinstance(sampler, torchdata.Sampler), f"Expect a Sampler but got {type(sampler)}"
+    return build_batch_data_loader(
+        dataset,
+        sampler,
+        total_batch_size,
+        aspect_ratio_grouping=aspect_ratio_grouping,
+        num_workers=num_workers,
+        collate_fn=collate_fn,
+        **kwargs
+    )
+
+
+def _test_loader_from_config(cfg, dataset_name, mapper=None):
+    """
+    Uses the given `dataset_name` argument (instead of the names in cfg), because the
+    standard practice is to evaluate each test set individually (not combining them).
+    """
+    if isinstance(dataset_name, str):
+        dataset_name = [dataset_name]
+
+    dataset = get_detection_dataset_dicts(
+        dataset_name,
+        filter_empty=False,
+        proposal_files=[
+            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name
+        ]
+        if cfg.MODEL.LOAD_PROPOSALS
+        else None,
+    )
+    if mapper is None:
+        mapper = DatasetMapper(cfg, False)
+    return {
+        "dataset": dataset,
+        "mapper": mapper,
+        "num_workers": cfg.DATALOADER.NUM_WORKERS,
+        "sampler": InferenceSampler(len(dataset))
+        if not isinstance(dataset, torchdata.IterableDataset)
+        else None,
+    }
+
+
+@configurable(from_config=_test_loader_from_config)
+def build_detection_test_loader(
+    dataset: Union[List[Any], torchdata.Dataset],
+    *,
+    mapper: Callable[[Dict[str, Any]], Any],
+    sampler: Optional[torchdata.Sampler] = None,
+    batch_size: int = 1,
+    num_workers: int = 0,
+    collate_fn: Optional[Callable[[List[Any]], Any]] = None,
+) -> torchdata.DataLoader:
+    """
+    Similar to `build_detection_train_loader`, with default batch size = 1,
+    and sampler = :class:`InferenceSampler`. This sampler coordinates all workers
+    to produce the exact set of all samples.
+
+    Args:
+        dataset: a list of dataset dicts,
+            or a pytorch dataset (either map-style or iterable). They can be obtained
+            by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
+        mapper: a callable which takes a sample (dict) from dataset
+           and returns the format to be consumed by the model.
+           When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
+        sampler: a sampler that produces
+            indices to be applied on ``dataset``. Default to :class:`InferenceSampler`,
+            which splits the dataset across all workers. Sampler must be None
+            if `dataset` is iterable.
+        batch_size: the batch size of the data loader to be created.
+            Default to 1 image per worker since this is the standard when reporting
+            inference time in papers.
+        num_workers: number of parallel data loading workers
+        collate_fn: same as the argument of `torch.utils.data.DataLoader`.
+            Defaults to do no collation and return a list of data.
+
+    Returns:
+        DataLoader: a torch DataLoader, that loads the given detection
+        dataset, with test-time transformation and batching.
+
+    Examples:
+    ::
+        data_loader = build_detection_test_loader(
+            DatasetRegistry.get("my_test"),
+            mapper=DatasetMapper(...))
+
+        # or, instantiate with a CfgNode:
+        data_loader = build_detection_test_loader(cfg, "my_test")
+    """
+    if isinstance(dataset, list):
+        dataset = DatasetFromList(dataset, copy=False)
+    if mapper is not None:
+        dataset = MapDataset(dataset, mapper)
+    if isinstance(dataset, torchdata.IterableDataset):
+        assert sampler is None, "sampler must be None if dataset is IterableDataset"
+    else:
+        if sampler is None:
+            sampler = InferenceSampler(len(dataset))
+    return torchdata.DataLoader(
+        dataset,
+        batch_size=batch_size,
+        sampler=sampler,
+        drop_last=False,
+        num_workers=num_workers,
+        collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
+    )
+
+
+def trivial_batch_collator(batch):
+    """
+    A batch collator that does nothing.
+    """
+    return batch
+
+
+def worker_init_reset_seed(worker_id):
+    initial_seed = torch.initial_seed() % 2**31
+    seed_all_rng(initial_seed + worker_id)
diff --git a/detectron2/data/catalog.py b/detectron2/data/catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..45c110c19508f23921b9033cdaf0aa8056f0c125
--- /dev/null
+++ b/detectron2/data/catalog.py
@@ -0,0 +1,236 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import types
+from collections import UserDict
+from typing import List
+
+from detectron2.utils.logger import log_first_n
+
+__all__ = ["DatasetCatalog", "MetadataCatalog", "Metadata"]
+
+
+class _DatasetCatalog(UserDict):
+    """
+    A global dictionary that stores information about the datasets and how to obtain them.
+
+    It contains a mapping from strings
+    (which are names that identify a dataset, e.g. "coco_2014_train")
+    to a function which parses the dataset and returns the samples in the
+    format of `list[dict]`.
+
+    The returned dicts should be in Detectron2 Dataset format (See DATASETS.md for details)
+    if used with the data loader functionalities in `data/build.py,data/detection_transform.py`.
+
+    The purpose of having this catalog is to make it easy to choose
+    different datasets, by just using the strings in the config.
+    """
+
+    def register(self, name, func):
+        """
+        Args:
+            name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+            func (callable): a callable which takes no arguments and returns a list of dicts.
+                It must return the same results if called multiple times.
+        """
+        assert callable(func), "You must register a function with `DatasetCatalog.register`!"
+        assert name not in self, "Dataset '{}' is already registered!".format(name)
+        self[name] = func
+
+    def get(self, name):
+        """
+        Call the registered function and return its results.
+
+        Args:
+            name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+
+        Returns:
+            list[dict]: dataset annotations.
+        """
+        try:
+            f = self[name]
+        except KeyError as e:
+            raise KeyError(
+                "Dataset '{}' is not registered! Available datasets are: {}".format(
+                    name, ", ".join(list(self.keys()))
+                )
+            ) from e
+        return f()
+
+    def list(self) -> List[str]:
+        """
+        List all registered datasets.
+
+        Returns:
+            list[str]
+        """
+        return list(self.keys())
+
+    def remove(self, name):
+        """
+        Alias of ``pop``.
+        """
+        self.pop(name)
+
+    def __str__(self):
+        return "DatasetCatalog(registered datasets: {})".format(", ".join(self.keys()))
+
+    __repr__ = __str__
+
+
+DatasetCatalog = _DatasetCatalog()
+DatasetCatalog.__doc__ = (
+    _DatasetCatalog.__doc__
+    + """
+    .. automethod:: detectron2.data.catalog.DatasetCatalog.register
+    .. automethod:: detectron2.data.catalog.DatasetCatalog.get
+"""
+)
+
+
+class Metadata(types.SimpleNamespace):
+    """
+    A class that supports simple attribute setter/getter.
+    It is intended for storing metadata of a dataset and make it accessible globally.
+
+    Examples:
+    ::
+        # somewhere when you load the data:
+        MetadataCatalog.get("mydataset").thing_classes = ["person", "dog"]
+
+        # somewhere when you print statistics or visualize:
+        classes = MetadataCatalog.get("mydataset").thing_classes
+    """
+
+    # the name of the dataset
+    # set default to N/A so that `self.name` in the errors will not trigger getattr again
+    name: str = "N/A"
+
+    _RENAMED = {
+        "class_names": "thing_classes",
+        "dataset_id_to_contiguous_id": "thing_dataset_id_to_contiguous_id",
+        "stuff_class_names": "stuff_classes",
+    }
+
+    def __getattr__(self, key):
+        if key in self._RENAMED:
+            log_first_n(
+                logging.WARNING,
+                "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+                n=10,
+            )
+            return getattr(self, self._RENAMED[key])
+
+        # "name" exists in every metadata
+        if len(self.__dict__) > 1:
+            raise AttributeError(
+                "Attribute '{}' does not exist in the metadata of dataset '{}'. Available "
+                "keys are {}.".format(key, self.name, str(self.__dict__.keys()))
+            )
+        else:
+            raise AttributeError(
+                f"Attribute '{key}' does not exist in the metadata of dataset '{self.name}': "
+                "metadata is empty."
+            )
+
+    def __setattr__(self, key, val):
+        if key in self._RENAMED:
+            log_first_n(
+                logging.WARNING,
+                "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+                n=10,
+            )
+            setattr(self, self._RENAMED[key], val)
+
+        # Ensure that metadata of the same name stays consistent
+        try:
+            oldval = getattr(self, key)
+            assert oldval == val, (
+                "Attribute '{}' in the metadata of '{}' cannot be set "
+                "to a different value!\n{} != {}".format(key, self.name, oldval, val)
+            )
+        except AttributeError:
+            super().__setattr__(key, val)
+
+    def as_dict(self):
+        """
+        Returns all the metadata as a dict.
+        Note that modifications to the returned dict will not reflect on the Metadata object.
+        """
+        return copy.copy(self.__dict__)
+
+    def set(self, **kwargs):
+        """
+        Set multiple metadata with kwargs.
+        """
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        return self
+
+    def get(self, key, default=None):
+        """
+        Access an attribute and return its value if exists.
+        Otherwise return default.
+        """
+        try:
+            return getattr(self, key)
+        except AttributeError:
+            return default
+
+
+class _MetadataCatalog(UserDict):
+    """
+    MetadataCatalog is a global dictionary that provides access to
+    :class:`Metadata` of a given dataset.
+
+    The metadata associated with a certain name is a singleton: once created, the
+    metadata will stay alive and will be returned by future calls to ``get(name)``.
+
+    It's like global variables, so don't abuse it.
+    It's meant for storing knowledge that's constant and shared across the execution
+    of the program, e.g.: the class names in COCO.
+    """
+
+    def get(self, name):
+        """
+        Args:
+            name (str): name of a dataset (e.g. coco_2014_train).
+
+        Returns:
+            Metadata: The :class:`Metadata` instance associated with this name,
+            or create an empty one if none is available.
+        """
+        assert len(name)
+        r = super().get(name, None)
+        if r is None:
+            r = self[name] = Metadata(name=name)
+        return r
+
+    def list(self):
+        """
+        List all registered metadata.
+
+        Returns:
+            list[str]: keys (names of datasets) of all registered metadata
+        """
+        return list(self.keys())
+
+    def remove(self, name):
+        """
+        Alias of ``pop``.
+        """
+        self.pop(name)
+
+    def __str__(self):
+        return "MetadataCatalog(registered metadata: {})".format(", ".join(self.keys()))
+
+    __repr__ = __str__
+
+
+MetadataCatalog = _MetadataCatalog()
+MetadataCatalog.__doc__ = (
+    _MetadataCatalog.__doc__
+    + """
+    .. automethod:: detectron2.data.catalog.MetadataCatalog.get
+"""
+)
diff --git a/detectron2/data/common.py b/detectron2/data/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..38770597093360c46fe72719a8d2ec428874aed4
--- /dev/null
+++ b/detectron2/data/common.py
@@ -0,0 +1,339 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import contextlib
+import copy
+import itertools
+import logging
+import numpy as np
+import pickle
+import random
+from typing import Callable, Union
+import torch
+import torch.utils.data as data
+from torch.utils.data.sampler import Sampler
+
+from detectron2.utils.serialize import PicklableWrapper
+
+__all__ = ["MapDataset", "DatasetFromList", "AspectRatioGroupedDataset", "ToIterableDataset"]
+
+logger = logging.getLogger(__name__)
+
+
+# copied from: https://docs.python.org/3/library/itertools.html#recipes
+def _roundrobin(*iterables):
+    "roundrobin('ABC', 'D', 'EF') --> A D E B F C"
+    # Recipe credited to George Sakkis
+    num_active = len(iterables)
+    nexts = itertools.cycle(iter(it).__next__ for it in iterables)
+    while num_active:
+        try:
+            for next in nexts:
+                yield next()
+        except StopIteration:
+            # Remove the iterator we just exhausted from the cycle.
+            num_active -= 1
+            nexts = itertools.cycle(itertools.islice(nexts, num_active))
+
+
+def _shard_iterator_dataloader_worker(iterable, chunk_size=1):
+    # Shard the iterable if we're currently inside pytorch dataloader worker.
+    worker_info = data.get_worker_info()
+    if worker_info is None or worker_info.num_workers == 1:
+        # do nothing
+        yield from iterable
+    else:
+        # worker0: 0, 1, ..., chunk_size-1, num_workers*chunk_size, num_workers*chunk_size+1, ...
+        # worker1: chunk_size, chunk_size+1, ...
+        # worker2: 2*chunk_size, 2*chunk_size+1, ...
+        # ...
+        yield from _roundrobin(
+            *[
+                itertools.islice(
+                    iterable,
+                    worker_info.id * chunk_size + chunk_i,
+                    None,
+                    worker_info.num_workers * chunk_size,
+                )
+                for chunk_i in range(chunk_size)
+            ]
+        )
+
+
+class _MapIterableDataset(data.IterableDataset):
+    """
+    Map a function over elements in an IterableDataset.
+
+    Similar to pytorch's MapIterDataPipe, but support filtering when map_func
+    returns None.
+
+    This class is not public-facing. Will be called by `MapDataset`.
+    """
+
+    def __init__(self, dataset, map_func):
+        self._dataset = dataset
+        self._map_func = PicklableWrapper(map_func)  # wrap so that a lambda will work
+
+    def __len__(self):
+        return len(self._dataset)
+
+    def __iter__(self):
+        for x in map(self._map_func, self._dataset):
+            if x is not None:
+                yield x
+
+
+class MapDataset(data.Dataset):
+    """
+    Map a function over the elements in a dataset.
+    """
+
+    def __init__(self, dataset, map_func):
+        """
+        Args:
+            dataset: a dataset where map function is applied. Can be either
+                map-style or iterable dataset. When given an iterable dataset,
+                the returned object will also be an iterable dataset.
+            map_func: a callable which maps the element in dataset. map_func can
+                return None to skip the data (e.g. in case of errors).
+                How None is handled depends on the style of `dataset`.
+                If `dataset` is map-style, it randomly tries other elements.
+                If `dataset` is iterable, it skips the data and tries the next.
+        """
+        self._dataset = dataset
+        self._map_func = PicklableWrapper(map_func)  # wrap so that a lambda will work
+
+        self._rng = random.Random(42)
+        self._fallback_candidates = set(range(len(dataset)))
+
+    def __new__(cls, dataset, map_func):
+        is_iterable = isinstance(dataset, data.IterableDataset)
+        if is_iterable:
+            return _MapIterableDataset(dataset, map_func)
+        else:
+            return super().__new__(cls)
+
+    def __getnewargs__(self):
+        return self._dataset, self._map_func
+
+    def __len__(self):
+        return len(self._dataset)
+
+    def __getitem__(self, idx):
+        retry_count = 0
+        cur_idx = int(idx)
+
+        while True:
+            data = self._map_func(self._dataset[cur_idx])
+            if data is not None:
+                self._fallback_candidates.add(cur_idx)
+                return data
+
+            # _map_func fails for this idx, use a random new index from the pool
+            retry_count += 1
+            self._fallback_candidates.discard(cur_idx)
+            cur_idx = self._rng.sample(self._fallback_candidates, k=1)[0]
+
+            if retry_count >= 3:
+                logger = logging.getLogger(__name__)
+                logger.warning(
+                    "Failed to apply `_map_func` for idx: {}, retry count: {}".format(
+                        idx, retry_count
+                    )
+                )
+
+
+class _TorchSerializedList:
+    """
+    A list-like object whose items are serialized and stored in a torch tensor. When
+    launching a process that uses TorchSerializedList with "fork" start method,
+    the subprocess can read the same buffer without triggering copy-on-access. When
+    launching a process that uses TorchSerializedList with "spawn/forkserver" start
+    method, the list will be pickled by a special ForkingPickler registered by PyTorch
+    that moves data to shared memory. In both cases, this allows parent and child
+    processes to share RAM for the list data, hence avoids the issue in
+    https://github.com/pytorch/pytorch/issues/13246.
+
+    See also https://ppwwyyxx.com/blog/2022/Demystify-RAM-Usage-in-Multiprocess-DataLoader/
+    on how it works.
+    """
+
+    def __init__(self, lst: list):
+        self._lst = lst
+
+        def _serialize(data):
+            buffer = pickle.dumps(data, protocol=-1)
+            return np.frombuffer(buffer, dtype=np.uint8)
+
+        logger.info(
+            "Serializing {} elements to byte tensors and concatenating them all ...".format(
+                len(self._lst)
+            )
+        )
+        self._lst = [_serialize(x) for x in self._lst]
+        self._addr = np.asarray([len(x) for x in self._lst], dtype=np.int64)
+        self._addr = torch.from_numpy(np.cumsum(self._addr))
+        self._lst = torch.from_numpy(np.concatenate(self._lst))
+        logger.info("Serialized dataset takes {:.2f} MiB".format(len(self._lst) / 1024**2))
+
+    def __len__(self):
+        return len(self._addr)
+
+    def __getitem__(self, idx):
+        start_addr = 0 if idx == 0 else self._addr[idx - 1].item()
+        end_addr = self._addr[idx].item()
+        bytes = memoryview(self._lst[start_addr:end_addr].numpy())
+
+        # @lint-ignore PYTHONPICKLEISBAD
+        return pickle.loads(bytes)
+
+
+_DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD = _TorchSerializedList
+
+
+@contextlib.contextmanager
+def set_default_dataset_from_list_serialize_method(new):
+    """
+    Context manager for using custom serialize function when creating DatasetFromList
+    """
+
+    global _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD
+    orig = _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD
+    _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD = new
+    yield
+    _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD = orig
+
+
+class DatasetFromList(data.Dataset):
+    """
+    Wrap a list to a torch Dataset. It produces elements of the list as data.
+    """
+
+    def __init__(
+        self,
+        lst: list,
+        copy: bool = True,
+        serialize: Union[bool, Callable] = True,
+    ):
+        """
+        Args:
+            lst (list): a list which contains elements to produce.
+            copy (bool): whether to deepcopy the element when producing it,
+                so that the result can be modified in place without affecting the
+                source in the list.
+            serialize (bool or callable): whether to serialize the stroage to other
+                backend. If `True`, the default serialize method will be used, if given
+                a callable, the callable will be used as serialize method.
+        """
+        self._lst = lst
+        self._copy = copy
+        if not isinstance(serialize, (bool, Callable)):
+            raise TypeError(f"Unsupported type for argument `serailzie`: {serialize}")
+        self._serialize = serialize is not False
+
+        if self._serialize:
+            serialize_method = (
+                serialize
+                if isinstance(serialize, Callable)
+                else _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD
+            )
+            logger.info(f"Serializing the dataset using: {serialize_method}")
+            self._lst = serialize_method(self._lst)
+
+    def __len__(self):
+        return len(self._lst)
+
+    def __getitem__(self, idx):
+        if self._copy and not self._serialize:
+            return copy.deepcopy(self._lst[idx])
+        else:
+            return self._lst[idx]
+
+
+class ToIterableDataset(data.IterableDataset):
+    """
+    Convert an old indices-based (also called map-style) dataset
+    to an iterable-style dataset.
+    """
+
+    def __init__(
+        self,
+        dataset: data.Dataset,
+        sampler: Sampler,
+        shard_sampler: bool = True,
+        shard_chunk_size: int = 1,
+    ):
+        """
+        Args:
+            dataset: an old-style dataset with ``__getitem__``
+            sampler: a cheap iterable that produces indices to be applied on ``dataset``.
+            shard_sampler: whether to shard the sampler based on the current pytorch data loader
+                worker id. When an IterableDataset is forked by pytorch's DataLoader into multiple
+                workers, it is responsible for sharding its data based on worker id so that workers
+                don't produce identical data.
+
+                Most samplers (like our TrainingSampler) do not shard based on dataloader worker id
+                and this argument should be set to True. But certain samplers may be already
+                sharded, in that case this argument should be set to False.
+            shard_chunk_size: when sharding the sampler, each worker will
+        """
+        assert not isinstance(dataset, data.IterableDataset), dataset
+        assert isinstance(sampler, Sampler), sampler
+        self.dataset = dataset
+        self.sampler = sampler
+        self.shard_sampler = shard_sampler
+        self.shard_chunk_size = shard_chunk_size
+
+    def __iter__(self):
+        if not self.shard_sampler:
+            sampler = self.sampler
+        else:
+            # With map-style dataset, `DataLoader(dataset, sampler)` runs the
+            # sampler in main process only. But `DataLoader(ToIterableDataset(dataset, sampler))`
+            # will run sampler in every of the N worker. So we should only keep 1/N of the ids on
+            # each worker. The assumption is that sampler is cheap to iterate so it's fine to
+            # discard ids in workers.
+            sampler = _shard_iterator_dataloader_worker(self.sampler, self.shard_chunk_size)
+        for idx in sampler:
+            yield self.dataset[idx]
+
+    def __len__(self):
+        return len(self.sampler)
+
+
+class AspectRatioGroupedDataset(data.IterableDataset):
+    """
+    Batch data that have similar aspect ratio together.
+    In this implementation, images whose aspect ratio < (or >) 1 will
+    be batched together.
+    This improves training speed because the images then need less padding
+    to form a batch.
+
+    It assumes the underlying dataset produces dicts with "width" and "height" keys.
+    It will then produce a list of original dicts with length = batch_size,
+    all with similar aspect ratios.
+    """
+
+    def __init__(self, dataset, batch_size):
+        """
+        Args:
+            dataset: an iterable. Each element must be a dict with keys
+                "width" and "height", which will be used to batch data.
+            batch_size (int):
+        """
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self._buckets = [[] for _ in range(2)]
+        # Hard-coded two aspect ratio groups: w > h and w < h.
+        # Can add support for more aspect ratio groups, but doesn't seem useful
+
+    def __iter__(self):
+        for d in self.dataset:
+            w, h = d["width"], d["height"]
+            bucket_id = 0 if w > h else 1
+            bucket = self._buckets[bucket_id]
+            bucket.append(d)
+            if len(bucket) == self.batch_size:
+                data = bucket[:]
+                # Clear bucket first, because code after yield is not
+                # guaranteed to execute
+                del bucket[:]
+                yield data
diff --git a/detectron2/data/dataset_mapper.py b/detectron2/data/dataset_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8714f7990f11e146a01e03d108518e0356b50c4
--- /dev/null
+++ b/detectron2/data/dataset_mapper.py
@@ -0,0 +1,191 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import numpy as np
+from typing import List, Optional, Union
+import torch
+
+from detectron2.config import configurable
+
+from . import detection_utils as utils
+from . import transforms as T
+
+"""
+This file contains the default mapping that's applied to "dataset dicts".
+"""
+
+__all__ = ["DatasetMapper"]
+
+
+class DatasetMapper:
+    """
+    A callable which takes a dataset dict in Detectron2 Dataset format,
+    and map it into a format used by the model.
+
+    This is the default callable to be used to map your dataset dict into training data.
+    You may need to follow it to implement your own one for customized logic,
+    such as a different way to read or transform images.
+    See :doc:`/tutorials/data_loading` for details.
+
+    The callable currently does the following:
+
+    1. Read the image from "file_name"
+    2. Applies cropping/geometric transforms to the image and annotations
+    3. Prepare data and annotations to Tensor and :class:`Instances`
+    """
+
+    @configurable
+    def __init__(
+        self,
+        is_train: bool,
+        *,
+        augmentations: List[Union[T.Augmentation, T.Transform]],
+        image_format: str,
+        use_instance_mask: bool = False,
+        use_keypoint: bool = False,
+        instance_mask_format: str = "polygon",
+        keypoint_hflip_indices: Optional[np.ndarray] = None,
+        precomputed_proposal_topk: Optional[int] = None,
+        recompute_boxes: bool = False,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            is_train: whether it's used in training or inference
+            augmentations: a list of augmentations or deterministic transforms to apply
+            image_format: an image format supported by :func:`detection_utils.read_image`.
+            use_instance_mask: whether to process instance segmentation annotations, if available
+            use_keypoint: whether to process keypoint annotations if available
+            instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation
+                masks into this format.
+            keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices`
+            precomputed_proposal_topk: if given, will load pre-computed
+                proposals from dataset_dict and keep the top k proposals for each image.
+            recompute_boxes: whether to overwrite bounding box annotations
+                by computing tight bounding boxes from instance mask annotations.
+        """
+        if recompute_boxes:
+            assert use_instance_mask, "recompute_boxes requires instance masks"
+        # fmt: off
+        self.is_train               = is_train
+        self.augmentations          = T.AugmentationList(augmentations)
+        self.image_format           = image_format
+        self.use_instance_mask      = use_instance_mask
+        self.instance_mask_format   = instance_mask_format
+        self.use_keypoint           = use_keypoint
+        self.keypoint_hflip_indices = keypoint_hflip_indices
+        self.proposal_topk          = precomputed_proposal_topk
+        self.recompute_boxes        = recompute_boxes
+        # fmt: on
+        logger = logging.getLogger(__name__)
+        mode = "training" if is_train else "inference"
+        logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")
+
+    @classmethod
+    def from_config(cls, cfg, is_train: bool = True):
+        augs = utils.build_augmentation(cfg, is_train)
+        if cfg.INPUT.CROP.ENABLED and is_train:
+            augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))
+            recompute_boxes = cfg.MODEL.MASK_ON
+        else:
+            recompute_boxes = False
+
+        ret = {
+            "is_train": is_train,
+            "augmentations": augs,
+            "image_format": cfg.INPUT.FORMAT,
+            "use_instance_mask": cfg.MODEL.MASK_ON,
+            "instance_mask_format": cfg.INPUT.MASK_FORMAT,
+            "use_keypoint": cfg.MODEL.KEYPOINT_ON,
+            "recompute_boxes": recompute_boxes,
+        }
+
+        if cfg.MODEL.KEYPOINT_ON:
+            ret["keypoint_hflip_indices"] = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
+
+        if cfg.MODEL.LOAD_PROPOSALS:
+            ret["precomputed_proposal_topk"] = (
+                cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
+                if is_train
+                else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
+            )
+        return ret
+
+    def _transform_annotations(self, dataset_dict, transforms, image_shape):
+        # USER: Modify this if you want to keep them for some reason.
+        for anno in dataset_dict["annotations"]:
+            if not self.use_instance_mask:
+                anno.pop("segmentation", None)
+            if not self.use_keypoint:
+                anno.pop("keypoints", None)
+
+        # USER: Implement additional transformations if you have other types of data
+        annos = [
+            utils.transform_instance_annotations(
+                obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+            )
+            for obj in dataset_dict.pop("annotations")
+            if obj.get("iscrowd", 0) == 0
+        ]
+        instances = utils.annotations_to_instances(
+            annos, image_shape, mask_format=self.instance_mask_format
+        )
+
+        # After transforms such as cropping are applied, the bounding box may no longer
+        # tightly bound the object. As an example, imagine a triangle object
+        # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
+        # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
+        # the intersection of original bounding box and the cropping box.
+        if self.recompute_boxes:
+            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
+        dataset_dict["instances"] = utils.filter_empty_instances(instances)
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        # USER: Write your own image loading if it's not from a file
+        image = utils.read_image(dataset_dict["file_name"], format=self.image_format)
+        utils.check_image_size(dataset_dict, image)
+
+        # USER: Remove if you don't do semantic/panoptic segmentation.
+        if "sem_seg_file_name" in dataset_dict:
+            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2)
+        else:
+            sem_seg_gt = None
+
+        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
+        transforms = self.augmentations(aug_input)
+        image, sem_seg_gt = aug_input.image, aug_input.sem_seg
+
+        image_shape = image.shape[:2]  # h, w
+        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
+        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
+        # Therefore it's important to use torch.Tensor.
+        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
+        if sem_seg_gt is not None:
+            dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
+
+        # USER: Remove if you don't use pre-computed proposals.
+        # Most users would not need this feature.
+        if self.proposal_topk is not None:
+            utils.transform_proposals(
+                dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk
+            )
+
+        if not self.is_train:
+            # USER: Modify this if you want to keep them for some reason.
+            dataset_dict.pop("annotations", None)
+            dataset_dict.pop("sem_seg_file_name", None)
+            return dataset_dict
+
+        if "annotations" in dataset_dict:
+            self._transform_annotations(dataset_dict, transforms, image_shape)
+
+        return dataset_dict
diff --git a/detectron2/data/datasets/README.md b/detectron2/data/datasets/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9fb3e4f7afec17137c95c78be6ef06d520ec8032
--- /dev/null
+++ b/detectron2/data/datasets/README.md
@@ -0,0 +1,9 @@
+
+
+### Common Datasets
+
+The dataset implemented here do not need to load the data into the final format.
+It should provide the minimal data structure needed to use the dataset, so it can be very efficient.
+
+For example, for an image dataset, just provide the file names and labels, but don't read the images.
+Let the downstream decide how to read.
diff --git a/detectron2/data/datasets/__init__.py b/detectron2/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a44bedc15e5f0e762fc4d77efd6f1b07c6ff77d0
--- /dev/null
+++ b/detectron2/data/datasets/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .coco import load_coco_json, load_sem_seg, register_coco_instances, convert_to_coco_json
+from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated
+from .lvis import load_lvis_json, register_lvis_instances, get_lvis_instances_meta
+from .pascal_voc import load_voc_instances, register_pascal_voc
+from . import builtin as _builtin  # ensure the builtin datasets are registered
+
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/data/datasets/__pycache__/__init__.cpython-310.pyc b/detectron2/data/datasets/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc361d0ea923a324f930c205d1729e1f63ae9af6
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/__init__.cpython-39.pyc b/detectron2/data/datasets/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb8b7e8667660998079aee0fc57477108d7da8b6
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/builtin.cpython-310.pyc b/detectron2/data/datasets/__pycache__/builtin.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ecec45880ee0f8dca4c6adcf3987bb70b2b5ba5
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/builtin.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/builtin.cpython-39.pyc b/detectron2/data/datasets/__pycache__/builtin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3ffe2fd33f1bcf81600baca7da9f91bacf39f12
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/builtin.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/builtin_meta.cpython-310.pyc b/detectron2/data/datasets/__pycache__/builtin_meta.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f848dc7056db4eaa754a9b10a5ef413124ddf67
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/builtin_meta.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/builtin_meta.cpython-39.pyc b/detectron2/data/datasets/__pycache__/builtin_meta.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6d05ab51575f308529600f6f3817b0c10b22a3b
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/builtin_meta.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/cityscapes.cpython-310.pyc b/detectron2/data/datasets/__pycache__/cityscapes.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1f96bc76cb5ba94d950886157b6331f801b9c0c
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/cityscapes.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/cityscapes.cpython-39.pyc b/detectron2/data/datasets/__pycache__/cityscapes.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26e28b9e6f85aec52f61e855fdad3ff44a411728
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/cityscapes.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/cityscapes_panoptic.cpython-310.pyc b/detectron2/data/datasets/__pycache__/cityscapes_panoptic.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23a14e7e8c3548a2085d6e33195f41ff8e6abc4e
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/cityscapes_panoptic.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/cityscapes_panoptic.cpython-39.pyc b/detectron2/data/datasets/__pycache__/cityscapes_panoptic.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a350e36a5e584c198628508fdbab0dd4e773d5b
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/cityscapes_panoptic.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/coco.cpython-310.pyc b/detectron2/data/datasets/__pycache__/coco.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5b73017f2220c6558effed059a1182b24b21f3a
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/coco.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/coco.cpython-39.pyc b/detectron2/data/datasets/__pycache__/coco.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fb6f6c45ea4a0edc5da3c6ba78d5e2c96829ae2
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/coco.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/coco_panoptic.cpython-310.pyc b/detectron2/data/datasets/__pycache__/coco_panoptic.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae070b3ae1d961f14c5a68b0076a5686313df2b5
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/coco_panoptic.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/coco_panoptic.cpython-39.pyc b/detectron2/data/datasets/__pycache__/coco_panoptic.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c2deb09330fceb998ef1e837e55e3162ee3a2b4
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/coco_panoptic.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/lvis.cpython-310.pyc b/detectron2/data/datasets/__pycache__/lvis.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19aa6205401d7353cd46311e105ef914195a3801
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/lvis.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/lvis.cpython-39.pyc b/detectron2/data/datasets/__pycache__/lvis.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..696d675cb2ba572ccf5c1dfc1ebbc350b6a9f5b7
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/lvis.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/lvis_v0_5_categories.cpython-310.pyc b/detectron2/data/datasets/__pycache__/lvis_v0_5_categories.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34e6fdb2932c35b19f89214be5b435f32509c7ad
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/lvis_v0_5_categories.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/lvis_v0_5_categories.cpython-39.pyc b/detectron2/data/datasets/__pycache__/lvis_v0_5_categories.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89ffe29eb560a39159dd37160f28e306c02312b3
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/lvis_v0_5_categories.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/lvis_v1_categories.cpython-310.pyc b/detectron2/data/datasets/__pycache__/lvis_v1_categories.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6da72aa102721cf827a5987534abc150b295e2df
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/lvis_v1_categories.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/lvis_v1_categories.cpython-39.pyc b/detectron2/data/datasets/__pycache__/lvis_v1_categories.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c46813baf684e881682f92f58b25629a5d28059
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/lvis_v1_categories.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/lvis_v1_category_image_count.cpython-310.pyc b/detectron2/data/datasets/__pycache__/lvis_v1_category_image_count.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e5953be88f37f40c877eb29efd1b953ae5bc434
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/lvis_v1_category_image_count.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/lvis_v1_category_image_count.cpython-39.pyc b/detectron2/data/datasets/__pycache__/lvis_v1_category_image_count.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ead007b53e932ac83ca15fff38148c36ab734ed6
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/lvis_v1_category_image_count.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/pascal_voc.cpython-310.pyc b/detectron2/data/datasets/__pycache__/pascal_voc.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..640d93cd08caca303991584e93161efb55cf1274
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/pascal_voc.cpython-310.pyc differ
diff --git a/detectron2/data/datasets/__pycache__/pascal_voc.cpython-39.pyc b/detectron2/data/datasets/__pycache__/pascal_voc.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0d970ba347081f9a34e45a602c9124ab000a04e
Binary files /dev/null and b/detectron2/data/datasets/__pycache__/pascal_voc.cpython-39.pyc differ
diff --git a/detectron2/data/datasets/builtin.py b/detectron2/data/datasets/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3a68aa833f12f0fa324a269c36190f21b8a75bd
--- /dev/null
+++ b/detectron2/data/datasets/builtin.py
@@ -0,0 +1,259 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+
+"""
+This file registers pre-defined datasets at hard-coded paths, and their metadata.
+
+We hard-code metadata for common datasets. This will enable:
+1. Consistency check when loading the datasets
+2. Use models on these standard datasets directly and run demos,
+   without having to download the dataset annotations
+
+We hard-code some paths to the dataset that's assumed to
+exist in "./datasets/".
+
+Users SHOULD NOT use this file to create new dataset / metadata for new dataset.
+To add new dataset, refer to the tutorial "docs/DATASETS.md".
+"""
+
+import os
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+
+from .builtin_meta import ADE20K_SEM_SEG_CATEGORIES, _get_builtin_metadata
+from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic
+from .cityscapes_panoptic import register_all_cityscapes_panoptic
+from .coco import load_sem_seg, register_coco_instances
+from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated
+from .lvis import get_lvis_instances_meta, register_lvis_instances
+from .pascal_voc import register_pascal_voc
+
+# ==== Predefined datasets and splits for COCO ==========
+
+_PREDEFINED_SPLITS_COCO = {}
+_PREDEFINED_SPLITS_COCO["coco"] = {
+    "coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"),
+    "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"),
+    "coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"),
+    "coco_2014_valminusminival": (
+        "coco/val2014",
+        "coco/annotations/instances_valminusminival2014.json",
+    ),
+    "coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"),
+    "coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"),
+    "coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"),
+    "coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"),
+    "coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"),
+}
+
+_PREDEFINED_SPLITS_COCO["coco_person"] = {
+    "keypoints_coco_2014_train": (
+        "coco/train2014",
+        "coco/annotations/person_keypoints_train2014.json",
+    ),
+    "keypoints_coco_2014_val": ("coco/val2014", "coco/annotations/person_keypoints_val2014.json"),
+    "keypoints_coco_2014_minival": (
+        "coco/val2014",
+        "coco/annotations/person_keypoints_minival2014.json",
+    ),
+    "keypoints_coco_2014_valminusminival": (
+        "coco/val2014",
+        "coco/annotations/person_keypoints_valminusminival2014.json",
+    ),
+    "keypoints_coco_2017_train": (
+        "coco/train2017",
+        "coco/annotations/person_keypoints_train2017.json",
+    ),
+    "keypoints_coco_2017_val": ("coco/val2017", "coco/annotations/person_keypoints_val2017.json"),
+    "keypoints_coco_2017_val_100": (
+        "coco/val2017",
+        "coco/annotations/person_keypoints_val2017_100.json",
+    ),
+}
+
+
+_PREDEFINED_SPLITS_COCO_PANOPTIC = {
+    "coco_2017_train_panoptic": (
+        # This is the original panoptic annotation directory
+        "coco/panoptic_train2017",
+        "coco/annotations/panoptic_train2017.json",
+        # This directory contains semantic annotations that are
+        # converted from panoptic annotations.
+        # It is used by PanopticFPN.
+        # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
+        # to create these directories.
+        "coco/panoptic_stuff_train2017",
+    ),
+    "coco_2017_val_panoptic": (
+        "coco/panoptic_val2017",
+        "coco/annotations/panoptic_val2017.json",
+        "coco/panoptic_stuff_val2017",
+    ),
+    "coco_2017_val_100_panoptic": (
+        "coco/panoptic_val2017_100",
+        "coco/annotations/panoptic_val2017_100.json",
+        "coco/panoptic_stuff_val2017_100",
+    ),
+}
+
+
+def register_all_coco(root):
+    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items():
+        for key, (image_root, json_file) in splits_per_dataset.items():
+            # Assume pre-defined datasets live in `./datasets`.
+            register_coco_instances(
+                key,
+                _get_builtin_metadata(dataset_name),
+                os.path.join(root, json_file) if "://" not in json_file else json_file,
+                os.path.join(root, image_root),
+            )
+
+    for (
+        prefix,
+        (panoptic_root, panoptic_json, semantic_root),
+    ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
+        prefix_instances = prefix[: -len("_panoptic")]
+        instances_meta = MetadataCatalog.get(prefix_instances)
+        image_root, instances_json = instances_meta.image_root, instances_meta.json_file
+        # The "separated" version of COCO panoptic segmentation dataset,
+        # e.g. used by Panoptic FPN
+        register_coco_panoptic_separated(
+            prefix,
+            _get_builtin_metadata("coco_panoptic_separated"),
+            image_root,
+            os.path.join(root, panoptic_root),
+            os.path.join(root, panoptic_json),
+            os.path.join(root, semantic_root),
+            instances_json,
+        )
+        # The "standard" version of COCO panoptic segmentation dataset,
+        # e.g. used by Panoptic-DeepLab
+        register_coco_panoptic(
+            prefix,
+            _get_builtin_metadata("coco_panoptic_standard"),
+            image_root,
+            os.path.join(root, panoptic_root),
+            os.path.join(root, panoptic_json),
+            instances_json,
+        )
+
+
+# ==== Predefined datasets and splits for LVIS ==========
+
+
+_PREDEFINED_SPLITS_LVIS = {
+    "lvis_v1": {
+        "lvis_v1_train": ("coco/", "lvis/lvis_v1_train.json"),
+        "lvis_v1_val": ("coco/", "lvis/lvis_v1_val.json"),
+        "lvis_v1_test_dev": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"),
+        "lvis_v1_test_challenge": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"),
+    },
+    "lvis_v0.5": {
+        "lvis_v0.5_train": ("coco/", "lvis/lvis_v0.5_train.json"),
+        "lvis_v0.5_val": ("coco/", "lvis/lvis_v0.5_val.json"),
+        "lvis_v0.5_val_rand_100": ("coco/", "lvis/lvis_v0.5_val_rand_100.json"),
+        "lvis_v0.5_test": ("coco/", "lvis/lvis_v0.5_image_info_test.json"),
+    },
+    "lvis_v0.5_cocofied": {
+        "lvis_v0.5_train_cocofied": ("coco/", "lvis/lvis_v0.5_train_cocofied.json"),
+        "lvis_v0.5_val_cocofied": ("coco/", "lvis/lvis_v0.5_val_cocofied.json"),
+    },
+}
+
+
+def register_all_lvis(root):
+    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items():
+        for key, (image_root, json_file) in splits_per_dataset.items():
+            register_lvis_instances(
+                key,
+                get_lvis_instances_meta(dataset_name),
+                os.path.join(root, json_file) if "://" not in json_file else json_file,
+                os.path.join(root, image_root),
+            )
+
+
+# ==== Predefined splits for raw cityscapes images ===========
+_RAW_CITYSCAPES_SPLITS = {
+    "cityscapes_fine_{task}_train": ("cityscapes/leftImg8bit/train/", "cityscapes/gtFine/train/"),
+    "cityscapes_fine_{task}_val": ("cityscapes/leftImg8bit/val/", "cityscapes/gtFine/val/"),
+    "cityscapes_fine_{task}_test": ("cityscapes/leftImg8bit/test/", "cityscapes/gtFine/test/"),
+}
+
+
+def register_all_cityscapes(root):
+    for key, (image_dir, gt_dir) in _RAW_CITYSCAPES_SPLITS.items():
+        meta = _get_builtin_metadata("cityscapes")
+        image_dir = os.path.join(root, image_dir)
+        gt_dir = os.path.join(root, gt_dir)
+
+        inst_key = key.format(task="instance_seg")
+        DatasetCatalog.register(
+            inst_key,
+            lambda x=image_dir, y=gt_dir: load_cityscapes_instances(
+                x, y, from_json=True, to_polygons=True
+            ),
+        )
+        MetadataCatalog.get(inst_key).set(
+            image_dir=image_dir, gt_dir=gt_dir, evaluator_type="cityscapes_instance", **meta
+        )
+
+        sem_key = key.format(task="sem_seg")
+        DatasetCatalog.register(
+            sem_key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y)
+        )
+        MetadataCatalog.get(sem_key).set(
+            image_dir=image_dir,
+            gt_dir=gt_dir,
+            evaluator_type="cityscapes_sem_seg",
+            ignore_label=255,
+            **meta,
+        )
+
+
+# ==== Predefined splits for PASCAL VOC ===========
+def register_all_pascal_voc(root):
+    SPLITS = [
+        ("voc_2007_trainval", "VOC2007", "trainval"),
+        ("voc_2007_train", "VOC2007", "train"),
+        ("voc_2007_val", "VOC2007", "val"),
+        ("voc_2007_test", "VOC2007", "test"),
+        ("voc_2012_trainval", "VOC2012", "trainval"),
+        ("voc_2012_train", "VOC2012", "train"),
+        ("voc_2012_val", "VOC2012", "val"),
+    ]
+    for name, dirname, split in SPLITS:
+        year = 2007 if "2007" in name else 2012
+        register_pascal_voc(name, os.path.join(root, dirname), split, year)
+        MetadataCatalog.get(name).evaluator_type = "pascal_voc"
+
+
+def register_all_ade20k(root):
+    root = os.path.join(root, "ADEChallengeData2016")
+    for name, dirname in [("train", "training"), ("val", "validation")]:
+        image_dir = os.path.join(root, "images", dirname)
+        gt_dir = os.path.join(root, "annotations_detectron2", dirname)
+        name = f"ade20k_sem_seg_{name}"
+        DatasetCatalog.register(
+            name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg")
+        )
+        MetadataCatalog.get(name).set(
+            stuff_classes=ADE20K_SEM_SEG_CATEGORIES[:],
+            image_root=image_dir,
+            sem_seg_root=gt_dir,
+            evaluator_type="sem_seg",
+            ignore_label=255,
+        )
+
+
+# True for open source;
+# Internally at fb, we register them elsewhere
+if __name__.endswith(".builtin"):
+    # Assume pre-defined datasets live in `./datasets`.
+    _root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets"))
+    register_all_coco(_root)
+    register_all_lvis(_root)
+    register_all_cityscapes(_root)
+    register_all_cityscapes_panoptic(_root)
+    register_all_pascal_voc(_root)
+    register_all_ade20k(_root)
diff --git a/detectron2/data/datasets/builtin_meta.py b/detectron2/data/datasets/builtin_meta.py
new file mode 100644
index 0000000000000000000000000000000000000000..63c7a1a31b31dd89b82011effee26471faccacf5
--- /dev/null
+++ b/detectron2/data/datasets/builtin_meta.py
@@ -0,0 +1,350 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+Note:
+For your custom dataset, there is no need to hard-code metadata anywhere in the code.
+For example, for COCO-format dataset, metadata will be obtained automatically
+when calling `load_coco_json`. For other dataset, metadata may also be obtained in other ways
+during loading.
+
+However, we hard-coded metadata for a few common dataset here.
+The only goal is to allow users who don't have these dataset to use pre-trained models.
+Users don't have to download a COCO json (which contains metadata), in order to visualize a
+COCO model (with correct class names and colors).
+"""
+
+
+# All coco categories, together with their nice-looking visualization colors
+# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json
+COCO_CATEGORIES = [
+    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
+    {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
+    {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
+    {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
+    {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
+    {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
+    {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
+    {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
+    {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
+    {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
+    {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
+    {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
+    {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
+    {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
+    {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
+    {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
+    {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
+    {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
+    {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
+    {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
+    {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
+    {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
+    {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
+    {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
+    {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
+    {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
+    {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
+    {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
+    {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
+    {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
+    {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
+    {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
+    {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
+    {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
+    {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
+    {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
+    {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
+    {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
+    {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
+    {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
+    {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
+    {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
+    {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
+    {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
+    {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
+    {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
+    {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
+    {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
+    {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
+    {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
+    {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
+    {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
+    {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
+    {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
+    {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
+    {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
+    {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
+    {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
+    {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
+    {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
+    {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
+    {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
+    {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
+    {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
+    {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
+    {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
+    {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
+    {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
+    {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
+    {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
+    {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
+    {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
+    {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
+    {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
+    {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
+    {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
+    {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
+    {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
+    {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
+    {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
+    {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"},
+    {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"},
+    {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"},
+    {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"},
+    {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"},
+    {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"},
+    {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"},
+    {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"},
+    {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"},
+    {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"},
+    {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"},
+    {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"},
+    {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"},
+    {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"},
+    {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"},
+    {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"},
+    {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"},
+    {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"},
+    {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"},
+    {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"},
+    {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"},
+    {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"},
+    {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"},
+    {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"},
+    {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"},
+    {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"},
+    {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"},
+    {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"},
+    {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"},
+    {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"},
+    {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"},
+    {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"},
+    {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"},
+    {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"},
+    {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"},
+    {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"},
+    {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"},
+    {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"},
+    {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"},
+    {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"},
+    {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"},
+    {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"},
+    {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"},
+    {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"},
+    {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"},
+    {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"},
+    {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"},
+    {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"},
+    {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"},
+    {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"},
+    {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"},
+    {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"},
+    {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},
+]
+
+# fmt: off
+COCO_PERSON_KEYPOINT_NAMES = (
+    "nose",
+    "left_eye", "right_eye",
+    "left_ear", "right_ear",
+    "left_shoulder", "right_shoulder",
+    "left_elbow", "right_elbow",
+    "left_wrist", "right_wrist",
+    "left_hip", "right_hip",
+    "left_knee", "right_knee",
+    "left_ankle", "right_ankle",
+)
+# fmt: on
+
+# Pairs of keypoints that should be exchanged under horizontal flipping
+COCO_PERSON_KEYPOINT_FLIP_MAP = (
+    ("left_eye", "right_eye"),
+    ("left_ear", "right_ear"),
+    ("left_shoulder", "right_shoulder"),
+    ("left_elbow", "right_elbow"),
+    ("left_wrist", "right_wrist"),
+    ("left_hip", "right_hip"),
+    ("left_knee", "right_knee"),
+    ("left_ankle", "right_ankle"),
+)
+
+# rules for pairs of keypoints to draw a line between, and the line color to use.
+KEYPOINT_CONNECTION_RULES = [
+    # face
+    ("left_ear", "left_eye", (102, 204, 255)),
+    ("right_ear", "right_eye", (51, 153, 255)),
+    ("left_eye", "nose", (102, 0, 204)),
+    ("nose", "right_eye", (51, 102, 255)),
+    # upper-body
+    ("left_shoulder", "right_shoulder", (255, 128, 0)),
+    ("left_shoulder", "left_elbow", (153, 255, 204)),
+    ("right_shoulder", "right_elbow", (128, 229, 255)),
+    ("left_elbow", "left_wrist", (153, 255, 153)),
+    ("right_elbow", "right_wrist", (102, 255, 224)),
+    # lower-body
+    ("left_hip", "right_hip", (255, 102, 0)),
+    ("left_hip", "left_knee", (255, 255, 77)),
+    ("right_hip", "right_knee", (153, 255, 204)),
+    ("left_knee", "left_ankle", (191, 255, 128)),
+    ("right_knee", "right_ankle", (255, 195, 77)),
+]
+
+# All Cityscapes categories, together with their nice-looking visualization colors
+# It's from https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/helpers/labels.py  # noqa
+CITYSCAPES_CATEGORIES = [
+    {"color": (128, 64, 128), "isthing": 0, "id": 7, "trainId": 0, "name": "road"},
+    {"color": (244, 35, 232), "isthing": 0, "id": 8, "trainId": 1, "name": "sidewalk"},
+    {"color": (70, 70, 70), "isthing": 0, "id": 11, "trainId": 2, "name": "building"},
+    {"color": (102, 102, 156), "isthing": 0, "id": 12, "trainId": 3, "name": "wall"},
+    {"color": (190, 153, 153), "isthing": 0, "id": 13, "trainId": 4, "name": "fence"},
+    {"color": (153, 153, 153), "isthing": 0, "id": 17, "trainId": 5, "name": "pole"},
+    {"color": (250, 170, 30), "isthing": 0, "id": 19, "trainId": 6, "name": "traffic light"},
+    {"color": (220, 220, 0), "isthing": 0, "id": 20, "trainId": 7, "name": "traffic sign"},
+    {"color": (107, 142, 35), "isthing": 0, "id": 21, "trainId": 8, "name": "vegetation"},
+    {"color": (152, 251, 152), "isthing": 0, "id": 22, "trainId": 9, "name": "terrain"},
+    {"color": (70, 130, 180), "isthing": 0, "id": 23, "trainId": 10, "name": "sky"},
+    {"color": (220, 20, 60), "isthing": 1, "id": 24, "trainId": 11, "name": "person"},
+    {"color": (255, 0, 0), "isthing": 1, "id": 25, "trainId": 12, "name": "rider"},
+    {"color": (0, 0, 142), "isthing": 1, "id": 26, "trainId": 13, "name": "car"},
+    {"color": (0, 0, 70), "isthing": 1, "id": 27, "trainId": 14, "name": "truck"},
+    {"color": (0, 60, 100), "isthing": 1, "id": 28, "trainId": 15, "name": "bus"},
+    {"color": (0, 80, 100), "isthing": 1, "id": 31, "trainId": 16, "name": "train"},
+    {"color": (0, 0, 230), "isthing": 1, "id": 32, "trainId": 17, "name": "motorcycle"},
+    {"color": (119, 11, 32), "isthing": 1, "id": 33, "trainId": 18, "name": "bicycle"},
+]
+
+# fmt: off
+ADE20K_SEM_SEG_CATEGORIES = [
+    "wall", "building", "sky", "floor", "tree", "ceiling", "road, route", "bed", "window ", "grass", "cabinet", "sidewalk, pavement", "person", "earth, ground", "door", "table", "mountain, mount", "plant", "curtain", "chair", "car", "water", "painting, picture", "sofa", "shelf", "house", "sea", "mirror", "rug", "field", "armchair", "seat", "fence", "desk", "rock, stone", "wardrobe, closet, press", "lamp", "tub", "rail", "cushion", "base, pedestal, stand", "box", "column, pillar", "signboard, sign", "chest of drawers, chest, bureau, dresser", "counter", "sand", "sink", "skyscraper", "fireplace", "refrigerator, icebox", "grandstand, covered stand", "path", "stairs", "runway", "case, display case, showcase, vitrine", "pool table, billiard table, snooker table", "pillow", "screen door, screen", "stairway, staircase", "river", "bridge, span", "bookcase", "blind, screen", "coffee table", "toilet, can, commode, crapper, pot, potty, stool, throne", "flower", "book", "hill", "bench", "countertop", "stove", "palm, palm tree", "kitchen island", "computer", "swivel chair", "boat", "bar", "arcade machine", "hovel, hut, hutch, shack, shanty", "bus", "towel", "light", "truck", "tower", "chandelier", "awning, sunshade, sunblind", "street lamp", "booth", "tv", "plane", "dirt track", "clothes", "pole", "land, ground, soil", "bannister, banister, balustrade, balusters, handrail", "escalator, moving staircase, moving stairway", "ottoman, pouf, pouffe, puff, hassock", "bottle", "buffet, counter, sideboard", "poster, posting, placard, notice, bill, card", "stage", "van", "ship", "fountain", "conveyer belt, conveyor belt, conveyer, conveyor, transporter", "canopy", "washer, automatic washer, washing machine", "plaything, toy", "pool", "stool", "barrel, cask", "basket, handbasket", "falls", "tent", "bag", "minibike, motorbike", "cradle", "oven", "ball", "food, solid food", "step, stair", "tank, storage tank", "trade name", "microwave", "pot", "animal", "bicycle", "lake", "dishwasher", "screen", "blanket, cover", "sculpture", "hood, exhaust hood", "sconce", "vase", "traffic light", "tray", "trash can", "fan", "pier", "crt screen", "plate", "monitor", "bulletin board", "shower", "radiator", "glass, drinking glass", "clock", "flag", # noqa
+]
+# After processed by `prepare_ade20k_sem_seg.py`, id 255 means ignore
+# fmt: on
+
+
+def _get_coco_instances_meta():
+    thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+    thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+    assert len(thing_ids) == 80, len(thing_ids)
+    # Mapping from the incontiguous COCO category id to an id in [0, 79]
+    thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
+    thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+    ret = {
+        "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
+        "thing_classes": thing_classes,
+        "thing_colors": thing_colors,
+    }
+    return ret
+
+
+def _get_coco_panoptic_separated_meta():
+    """
+    Returns metadata for "separated" version of the panoptic segmentation dataset.
+    """
+    stuff_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 0]
+    assert len(stuff_ids) == 53, len(stuff_ids)
+
+    # For semantic segmentation, this mapping maps from contiguous stuff id
+    # (in [0, 53], used in models) to ids in the dataset (used for processing results)
+    # The id 0 is mapped to an extra category "thing".
+    stuff_dataset_id_to_contiguous_id = {k: i + 1 for i, k in enumerate(stuff_ids)}
+    # When converting COCO panoptic annotations to semantic annotations
+    # We label the "thing" category to 0
+    stuff_dataset_id_to_contiguous_id[0] = 0
+
+    # 54 names for COCO stuff categories (including "things")
+    stuff_classes = ["things"] + [
+        k["name"].replace("-other", "").replace("-merged", "")
+        for k in COCO_CATEGORIES
+        if k["isthing"] == 0
+    ]
+
+    # NOTE: I randomly picked a color for things
+    stuff_colors = [[82, 18, 128]] + [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 0]
+    ret = {
+        "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
+        "stuff_classes": stuff_classes,
+        "stuff_colors": stuff_colors,
+    }
+    ret.update(_get_coco_instances_meta())
+    return ret
+
+
+def _get_builtin_metadata(dataset_name):
+    if dataset_name == "coco":
+        return _get_coco_instances_meta()
+    if dataset_name == "coco_panoptic_separated":
+        return _get_coco_panoptic_separated_meta()
+    elif dataset_name == "coco_panoptic_standard":
+        meta = {}
+        # The following metadata maps contiguous id from [0, #thing categories +
+        # #stuff categories) to their names and colors. We have to replica of the
+        # same name and color under "thing_*" and "stuff_*" because the current
+        # visualization function in D2 handles thing and class classes differently
+        # due to some heuristic used in Panoptic FPN. We keep the same naming to
+        # enable reusing existing visualization functions.
+        thing_classes = [k["name"] for k in COCO_CATEGORIES]
+        thing_colors = [k["color"] for k in COCO_CATEGORIES]
+        stuff_classes = [k["name"] for k in COCO_CATEGORIES]
+        stuff_colors = [k["color"] for k in COCO_CATEGORIES]
+
+        meta["thing_classes"] = thing_classes
+        meta["thing_colors"] = thing_colors
+        meta["stuff_classes"] = stuff_classes
+        meta["stuff_colors"] = stuff_colors
+
+        # Convert category id for training:
+        #   category id: like semantic segmentation, it is the class id for each
+        #   pixel. Since there are some classes not used in evaluation, the category
+        #   id is not always contiguous and thus we have two set of category ids:
+        #       - original category id: category id in the original dataset, mainly
+        #           used for evaluation.
+        #       - contiguous category id: [0, #classes), in order to train the linear
+        #           softmax classifier.
+        thing_dataset_id_to_contiguous_id = {}
+        stuff_dataset_id_to_contiguous_id = {}
+
+        for i, cat in enumerate(COCO_CATEGORIES):
+            if cat["isthing"]:
+                thing_dataset_id_to_contiguous_id[cat["id"]] = i
+            else:
+                stuff_dataset_id_to_contiguous_id[cat["id"]] = i
+
+        meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
+        meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
+
+        return meta
+    elif dataset_name == "coco_person":
+        return {
+            "thing_classes": ["person"],
+            "keypoint_names": COCO_PERSON_KEYPOINT_NAMES,
+            "keypoint_flip_map": COCO_PERSON_KEYPOINT_FLIP_MAP,
+            "keypoint_connection_rules": KEYPOINT_CONNECTION_RULES,
+        }
+    elif dataset_name == "cityscapes":
+        # fmt: off
+        CITYSCAPES_THING_CLASSES = [
+            "person", "rider", "car", "truck",
+            "bus", "train", "motorcycle", "bicycle",
+        ]
+        CITYSCAPES_STUFF_CLASSES = [
+            "road", "sidewalk", "building", "wall", "fence", "pole", "traffic light",
+            "traffic sign", "vegetation", "terrain", "sky", "person", "rider", "car",
+            "truck", "bus", "train", "motorcycle", "bicycle",
+        ]
+        # fmt: on
+        return {
+            "thing_classes": CITYSCAPES_THING_CLASSES,
+            "stuff_classes": CITYSCAPES_STUFF_CLASSES,
+        }
+    raise KeyError("No built-in metadata for dataset {}".format(dataset_name))
diff --git a/detectron2/data/datasets/cityscapes.py b/detectron2/data/datasets/cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a82256725f5f7152751e102bfe6ca6410f7f983
--- /dev/null
+++ b/detectron2/data/datasets/cityscapes.py
@@ -0,0 +1,345 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import functools
+import json
+import logging
+import multiprocessing as mp
+import os
+from itertools import chain
+
+import numpy as np
+import pycocotools.mask as mask_util
+
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import get_world_size
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import setup_logger
+from PIL import Image
+
+try:
+    import cv2  # noqa
+except ImportError:
+    # OpenCV is an optional dependency at the moment
+    pass
+
+
+logger = logging.getLogger(__name__)
+
+
+def _get_cityscapes_files(image_dir, gt_dir):
+    files = []
+    # scan through the directory
+    cities = PathManager.ls(image_dir)
+    logger.info(f"{len(cities)} cities found in '{image_dir}'.")
+    for city in cities:
+        city_img_dir = os.path.join(image_dir, city)
+        city_gt_dir = os.path.join(gt_dir, city)
+        for basename in PathManager.ls(city_img_dir):
+            image_file = os.path.join(city_img_dir, basename)
+
+            suffix = "leftImg8bit.png"
+            assert basename.endswith(suffix), basename
+            basename = basename[: -len(suffix)]
+
+            instance_file = os.path.join(
+                city_gt_dir, basename + "gtFine_instanceIds.png"
+            )
+            label_file = os.path.join(city_gt_dir, basename + "gtFine_labelIds.png")
+            json_file = os.path.join(city_gt_dir, basename + "gtFine_polygons.json")
+
+            files.append((image_file, instance_file, label_file, json_file))
+    assert len(files), "No images found in {}".format(image_dir)
+    for f in files[0]:
+        assert PathManager.isfile(f), f
+    return files
+
+
+def load_cityscapes_instances(image_dir, gt_dir, from_json=True, to_polygons=True):
+    """
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+        gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
+        from_json (bool): whether to read annotations from the raw json file or the png files.
+        to_polygons (bool): whether to represent the segmentation as polygons
+            (COCO's format) instead of masks (cityscapes's format).
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ )
+    """
+    if from_json:
+        assert to_polygons, (
+            "Cityscapes's json annotations are in polygon format. "
+            "Converting to mask format is not supported now."
+        )
+    files = _get_cityscapes_files(image_dir, gt_dir)
+
+    logger.info("Preprocessing cityscapes annotations ...")
+    # This is still not fast: all workers will execute duplicate works and will
+    # take up to 10m on a 8GPU server.
+    pool = mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4))
+
+    ret = pool.map(
+        functools.partial(
+            _cityscapes_files_to_dict, from_json=from_json, to_polygons=to_polygons
+        ),
+        files,
+    )
+    logger.info("Loaded {} images from {}".format(len(ret), image_dir))
+
+    # Map cityscape ids to contiguous ids
+    from cityscapesscripts.helpers.labels import labels
+
+    labels = [l for l in labels if l.hasInstances and not l.ignoreInEval]
+    dataset_id_to_contiguous_id = {l.id: idx for idx, l in enumerate(labels)}
+    for dict_per_image in ret:
+        for anno in dict_per_image["annotations"]:
+            anno["category_id"] = dataset_id_to_contiguous_id[anno["category_id"]]
+    return ret
+
+
+def load_cityscapes_semantic(image_dir, gt_dir):
+    """
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+        gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
+
+    Returns:
+        list[dict]: a list of dict, each has "file_name" and
+            "sem_seg_file_name".
+    """
+    ret = []
+    # gt_dir is small and contain many small files. make sense to fetch to local first
+    gt_dir = PathManager.get_local_path(gt_dir)
+    for image_file, _, label_file, json_file in _get_cityscapes_files(
+        image_dir, gt_dir
+    ):
+        label_file = label_file.replace("labelIds", "labelTrainIds")
+
+        with PathManager.open(json_file, "r") as f:
+            jsonobj = json.load(f)
+        ret.append(
+            {
+                "file_name": image_file,
+                "sem_seg_file_name": label_file,
+                "height": jsonobj["imgHeight"],
+                "width": jsonobj["imgWidth"],
+            }
+        )
+    assert len(ret), f"No images found in {image_dir}!"
+    assert PathManager.isfile(
+        ret[0]["sem_seg_file_name"]
+    ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py"  # noqa
+    return ret
+
+
+def _cityscapes_files_to_dict(files, from_json, to_polygons):
+    """
+    Parse cityscapes annotation files to a instance segmentation dataset dict.
+
+    Args:
+        files (tuple): consists of (image_file, instance_id_file, label_id_file, json_file)
+        from_json (bool): whether to read annotations from the raw json file or the png files.
+        to_polygons (bool): whether to represent the segmentation as polygons
+            (COCO's format) instead of masks (cityscapes's format).
+
+    Returns:
+        A dict in Detectron2 Dataset format.
+    """
+    from cityscapesscripts.helpers.labels import id2label, name2label
+
+    image_file, instance_id_file, _, json_file = files
+
+    annos = []
+
+    if from_json:
+        from shapely.geometry import MultiPolygon, Polygon
+
+        with PathManager.open(json_file, "r") as f:
+            jsonobj = json.load(f)
+        ret = {
+            "file_name": image_file,
+            "image_id": os.path.basename(image_file),
+            "height": jsonobj["imgHeight"],
+            "width": jsonobj["imgWidth"],
+        }
+
+        # `polygons_union` contains the union of all valid polygons.
+        polygons_union = Polygon()
+
+        # CityscapesScripts draw the polygons in sequential order
+        # and each polygon *overwrites* existing ones. See
+        # (https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/preparation/json2instanceImg.py) # noqa
+        # We use reverse order, and each polygon *avoids* early ones.
+        # This will resolve the ploygon overlaps in the same way as CityscapesScripts.
+        for obj in jsonobj["objects"][::-1]:
+            if "deleted" in obj:  # cityscapes data format specific
+                continue
+            label_name = obj["label"]
+
+            try:
+                label = name2label[label_name]
+            except KeyError:
+                if label_name.endswith("group"):  # crowd area
+                    label = name2label[label_name[: -len("group")]]
+                else:
+                    raise
+            if label.id < 0:  # cityscapes data format
+                continue
+
+            # Cityscapes's raw annotations uses integer coordinates
+            # Therefore +0.5 here
+            poly_coord = np.asarray(obj["polygon"], dtype="f4") + 0.5
+            # CityscapesScript uses PIL.ImageDraw.polygon to rasterize
+            # polygons for evaluation. This function operates in integer space
+            # and draws each pixel whose center falls into the polygon.
+            # Therefore it draws a polygon which is 0.5 "fatter" in expectation.
+            # We therefore dilate the input polygon by 0.5 as our input.
+            poly = Polygon(poly_coord).buffer(0.5, resolution=4)
+
+            if not label.hasInstances or label.ignoreInEval:
+                # even if we won't store the polygon it still contributes to overlaps resolution
+                polygons_union = polygons_union.union(poly)
+                continue
+
+            # Take non-overlapping part of the polygon
+            poly_wo_overlaps = poly.difference(polygons_union)
+            if poly_wo_overlaps.is_empty:
+                continue
+            polygons_union = polygons_union.union(poly)
+
+            anno = {}
+            anno["iscrowd"] = label_name.endswith("group")
+            anno["category_id"] = label.id
+
+            if isinstance(poly_wo_overlaps, Polygon):
+                poly_list = [poly_wo_overlaps]
+            elif isinstance(poly_wo_overlaps, MultiPolygon):
+                poly_list = poly_wo_overlaps.geoms
+            else:
+                raise NotImplementedError(
+                    "Unknown geometric structure {}".format(poly_wo_overlaps)
+                )
+
+            poly_coord = []
+            for poly_el in poly_list:
+                # COCO API can work only with exterior boundaries now, hence we store only them.
+                # TODO: store both exterior and interior boundaries once other parts of the
+                # codebase support holes in polygons.
+                poly_coord.append(list(chain(*poly_el.exterior.coords)))
+            anno["segmentation"] = poly_coord
+            (xmin, ymin, xmax, ymax) = poly_wo_overlaps.bounds
+
+            anno["bbox"] = (xmin, ymin, xmax, ymax)
+            anno["bbox_mode"] = BoxMode.XYXY_ABS
+
+            annos.append(anno)
+    else:
+        # See also the official annotation parsing scripts at
+        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/instances2dict.py  # noqa
+        with PathManager.open(instance_id_file, "rb") as f:
+            inst_image = np.asarray(Image.open(f), order="F")
+        # ids < 24 are stuff labels (filtering them first is about 5% faster)
+        flattened_ids = np.unique(inst_image[inst_image >= 24])
+
+        ret = {
+            "file_name": image_file,
+            "image_id": os.path.basename(image_file),
+            "height": inst_image.shape[0],
+            "width": inst_image.shape[1],
+        }
+
+        for instance_id in flattened_ids:
+            # For non-crowd annotations, instance_id // 1000 is the label_id
+            # Crowd annotations have <1000 instance ids
+            label_id = instance_id // 1000 if instance_id >= 1000 else instance_id
+            label = id2label[label_id]
+            if not label.hasInstances or label.ignoreInEval:
+                continue
+
+            anno = {}
+            anno["iscrowd"] = instance_id < 1000
+            anno["category_id"] = label.id
+
+            mask = np.asarray(inst_image == instance_id, dtype=np.uint8, order="F")
+
+            inds = np.nonzero(mask)
+            ymin, ymax = inds[0].min(), inds[0].max()
+            xmin, xmax = inds[1].min(), inds[1].max()
+            anno["bbox"] = (xmin, ymin, xmax, ymax)
+            if xmax <= xmin or ymax <= ymin:
+                continue
+            anno["bbox_mode"] = BoxMode.XYXY_ABS
+            if to_polygons:
+                # This conversion comes from D4809743 and D5171122,
+                # when Mask-RCNN was first developed.
+                contours = cv2.findContours(
+                    mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
+                )[-2]
+                polygons = [c.reshape(-1).tolist() for c in contours if len(c) >= 3]
+                # opencv's can produce invalid polygons
+                if len(polygons) == 0:
+                    continue
+                anno["segmentation"] = polygons
+            else:
+                anno["segmentation"] = mask_util.encode(mask[:, :, None])[0]
+            annos.append(anno)
+    ret["annotations"] = annos
+    return ret
+
+
+def main() -> None:
+    global logger, labels
+    """
+    Test the cityscapes dataset loader.
+
+    Usage:
+        python -m detectron2.data.datasets.cityscapes \
+            cityscapes/leftImg8bit/train cityscapes/gtFine/train
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("image_dir")
+    parser.add_argument("gt_dir")
+    parser.add_argument("--type", choices=["instance", "semantic"], default="instance")
+    args = parser.parse_args()
+    from cityscapesscripts.helpers.labels import labels
+    from detectron2.data.catalog import Metadata
+    from detectron2.utils.visualizer import Visualizer
+
+    logger = setup_logger(name=__name__)
+
+    dirname = "cityscapes-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+
+    if args.type == "instance":
+        dicts = load_cityscapes_instances(
+            args.image_dir, args.gt_dir, from_json=True, to_polygons=True
+        )
+        logger.info("Done loading {} samples.".format(len(dicts)))
+
+        thing_classes = [
+            k.name for k in labels if k.hasInstances and not k.ignoreInEval
+        ]
+        meta = Metadata().set(thing_classes=thing_classes)
+
+    else:
+        dicts = load_cityscapes_semantic(args.image_dir, args.gt_dir)
+        logger.info("Done loading {} samples.".format(len(dicts)))
+
+        stuff_classes = [k.name for k in labels if k.trainId != 255]
+        stuff_colors = [k.color for k in labels if k.trainId != 255]
+        meta = Metadata().set(stuff_classes=stuff_classes, stuff_colors=stuff_colors)
+
+    for d in dicts:
+        img = np.array(Image.open(PathManager.open(d["file_name"], "rb")))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        # cv2.imshow("a", vis.get_image()[:, :, ::-1])
+        # cv2.waitKey()
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
+
+
+if __name__ == "__main__":
+    main()  # pragma: no cover
diff --git a/detectron2/data/datasets/cityscapes_panoptic.py b/detectron2/data/datasets/cityscapes_panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..48c136f1623261b079591065fec7c7fc38165076
--- /dev/null
+++ b/detectron2/data/datasets/cityscapes_panoptic.py
@@ -0,0 +1,187 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import json
+import logging
+import os
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.data.datasets.builtin_meta import CITYSCAPES_CATEGORIES
+from detectron2.utils.file_io import PathManager
+
+"""
+This file contains functions to register the Cityscapes panoptic dataset to the DatasetCatalog.
+"""
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_cityscapes_panoptic_files(image_dir, gt_dir, json_info):
+    files = []
+    # scan through the directory
+    cities = PathManager.ls(image_dir)
+    logger.info(f"{len(cities)} cities found in '{image_dir}'.")
+    image_dict = {}
+    for city in cities:
+        city_img_dir = os.path.join(image_dir, city)
+        for basename in PathManager.ls(city_img_dir):
+            image_file = os.path.join(city_img_dir, basename)
+
+            suffix = "_leftImg8bit.png"
+            assert basename.endswith(suffix), basename
+            basename = os.path.basename(basename)[: -len(suffix)]
+
+            image_dict[basename] = image_file
+
+    for ann in json_info["annotations"]:
+        image_file = image_dict.get(ann["image_id"], None)
+        assert image_file is not None, "No image {} found for annotation {}".format(
+            ann["image_id"], ann["file_name"]
+        )
+        label_file = os.path.join(gt_dir, ann["file_name"])
+        segments_info = ann["segments_info"]
+
+        files.append((image_file, label_file, segments_info))
+
+    assert len(files), "No images found in {}".format(image_dir)
+    assert PathManager.isfile(files[0][0]), files[0][0]
+    assert PathManager.isfile(files[0][1]), files[0][1]
+    return files
+
+
+def load_cityscapes_panoptic(image_dir, gt_dir, gt_json, meta):
+    """
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+        gt_dir (str): path to the raw annotations. e.g.,
+            "~/cityscapes/gtFine/cityscapes_panoptic_train".
+        gt_json (str): path to the json file. e.g.,
+            "~/cityscapes/gtFine/cityscapes_panoptic_train.json".
+        meta (dict): dictionary containing "thing_dataset_id_to_contiguous_id"
+            and "stuff_dataset_id_to_contiguous_id" to map category ids to
+            contiguous ids for training.
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ )
+    """
+
+    def _convert_category_id(segment_info, meta):
+        if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
+            segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
+                segment_info["category_id"]
+            ]
+        else:
+            segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
+                segment_info["category_id"]
+            ]
+        return segment_info
+
+    assert os.path.exists(
+        gt_json
+    ), "Please run `python cityscapesscripts/preparation/createPanopticImgs.py` to generate label files."  # noqa
+    with open(gt_json) as f:
+        json_info = json.load(f)
+    files = get_cityscapes_panoptic_files(image_dir, gt_dir, json_info)
+    ret = []
+    for image_file, label_file, segments_info in files:
+        sem_label_file = (
+            image_file.replace("leftImg8bit", "gtFine").split(".")[0] + "_labelTrainIds.png"
+        )
+        segments_info = [_convert_category_id(x, meta) for x in segments_info]
+        ret.append(
+            {
+                "file_name": image_file,
+                "image_id": "_".join(
+                    os.path.splitext(os.path.basename(image_file))[0].split("_")[:3]
+                ),
+                "sem_seg_file_name": sem_label_file,
+                "pan_seg_file_name": label_file,
+                "segments_info": segments_info,
+            }
+        )
+    assert len(ret), f"No images found in {image_dir}!"
+    assert PathManager.isfile(
+        ret[0]["sem_seg_file_name"]
+    ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py"  # noqa
+    assert PathManager.isfile(
+        ret[0]["pan_seg_file_name"]
+    ), "Please generate panoptic annotation with python cityscapesscripts/preparation/createPanopticImgs.py"  # noqa
+    return ret
+
+
+_RAW_CITYSCAPES_PANOPTIC_SPLITS = {
+    "cityscapes_fine_panoptic_train": (
+        "cityscapes/leftImg8bit/train",
+        "cityscapes/gtFine/cityscapes_panoptic_train",
+        "cityscapes/gtFine/cityscapes_panoptic_train.json",
+    ),
+    "cityscapes_fine_panoptic_val": (
+        "cityscapes/leftImg8bit/val",
+        "cityscapes/gtFine/cityscapes_panoptic_val",
+        "cityscapes/gtFine/cityscapes_panoptic_val.json",
+    ),
+    # "cityscapes_fine_panoptic_test": not supported yet
+}
+
+
+def register_all_cityscapes_panoptic(root):
+    meta = {}
+    # The following metadata maps contiguous id from [0, #thing categories +
+    # #stuff categories) to their names and colors. We have to replica of the
+    # same name and color under "thing_*" and "stuff_*" because the current
+    # visualization function in D2 handles thing and class classes differently
+    # due to some heuristic used in Panoptic FPN. We keep the same naming to
+    # enable reusing existing visualization functions.
+    thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES]
+    thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES]
+    stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES]
+    stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES]
+
+    meta["thing_classes"] = thing_classes
+    meta["thing_colors"] = thing_colors
+    meta["stuff_classes"] = stuff_classes
+    meta["stuff_colors"] = stuff_colors
+
+    # There are three types of ids in cityscapes panoptic segmentation:
+    # (1) category id: like semantic segmentation, it is the class id for each
+    #   pixel. Since there are some classes not used in evaluation, the category
+    #   id is not always contiguous and thus we have two set of category ids:
+    #       - original category id: category id in the original dataset, mainly
+    #           used for evaluation.
+    #       - contiguous category id: [0, #classes), in order to train the classifier
+    # (2) instance id: this id is used to differentiate different instances from
+    #   the same category. For "stuff" classes, the instance id is always 0; for
+    #   "thing" classes, the instance id starts from 1 and 0 is reserved for
+    #   ignored instances (e.g. crowd annotation).
+    # (3) panoptic id: this is the compact id that encode both category and
+    #   instance id by: category_id * 1000 + instance_id.
+    thing_dataset_id_to_contiguous_id = {}
+    stuff_dataset_id_to_contiguous_id = {}
+
+    for k in CITYSCAPES_CATEGORIES:
+        if k["isthing"] == 1:
+            thing_dataset_id_to_contiguous_id[k["id"]] = k["trainId"]
+        else:
+            stuff_dataset_id_to_contiguous_id[k["id"]] = k["trainId"]
+
+    meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
+    meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
+
+    for key, (image_dir, gt_dir, gt_json) in _RAW_CITYSCAPES_PANOPTIC_SPLITS.items():
+        image_dir = os.path.join(root, image_dir)
+        gt_dir = os.path.join(root, gt_dir)
+        gt_json = os.path.join(root, gt_json)
+
+        DatasetCatalog.register(
+            key, lambda x=image_dir, y=gt_dir, z=gt_json: load_cityscapes_panoptic(x, y, z, meta)
+        )
+        MetadataCatalog.get(key).set(
+            panoptic_root=gt_dir,
+            image_root=image_dir,
+            panoptic_json=gt_json,
+            gt_dir=gt_dir.replace("cityscapes_panoptic_", ""),
+            evaluator_type="cityscapes_panoptic_seg",
+            ignore_label=255,
+            label_divisor=1000,
+            **meta,
+        )
diff --git a/detectron2/data/datasets/coco.py b/detectron2/data/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b88f7da36edd50bb8c5618eb49cd971447f255d
--- /dev/null
+++ b/detectron2/data/datasets/coco.py
@@ -0,0 +1,586 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import contextlib
+import datetime
+import io
+import json
+import logging
+import os
+import shutil
+
+import numpy as np
+import pycocotools.mask as mask_util
+
+from detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes
+from detectron2.utils.file_io import PathManager
+from fvcore.common.timer import Timer
+from iopath.common.file_io import file_lock
+from PIL import Image
+
+from .. import DatasetCatalog, MetadataCatalog
+
+"""
+This file contains functions to parse COCO-format annotations into dicts in "Detectron2 format".
+"""
+
+
+logger = logging.getLogger(__name__)
+
+__all__ = [
+    "load_coco_json",
+    "load_sem_seg",
+    "convert_to_coco_json",
+    "register_coco_instances",
+]
+
+
+def load_coco_json(
+    json_file, image_root, dataset_name=None, extra_annotation_keys=None
+):
+    """
+    Load a json file with COCO's instances annotation format.
+    Currently supports instance detection, instance segmentation,
+    and person keypoints annotations.
+
+    Args:
+        json_file (str): full path to the json file in COCO instances annotation format.
+        image_root (str or path-like): the directory where the images in this json file exists.
+        dataset_name (str or None): the name of the dataset (e.g., coco_2017_train).
+            When provided, this function will also do the following:
+
+            * Put "thing_classes" into the metadata associated with this dataset.
+            * Map the category ids into a contiguous range (needed by standard dataset format),
+              and add "thing_dataset_id_to_contiguous_id" to the metadata associated
+              with this dataset.
+
+            This option should usually be provided, unless users need to load
+            the original json content and apply more processing manually.
+        extra_annotation_keys (list[str]): list of per-annotation keys that should also be
+            loaded into the dataset dict (besides "iscrowd", "bbox", "keypoints",
+            "category_id", "segmentation"). The values for these keys will be returned as-is.
+            For example, the densepose annotations are loaded in this way.
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard dataset dicts format (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ ) when `dataset_name` is not None.
+        If `dataset_name` is None, the returned `category_ids` may be
+        incontiguous and may not conform to the Detectron2 standard format.
+
+    Notes:
+        1. This function does not read the image files.
+           The results do not have the "image" field.
+    """
+    from pycocotools.coco import COCO
+
+    timer = Timer()
+    json_file = PathManager.get_local_path(json_file)
+    with contextlib.redirect_stdout(io.StringIO()):
+        coco_api = COCO(json_file)
+    if timer.seconds() > 1:
+        logger.info(
+            "Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())
+        )
+
+    id_map = None
+    if dataset_name is not None:
+        meta = MetadataCatalog.get(dataset_name)
+        cat_ids = sorted(coco_api.getCatIds())
+        cats = coco_api.loadCats(cat_ids)
+        # The categories in a custom json file may not be sorted.
+        thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
+        meta.thing_classes = thing_classes
+
+        # In COCO, certain category ids are artificially removed,
+        # and by convention they are always ignored.
+        # We deal with COCO's id issue and translate
+        # the category ids to contiguous ids in [0, 80).
+
+        # It works by looking at the "categories" field in the json, therefore
+        # if users' own json also have incontiguous ids, we'll
+        # apply this mapping as well but print a warning.
+        if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)):
+            if "coco" not in dataset_name:
+                logger.warning(
+                    """
+Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
+"""
+                )
+        id_map = {v: i for i, v in enumerate(cat_ids)}
+        meta.thing_dataset_id_to_contiguous_id = id_map
+
+    # sort indices for reproducible results
+    img_ids = sorted(coco_api.imgs.keys())
+    # imgs is a list of dicts, each looks something like:
+    # {'license': 4,
+    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+    #  'file_name': 'COCO_val2014_000000001268.jpg',
+    #  'height': 427,
+    #  'width': 640,
+    #  'date_captured': '2013-11-17 05:57:24',
+    #  'id': 1268}
+    imgs = coco_api.loadImgs(img_ids)
+    # anns is a list[list[dict]], where each dict is an annotation
+    # record for an object. The inner list enumerates the objects in an image
+    # and the outer list enumerates over images. Example of anns[0]:
+    # [{'segmentation': [[192.81,
+    #     247.09,
+    #     ...
+    #     219.03,
+    #     249.06]],
+    #   'area': 1035.749,
+    #   'iscrowd': 0,
+    #   'image_id': 1268,
+    #   'bbox': [192.81, 224.8, 74.73, 33.43],
+    #   'category_id': 16,
+    #   'id': 42986},
+    #  ...]
+    anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
+    total_num_valid_anns = sum([len(x) for x in anns])
+    total_num_anns = len(coco_api.anns)
+    if total_num_valid_anns < total_num_anns:
+        logger.warning(
+            f"{json_file} contains {total_num_anns} annotations, but only "
+            f"{total_num_valid_anns} of them match to images in the file."
+        )
+
+    if "minival" not in json_file:
+        # The popular valminusminival & minival annotations for COCO2014 contain this bug.
+        # However the ratio of buggy annotations there is tiny and does not affect accuracy.
+        # Therefore we explicitly white-list them.
+        ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+        assert len(set(ann_ids)) == len(
+            ann_ids
+        ), "Annotation ids in '{}' are not unique!".format(json_file)
+
+    imgs_anns = list(zip(imgs, anns))
+    logger.info(
+        "Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file)
+    )
+
+    dataset_dicts = []
+
+    ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + (
+        extra_annotation_keys or []
+    )
+
+    num_instances_without_valid_segmentation = 0
+
+    for (img_dict, anno_dict_list) in imgs_anns:
+        record = {}
+        record["file_name"] = os.path.join(image_root, img_dict["file_name"])
+        record["height"] = img_dict["height"]
+        record["width"] = img_dict["width"]
+        image_id = record["image_id"] = img_dict["id"]
+
+        objs = []
+        for anno in anno_dict_list:
+            # Check that the image_id in this annotation is the same as
+            # the image_id we're looking at.
+            # This fails only when the data parsing logic or the annotation file is buggy.
+
+            # The original COCO valminusminival2014 & minival2014 annotation files
+            # actually contains bugs that, together with certain ways of using COCO API,
+            # can trigger this assertion.
+            assert anno["image_id"] == image_id
+
+            assert (
+                anno.get("ignore", 0) == 0
+            ), '"ignore" in COCO json file is not supported.'
+
+            obj = {key: anno[key] for key in ann_keys if key in anno}
+            if "bbox" in obj and len(obj["bbox"]) == 0:
+                raise ValueError(
+                    f"One annotation of image {image_id} contains empty 'bbox' value! "
+                    "This json does not have valid COCO format."
+                )
+
+            segm = anno.get("segmentation", None)
+            if segm:  # either list[list[float]] or dict(RLE)
+                if isinstance(segm, dict):
+                    if isinstance(segm["counts"], list):
+                        # convert to compressed RLE
+                        segm = mask_util.frPyObjects(segm, *segm["size"])
+                else:
+                    # filter out invalid polygons (< 3 points)
+                    segm = [
+                        poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6
+                    ]
+                    if len(segm) == 0:
+                        num_instances_without_valid_segmentation += 1
+                        continue  # ignore this instance
+                obj["segmentation"] = segm
+
+            keypts = anno.get("keypoints", None)
+            if keypts:  # list[int]
+                for idx, v in enumerate(keypts):
+                    if idx % 3 != 2:
+                        # COCO's segmentation coordinates are floating points in [0, H or W],
+                        # but keypoint coordinates are integers in [0, H-1 or W-1]
+                        # Therefore we assume the coordinates are "pixel indices" and
+                        # add 0.5 to convert to floating point coordinates.
+                        keypts[idx] = v + 0.5
+                obj["keypoints"] = keypts
+
+            obj["bbox_mode"] = BoxMode.XYWH_ABS
+            if id_map:
+                annotation_category_id = obj["category_id"]
+                try:
+                    obj["category_id"] = id_map[annotation_category_id]
+                except KeyError as e:
+                    raise KeyError(
+                        f"Encountered category_id={annotation_category_id} "
+                        "but this id does not exist in 'categories' of the json file."
+                    ) from e
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+
+    if num_instances_without_valid_segmentation > 0:
+        logger.warning(
+            "Filtered out {} instances without valid segmentation. ".format(
+                num_instances_without_valid_segmentation
+            )
+            + "There might be issues in your dataset generation process.  Please "
+            "check https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html carefully"
+        )
+    return dataset_dicts
+
+
+def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
+    """
+    Load semantic segmentation datasets. All files under "gt_root" with "gt_ext" extension are
+    treated as ground truth annotations and all files under "image_root" with "image_ext" extension
+    as input images. Ground truth and input images are matched using file paths relative to
+    "gt_root" and "image_root" respectively without taking into account file extensions.
+    This works for COCO as well as some other datasets.
+
+    Args:
+        gt_root (str): full path to ground truth semantic segmentation files. Semantic segmentation
+            annotations are stored as images with integer values in pixels that represent
+            corresponding semantic labels.
+        image_root (str): the directory where the input images are.
+        gt_ext (str): file extension for ground truth annotations.
+        image_ext (str): file extension for input images.
+
+    Returns:
+        list[dict]:
+            a list of dicts in detectron2 standard format without instance-level
+            annotation.
+
+    Notes:
+        1. This function does not read the image and ground truth files.
+           The results do not have the "image" and "sem_seg" fields.
+    """
+
+    # We match input images with ground truth based on their relative filepaths (without file
+    # extensions) starting from 'image_root' and 'gt_root' respectively.
+    def file2id(folder_path, file_path):
+        # extract relative path starting from `folder_path`
+        image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
+        # remove file extension
+        image_id = os.path.splitext(image_id)[0]
+        return image_id
+
+    input_files = sorted(
+        (
+            os.path.join(image_root, f)
+            for f in PathManager.ls(image_root)
+            if f.endswith(image_ext)
+        ),
+        key=lambda file_path: file2id(image_root, file_path),
+    )
+    gt_files = sorted(
+        (
+            os.path.join(gt_root, f)
+            for f in PathManager.ls(gt_root)
+            if f.endswith(gt_ext)
+        ),
+        key=lambda file_path: file2id(gt_root, file_path),
+    )
+
+    assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)
+
+    # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
+    if len(input_files) != len(gt_files):
+        logger.warn(
+            "Directory {} and {} has {} and {} files, respectively.".format(
+                image_root, gt_root, len(input_files), len(gt_files)
+            )
+        )
+        input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
+        gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
+        intersect = list(set(input_basenames) & set(gt_basenames))
+        # sort, otherwise each worker may obtain a list[dict] in different order
+        intersect = sorted(intersect)
+        logger.warn("Will use their intersection of {} files.".format(len(intersect)))
+        input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
+        gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]
+
+    logger.info(
+        "Loaded {} images with semantic segmentation from {}".format(
+            len(input_files), image_root
+        )
+    )
+
+    dataset_dicts = []
+    for (img_path, gt_path) in zip(input_files, gt_files):
+        record = {}
+        record["file_name"] = img_path
+        record["sem_seg_file_name"] = gt_path
+        dataset_dicts.append(record)
+
+    return dataset_dicts
+
+
+def convert_to_coco_dict(dataset_name):
+    """
+    Convert an instance detection/segmentation or keypoint detection dataset
+    in detectron2's standard format into COCO json format.
+
+    Generic dataset description can be found here:
+    https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset
+
+    COCO data format description can be found here:
+    http://cocodataset.org/#format-data
+
+    Args:
+        dataset_name (str):
+            name of the source dataset
+            Must be registered in DatastCatalog and in detectron2's standard format.
+            Must have corresponding metadata "thing_classes"
+    Returns:
+        coco_dict: serializable dict in COCO json format
+    """
+
+    dataset_dicts = DatasetCatalog.get(dataset_name)
+    metadata = MetadataCatalog.get(dataset_name)
+
+    # unmap the category mapping ids for COCO
+    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
+        reverse_id_mapping = {
+            v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()
+        }
+        reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id]  # noqa
+    else:
+        reverse_id_mapper = lambda contiguous_id: contiguous_id  # noqa
+
+    categories = [
+        {"id": reverse_id_mapper(id), "name": name}
+        for id, name in enumerate(metadata.thing_classes)
+    ]
+
+    logger.info("Converting dataset dicts into COCO format")
+    coco_images = []
+    coco_annotations = []
+
+    for image_id, image_dict in enumerate(dataset_dicts):
+        coco_image = {
+            "id": image_dict.get("image_id", image_id),
+            "width": int(image_dict["width"]),
+            "height": int(image_dict["height"]),
+            "file_name": str(image_dict["file_name"]),
+        }
+        coco_images.append(coco_image)
+
+        anns_per_image = image_dict.get("annotations", [])
+        for annotation in anns_per_image:
+            # create a new dict with only COCO fields
+            coco_annotation = {}
+
+            # COCO requirement: XYWH box format for axis-align and XYWHA for rotated
+            bbox = annotation["bbox"]
+            if isinstance(bbox, np.ndarray):
+                if bbox.ndim != 1:
+                    raise ValueError(
+                        f"bbox has to be 1-dimensional. Got shape={bbox.shape}."
+                    )
+                bbox = bbox.tolist()
+            if len(bbox) not in [4, 5]:
+                raise ValueError(f"bbox has to has length 4 or 5. Got {bbox}.")
+            from_bbox_mode = annotation["bbox_mode"]
+            to_bbox_mode = BoxMode.XYWH_ABS if len(bbox) == 4 else BoxMode.XYWHA_ABS
+            bbox = BoxMode.convert(bbox, from_bbox_mode, to_bbox_mode)
+
+            # COCO requirement: instance area
+            if "segmentation" in annotation:
+                # Computing areas for instances by counting the pixels
+                segmentation = annotation["segmentation"]
+                # TODO: check segmentation type: RLE, BinaryMask or Polygon
+                if isinstance(segmentation, list):
+                    polygons = PolygonMasks([segmentation])
+                    area = polygons.area()[0].item()
+                elif isinstance(segmentation, dict):  # RLE
+                    area = mask_util.area(segmentation).item()
+                else:
+                    raise TypeError(f"Unknown segmentation type {type(segmentation)}!")
+            else:
+                # Computing areas using bounding boxes
+                if to_bbox_mode == BoxMode.XYWH_ABS:
+                    bbox_xy = BoxMode.convert(bbox, to_bbox_mode, BoxMode.XYXY_ABS)
+                    area = Boxes([bbox_xy]).area()[0].item()
+                else:
+                    area = RotatedBoxes([bbox]).area()[0].item()
+
+            if "keypoints" in annotation:
+                keypoints = annotation["keypoints"]  # list[int]
+                for idx, v in enumerate(keypoints):
+                    if idx % 3 != 2:
+                        # COCO's segmentation coordinates are floating points in [0, H or W],
+                        # but keypoint coordinates are integers in [0, H-1 or W-1]
+                        # For COCO format consistency we substract 0.5
+                        # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
+                        keypoints[idx] = v - 0.5
+                if "num_keypoints" in annotation:
+                    num_keypoints = annotation["num_keypoints"]
+                else:
+                    num_keypoints = sum(kp > 0 for kp in keypoints[2::3])
+
+            # COCO requirement:
+            #   linking annotations to images
+            #   "id" field must start with 1
+            coco_annotation["id"] = len(coco_annotations) + 1
+            coco_annotation["image_id"] = coco_image["id"]
+            coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
+            coco_annotation["area"] = float(area)
+            coco_annotation["iscrowd"] = int(annotation.get("iscrowd", 0))
+            coco_annotation["category_id"] = int(
+                reverse_id_mapper(annotation["category_id"])
+            )
+
+            # Add optional fields
+            if "keypoints" in annotation:
+                coco_annotation["keypoints"] = keypoints
+                coco_annotation["num_keypoints"] = num_keypoints
+
+            if "segmentation" in annotation:
+                seg = coco_annotation["segmentation"] = annotation["segmentation"]
+                if isinstance(seg, dict):  # RLE
+                    counts = seg["counts"]
+                    if not isinstance(counts, str):
+                        # make it json-serializable
+                        seg["counts"] = counts.decode("ascii")
+
+            coco_annotations.append(coco_annotation)
+
+    logger.info(
+        "Conversion finished, "
+        f"#images: {len(coco_images)}, #annotations: {len(coco_annotations)}"
+    )
+
+    info = {
+        "date_created": str(datetime.datetime.now()),
+        "description": "Automatically generated COCO json file for Detectron2.",
+    }
+    coco_dict = {
+        "info": info,
+        "images": coco_images,
+        "categories": categories,
+        "licenses": None,
+    }
+    if len(coco_annotations) > 0:
+        coco_dict["annotations"] = coco_annotations
+    return coco_dict
+
+
+def convert_to_coco_json(dataset_name, output_file, allow_cached=True):
+    """
+    Converts dataset into COCO format and saves it to a json file.
+    dataset_name must be registered in DatasetCatalog and in detectron2's standard format.
+
+    Args:
+        dataset_name:
+            reference from the config file to the catalogs
+            must be registered in DatasetCatalog and in detectron2's standard format
+        output_file: path of json file that will be saved to
+        allow_cached: if json file is already present then skip conversion
+    """
+
+    # TODO: The dataset or the conversion script *may* change,
+    # a checksum would be useful for validating the cached data
+
+    PathManager.mkdirs(os.path.dirname(output_file))
+    with file_lock(output_file):
+        if PathManager.exists(output_file) and allow_cached:
+            logger.warning(
+                f"Using previously cached COCO format annotations at '{output_file}'. "
+                "You need to clear the cache file if your dataset has been modified."
+            )
+        else:
+            logger.info(
+                f"Converting annotations of dataset '{dataset_name}' to COCO format ...)"
+            )
+            coco_dict = convert_to_coco_dict(dataset_name)
+
+            logger.info(f"Caching COCO format annotations at '{output_file}' ...")
+            tmp_file = output_file + ".tmp"
+            with PathManager.open(tmp_file, "w") as f:
+                json.dump(coco_dict, f)
+            shutil.move(tmp_file, output_file)
+
+
+def register_coco_instances(name, metadata, json_file, image_root):
+    """
+    Register a dataset in COCO's json annotation format for
+    instance detection, instance segmentation and keypoint detection.
+    (i.e., Type 1 and 2 in http://cocodataset.org/#format-data.
+    `instances*.json` and `person_keypoints*.json` in the dataset).
+
+    This is an example of how to register a new dataset.
+    You can do something similar to this function, to register new datasets.
+
+    Args:
+        name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+        metadata (dict): extra metadata associated with this dataset.  You can
+            leave it as an empty dict.
+        json_file (str): path to the json instance annotation file.
+        image_root (str or path-like): directory which contains all the images.
+    """
+    assert isinstance(name, str), name
+    assert isinstance(json_file, (str, os.PathLike)), json_file
+    assert isinstance(image_root, (str, os.PathLike)), image_root
+    # 1. register a function which returns dicts
+    DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name))
+
+    # 2. Optionally, add metadata about this dataset,
+    # since they might be useful in evaluation, visualization or logging
+    MetadataCatalog.get(name).set(
+        json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata
+    )
+
+
+def main() -> None:
+    global logger
+    """
+    Test the COCO json dataset loader.
+
+    Usage:
+        python -m detectron2.data.datasets.coco \
+            path/to/json path/to/image_root dataset_name
+
+        "dataset_name" can be "coco_2014_minival_100", or other
+        pre-registered ones
+    """
+    import sys
+
+    import detectron2.data.datasets  # noqa  # add pre-defined metadata
+    from detectron2.utils.logger import setup_logger
+    from detectron2.utils.visualizer import Visualizer
+
+    logger = setup_logger(name=__name__)
+    assert sys.argv[3] in DatasetCatalog.list()
+    meta = MetadataCatalog.get(sys.argv[3])
+
+    dicts = load_coco_json(sys.argv[1], sys.argv[2], sys.argv[3])
+    logger.info("Done loading {} samples.".format(len(dicts)))
+
+    dirname = "coco-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts:
+        img = np.array(Image.open(d["file_name"]))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
+
+
+if __name__ == "__main__":
+    main()  # pragma: no cover
diff --git a/detectron2/data/datasets/coco_panoptic.py b/detectron2/data/datasets/coco_panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8dae44317b556610d7fed39017e082d7e855956
--- /dev/null
+++ b/detectron2/data/datasets/coco_panoptic.py
@@ -0,0 +1,228 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import json
+import os
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.utils.file_io import PathManager
+
+from .coco import load_coco_json, load_sem_seg
+
+__all__ = ["register_coco_panoptic", "register_coco_panoptic_separated"]
+
+
+def load_coco_panoptic_json(json_file, image_dir, gt_dir, meta):
+    """
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
+        gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
+        json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ )
+    """
+
+    def _convert_category_id(segment_info, meta):
+        if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
+            segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
+                segment_info["category_id"]
+            ]
+            segment_info["isthing"] = True
+        else:
+            segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
+                segment_info["category_id"]
+            ]
+            segment_info["isthing"] = False
+        return segment_info
+
+    with PathManager.open(json_file) as f:
+        json_info = json.load(f)
+
+    ret = []
+    for ann in json_info["annotations"]:
+        image_id = int(ann["image_id"])
+        # TODO: currently we assume image and label has the same filename but
+        # different extension, and images have extension ".jpg" for COCO. Need
+        # to make image extension a user-provided argument if we extend this
+        # function to support other COCO-like datasets.
+        image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg")
+        label_file = os.path.join(gt_dir, ann["file_name"])
+        segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]]
+        ret.append(
+            {
+                "file_name": image_file,
+                "image_id": image_id,
+                "pan_seg_file_name": label_file,
+                "segments_info": segments_info,
+            }
+        )
+    assert len(ret), f"No images found in {image_dir}!"
+    assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
+    assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"]
+    return ret
+
+
+def register_coco_panoptic(
+    name, metadata, image_root, panoptic_root, panoptic_json, instances_json=None
+):
+    """
+    Register a "standard" version of COCO panoptic segmentation dataset named `name`.
+    The dictionaries in this registered dataset follows detectron2's standard format.
+    Hence it's called "standard".
+
+    Args:
+        name (str): the name that identifies a dataset,
+            e.g. "coco_2017_train_panoptic"
+        metadata (dict): extra metadata associated with this dataset.
+        image_root (str): directory which contains all the images
+        panoptic_root (str): directory which contains panoptic annotation images in COCO format
+        panoptic_json (str): path to the json panoptic annotation file in COCO format
+        sem_seg_root (none): not used, to be consistent with
+            `register_coco_panoptic_separated`.
+        instances_json (str): path to the json instance annotation file
+    """
+    panoptic_name = name
+    DatasetCatalog.register(
+        panoptic_name,
+        lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, metadata),
+    )
+    MetadataCatalog.get(panoptic_name).set(
+        panoptic_root=panoptic_root,
+        image_root=image_root,
+        panoptic_json=panoptic_json,
+        json_file=instances_json,
+        evaluator_type="coco_panoptic_seg",
+        ignore_label=255,
+        label_divisor=1000,
+        **metadata,
+    )
+
+
+def register_coco_panoptic_separated(
+    name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
+):
+    """
+    Register a "separated" version of COCO panoptic segmentation dataset named `name`.
+    The annotations in this registered dataset will contain both instance annotations and
+    semantic annotations, each with its own contiguous ids. Hence it's called "separated".
+
+    It follows the setting used by the PanopticFPN paper:
+
+    1. The instance annotations directly come from polygons in the COCO
+       instances annotation task, rather than from the masks in the COCO panoptic annotations.
+
+       The two format have small differences:
+       Polygons in the instance annotations may have overlaps.
+       The mask annotations are produced by labeling the overlapped polygons
+       with depth ordering.
+
+    2. The semantic annotations are converted from panoptic annotations, where
+       all "things" are assigned a semantic id of 0.
+       All semantic categories will therefore have ids in contiguous
+       range [1, #stuff_categories].
+
+    This function will also register a pure semantic segmentation dataset
+    named ``name + '_stuffonly'``.
+
+    Args:
+        name (str): the name that identifies a dataset,
+            e.g. "coco_2017_train_panoptic"
+        metadata (dict): extra metadata associated with this dataset.
+        image_root (str): directory which contains all the images
+        panoptic_root (str): directory which contains panoptic annotation images
+        panoptic_json (str): path to the json panoptic annotation file
+        sem_seg_root (str): directory which contains all the ground truth segmentation annotations.
+        instances_json (str): path to the json instance annotation file
+    """
+    panoptic_name = name + "_separated"
+    DatasetCatalog.register(
+        panoptic_name,
+        lambda: merge_to_panoptic(
+            load_coco_json(instances_json, image_root, panoptic_name),
+            load_sem_seg(sem_seg_root, image_root),
+        ),
+    )
+    MetadataCatalog.get(panoptic_name).set(
+        panoptic_root=panoptic_root,
+        image_root=image_root,
+        panoptic_json=panoptic_json,
+        sem_seg_root=sem_seg_root,
+        json_file=instances_json,  # TODO rename
+        evaluator_type="coco_panoptic_seg",
+        ignore_label=255,
+        **metadata,
+    )
+
+    semantic_name = name + "_stuffonly"
+    DatasetCatalog.register(semantic_name, lambda: load_sem_seg(sem_seg_root, image_root))
+    MetadataCatalog.get(semantic_name).set(
+        sem_seg_root=sem_seg_root,
+        image_root=image_root,
+        evaluator_type="sem_seg",
+        ignore_label=255,
+        **metadata,
+    )
+
+
+def merge_to_panoptic(detection_dicts, sem_seg_dicts):
+    """
+    Create dataset dicts for panoptic segmentation, by
+    merging two dicts using "file_name" field to match their entries.
+
+    Args:
+        detection_dicts (list[dict]): lists of dicts for object detection or instance segmentation.
+        sem_seg_dicts (list[dict]): lists of dicts for semantic segmentation.
+
+    Returns:
+        list[dict] (one per input image): Each dict contains all (key, value) pairs from dicts in
+            both detection_dicts and sem_seg_dicts that correspond to the same image.
+            The function assumes that the same key in different dicts has the same value.
+    """
+    results = []
+    sem_seg_file_to_entry = {x["file_name"]: x for x in sem_seg_dicts}
+    assert len(sem_seg_file_to_entry) > 0
+
+    for det_dict in detection_dicts:
+        dic = copy.copy(det_dict)
+        dic.update(sem_seg_file_to_entry[dic["file_name"]])
+        results.append(dic)
+    return results
+
+
+if __name__ == "__main__":
+    """
+    Test the COCO panoptic dataset loader.
+
+    Usage:
+        python -m detectron2.data.datasets.coco_panoptic \
+            path/to/image_root path/to/panoptic_root path/to/panoptic_json dataset_name 10
+
+        "dataset_name" can be "coco_2017_train_panoptic", or other
+        pre-registered ones
+    """
+    from detectron2.utils.logger import setup_logger
+    from detectron2.utils.visualizer import Visualizer
+    import detectron2.data.datasets  # noqa # add pre-defined metadata
+    import sys
+    from PIL import Image
+    import numpy as np
+
+    logger = setup_logger(name=__name__)
+    assert sys.argv[4] in DatasetCatalog.list()
+    meta = MetadataCatalog.get(sys.argv[4])
+
+    dicts = load_coco_panoptic_json(sys.argv[3], sys.argv[1], sys.argv[2], meta.as_dict())
+    logger.info("Done loading {} samples.".format(len(dicts)))
+
+    dirname = "coco-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+    num_imgs_to_vis = int(sys.argv[5])
+    for i, d in enumerate(dicts):
+        img = np.array(Image.open(d["file_name"]))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
+        if i + 1 >= num_imgs_to_vis:
+            break
diff --git a/detectron2/data/datasets/lvis.py b/detectron2/data/datasets/lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a28463428a5a9a0311647bd39063f73e6abc0b4
--- /dev/null
+++ b/detectron2/data/datasets/lvis.py
@@ -0,0 +1,268 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import os
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from detectron2.utils.file_io import PathManager
+from fvcore.common.timer import Timer
+
+from .builtin_meta import _get_coco_instances_meta
+from .lvis_v0_5_categories import LVIS_CATEGORIES as LVIS_V0_5_CATEGORIES
+from .lvis_v1_categories import LVIS_CATEGORIES as LVIS_V1_CATEGORIES
+from .lvis_v1_category_image_count import (
+    LVIS_CATEGORY_IMAGE_COUNT as LVIS_V1_CATEGORY_IMAGE_COUNT,
+)
+
+"""
+This file contains functions to parse LVIS-format annotations into dicts in the
+"Detectron2 format".
+"""
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["load_lvis_json", "register_lvis_instances", "get_lvis_instances_meta"]
+
+
+def register_lvis_instances(name, metadata, json_file, image_root):
+    """
+    Register a dataset in LVIS's json annotation format for instance detection and segmentation.
+
+    Args:
+        name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train".
+        metadata (dict): extra metadata associated with this dataset. It can be an empty dict.
+        json_file (str): path to the json instance annotation file.
+        image_root (str or path-like): directory which contains all the images.
+    """
+    DatasetCatalog.register(name, lambda: load_lvis_json(json_file, image_root, name))
+    MetadataCatalog.get(name).set(
+        json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata
+    )
+
+
+def load_lvis_json(
+    json_file, image_root, dataset_name=None, extra_annotation_keys=None
+):
+    """
+    Load a json file in LVIS's annotation format.
+
+    Args:
+        json_file (str): full path to the LVIS json annotation file.
+        image_root (str): the directory where the images in this json file exists.
+        dataset_name (str): the name of the dataset (e.g., "lvis_v0.5_train").
+            If provided, this function will put "thing_classes" into the metadata
+            associated with this dataset.
+        extra_annotation_keys (list[str]): list of per-annotation keys that should also be
+            loaded into the dataset dict (besides "bbox", "bbox_mode", "category_id",
+            "segmentation"). The values for these keys will be returned as-is.
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ )
+
+    Notes:
+        1. This function does not read the image files.
+           The results do not have the "image" field.
+    """
+    from lvis import LVIS
+
+    json_file = PathManager.get_local_path(json_file)
+
+    timer = Timer()
+    lvis_api = LVIS(json_file)
+    if timer.seconds() > 1:
+        logger.info(
+            "Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())
+        )
+
+    if dataset_name is not None:
+        meta = get_lvis_instances_meta(dataset_name)
+        MetadataCatalog.get(dataset_name).set(**meta)
+
+    # sort indices for reproducible results
+    img_ids = sorted(lvis_api.imgs.keys())
+    # imgs is a list of dicts, each looks something like:
+    # {'license': 4,
+    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+    #  'file_name': 'COCO_val2014_000000001268.jpg',
+    #  'height': 427,
+    #  'width': 640,
+    #  'date_captured': '2013-11-17 05:57:24',
+    #  'id': 1268}
+    imgs = lvis_api.load_imgs(img_ids)
+    # anns is a list[list[dict]], where each dict is an annotation
+    # record for an object. The inner list enumerates the objects in an image
+    # and the outer list enumerates over images. Example of anns[0]:
+    # [{'segmentation': [[192.81,
+    #     247.09,
+    #     ...
+    #     219.03,
+    #     249.06]],
+    #   'area': 1035.749,
+    #   'image_id': 1268,
+    #   'bbox': [192.81, 224.8, 74.73, 33.43],
+    #   'category_id': 16,
+    #   'id': 42986},
+    #  ...]
+    anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]
+
+    # Sanity check that each annotation has a unique id
+    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+    assert len(set(ann_ids)) == len(
+        ann_ids
+    ), "Annotation ids in '{}' are not unique".format(json_file)
+
+    imgs_anns = list(zip(imgs, anns))
+
+    logger.info(
+        "Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file)
+    )
+
+    if extra_annotation_keys:
+        logger.info(
+            "The following extra annotation keys will be loaded: {} ".format(
+                extra_annotation_keys
+            )
+        )
+    else:
+        extra_annotation_keys = []
+
+    def get_file_name(img_root, img_dict):
+        # Determine the path including the split folder ("train2017", "val2017", "test2017") from
+        # the coco_url field. Example:
+        #   'coco_url': 'http://images.cocodataset.org/train2017/000000155379.jpg'
+        split_folder, file_name = img_dict["coco_url"].split("/")[-2:]
+        return os.path.join(img_root + split_folder, file_name)
+
+    dataset_dicts = []
+
+    for (img_dict, anno_dict_list) in imgs_anns:
+        record = {}
+        record["file_name"] = get_file_name(image_root, img_dict)
+        record["height"] = img_dict["height"]
+        record["width"] = img_dict["width"]
+        record["not_exhaustive_category_ids"] = img_dict.get(
+            "not_exhaustive_category_ids", []
+        )
+        record["neg_category_ids"] = img_dict.get("neg_category_ids", [])
+        image_id = record["image_id"] = img_dict["id"]
+
+        objs = []
+        for anno in anno_dict_list:
+            # Check that the image_id in this annotation is the same as
+            # the image_id we're looking at.
+            # This fails only when the data parsing logic or the annotation file is buggy.
+            assert anno["image_id"] == image_id
+            obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS}
+            # LVIS data loader can be used to load COCO dataset categories. In this case `meta`
+            # variable will have a field with COCO-specific category mapping.
+            if dataset_name is not None and "thing_dataset_id_to_contiguous_id" in meta:
+                obj["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
+                    anno["category_id"]
+                ]
+            else:
+                obj["category_id"] = (
+                    anno["category_id"] - 1
+                )  # Convert 1-indexed to 0-indexed
+            segm = anno["segmentation"]  # list[list[float]]
+            # filter out invalid polygons (< 3 points)
+            valid_segm = [
+                poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6
+            ]
+            assert len(segm) == len(
+                valid_segm
+            ), "Annotation contains an invalid polygon with < 3 points"
+            assert len(segm) > 0
+            obj["segmentation"] = segm
+            for extra_ann_key in extra_annotation_keys:
+                obj[extra_ann_key] = anno[extra_ann_key]
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+
+    return dataset_dicts
+
+
+def get_lvis_instances_meta(dataset_name):
+    """
+    Load LVIS metadata.
+
+    Args:
+        dataset_name (str): LVIS dataset name without the split name (e.g., "lvis_v0.5").
+
+    Returns:
+        dict: LVIS metadata with keys: thing_classes
+    """
+    if "cocofied" in dataset_name:
+        return _get_coco_instances_meta()
+    if "v0.5" in dataset_name:
+        return _get_lvis_instances_meta_v0_5()
+    elif "v1" in dataset_name:
+        return _get_lvis_instances_meta_v1()
+    raise ValueError("No built-in metadata for dataset {}".format(dataset_name))
+
+
+def _get_lvis_instances_meta_v0_5():
+    assert len(LVIS_V0_5_CATEGORIES) == 1230
+    cat_ids = [k["id"] for k in LVIS_V0_5_CATEGORIES]
+    assert min(cat_ids) == 1 and max(cat_ids) == len(
+        cat_ids
+    ), "Category ids are not in [1, #categories], as expected"
+    # Ensure that the category list is sorted by id
+    lvis_categories = sorted(LVIS_V0_5_CATEGORIES, key=lambda x: x["id"])
+    thing_classes = [k["synonyms"][0] for k in lvis_categories]
+    meta = {"thing_classes": thing_classes}
+    return meta
+
+
+def _get_lvis_instances_meta_v1():
+    assert len(LVIS_V1_CATEGORIES) == 1203
+    cat_ids = [k["id"] for k in LVIS_V1_CATEGORIES]
+    assert min(cat_ids) == 1 and max(cat_ids) == len(
+        cat_ids
+    ), "Category ids are not in [1, #categories], as expected"
+    # Ensure that the category list is sorted by id
+    lvis_categories = sorted(LVIS_V1_CATEGORIES, key=lambda x: x["id"])
+    thing_classes = [k["synonyms"][0] for k in lvis_categories]
+    meta = {
+        "thing_classes": thing_classes,
+        "class_image_count": LVIS_V1_CATEGORY_IMAGE_COUNT,
+    }
+    return meta
+
+
+def main() -> None:
+    global logger
+    """
+    Test the LVIS json dataset loader.
+
+    Usage:
+        python -m detectron2.data.datasets.lvis \
+            path/to/json path/to/image_root dataset_name vis_limit
+    """
+    import sys
+
+    import detectron2.data.datasets  # noqa  # add pre-defined metadata
+    import numpy as np
+    from detectron2.utils.logger import setup_logger
+    from detectron2.utils.visualizer import Visualizer
+    from PIL import Image
+
+    logger = setup_logger(name=__name__)
+    meta = MetadataCatalog.get(sys.argv[3])
+
+    dicts = load_lvis_json(sys.argv[1], sys.argv[2], sys.argv[3])
+    logger.info("Done loading {} samples.".format(len(dicts)))
+
+    dirname = "lvis-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts[: int(sys.argv[4])]:
+        img = np.array(Image.open(d["file_name"]))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
+
+
+if __name__ == "__main__":
+    main()  # pragma: no cover
diff --git a/detectron2/data/datasets/lvis_v0_5_categories.py b/detectron2/data/datasets/lvis_v0_5_categories.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3dab6198da614937b08682f4c9edf52bdf1d236
--- /dev/null
+++ b/detectron2/data/datasets/lvis_v0_5_categories.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Autogen with
+# with open("lvis_v0.5_val.json", "r") as f:
+#     a = json.load(f)
+# c = a["categories"]
+# for x in c:
+#     del x["image_count"]
+#     del x["instance_count"]
+# LVIS_CATEGORIES = repr(c) + "  # noqa"
+
+# fmt: off
+LVIS_CATEGORIES = [{'frequency': 'r', 'id': 1, 'synset': 'acorn.n.01', 'synonyms': ['acorn'], 'def': 'nut from an oak tree', 'name': 'acorn'}, {'frequency': 'c', 'id': 2, 'synset': 'aerosol.n.02', 'synonyms': ['aerosol_can', 'spray_can'], 'def': 'a dispenser that holds a substance under pressure', 'name': 'aerosol_can'}, {'frequency': 'f', 'id': 3, 'synset': 'air_conditioner.n.01', 'synonyms': ['air_conditioner'], 'def': 'a machine that keeps air cool and dry', 'name': 'air_conditioner'}, {'frequency': 'f', 'id': 4, 'synset': 'airplane.n.01', 'synonyms': ['airplane', 'aeroplane'], 'def': 'an aircraft that has a fixed wing and is powered by propellers or jets', 'name': 'airplane'}, {'frequency': 'c', 'id': 5, 'synset': 'alarm_clock.n.01', 'synonyms': ['alarm_clock'], 'def': 'a clock that wakes a sleeper at some preset time', 'name': 'alarm_clock'}, {'frequency': 'c', 'id': 6, 'synset': 'alcohol.n.01', 'synonyms': ['alcohol', 'alcoholic_beverage'], 'def': 'a liquor or brew containing alcohol as the active agent', 'name': 'alcohol'}, {'frequency': 'r', 'id': 7, 'synset': 'alligator.n.02', 'synonyms': ['alligator', 'gator'], 'def': 'amphibious reptiles related to crocodiles but with shorter broader snouts', 'name': 'alligator'}, {'frequency': 'c', 'id': 8, 'synset': 'almond.n.02', 'synonyms': ['almond'], 'def': 'oval-shaped edible seed of the almond tree', 'name': 'almond'}, {'frequency': 'c', 'id': 9, 'synset': 'ambulance.n.01', 'synonyms': ['ambulance'], 'def': 'a vehicle that takes people to and from hospitals', 'name': 'ambulance'}, {'frequency': 'r', 'id': 10, 'synset': 'amplifier.n.01', 'synonyms': ['amplifier'], 'def': 'electronic equipment that increases strength of signals', 'name': 'amplifier'}, {'frequency': 'c', 'id': 11, 'synset': 'anklet.n.03', 'synonyms': ['anklet', 'ankle_bracelet'], 'def': 'an ornament worn around the ankle', 'name': 'anklet'}, {'frequency': 'f', 'id': 12, 'synset': 'antenna.n.01', 'synonyms': ['antenna', 'aerial', 'transmitting_aerial'], 'def': 'an electrical device that sends or receives radio or television signals', 'name': 'antenna'}, {'frequency': 'f', 'id': 13, 'synset': 'apple.n.01', 'synonyms': ['apple'], 'def': 'fruit with red or yellow or green skin and sweet to tart crisp whitish flesh', 'name': 'apple'}, {'frequency': 'r', 'id': 14, 'synset': 'apple_juice.n.01', 'synonyms': ['apple_juice'], 'def': 'the juice of apples', 'name': 'apple_juice'}, {'frequency': 'r', 'id': 15, 'synset': 'applesauce.n.01', 'synonyms': ['applesauce'], 'def': 'puree of stewed apples usually sweetened and spiced', 'name': 'applesauce'}, {'frequency': 'r', 'id': 16, 'synset': 'apricot.n.02', 'synonyms': ['apricot'], 'def': 'downy yellow to rosy-colored fruit resembling a small peach', 'name': 'apricot'}, {'frequency': 'f', 'id': 17, 'synset': 'apron.n.01', 'synonyms': ['apron'], 'def': 'a garment of cloth that is tied about the waist and worn to protect clothing', 'name': 'apron'}, {'frequency': 'c', 'id': 18, 'synset': 'aquarium.n.01', 'synonyms': ['aquarium', 'fish_tank'], 'def': 'a tank/pool/bowl filled with water for keeping live fish and underwater animals', 'name': 'aquarium'}, {'frequency': 'c', 'id': 19, 'synset': 'armband.n.02', 'synonyms': ['armband'], 'def': 'a band worn around the upper arm', 'name': 'armband'}, {'frequency': 'f', 'id': 20, 'synset': 'armchair.n.01', 'synonyms': ['armchair'], 'def': 'chair with a support on each side for arms', 'name': 'armchair'}, {'frequency': 'r', 'id': 21, 'synset': 'armoire.n.01', 'synonyms': ['armoire'], 'def': 'a large wardrobe or cabinet', 'name': 'armoire'}, {'frequency': 'r', 'id': 22, 'synset': 'armor.n.01', 'synonyms': ['armor', 'armour'], 'def': 'protective covering made of metal and used in combat', 'name': 'armor'}, {'frequency': 'c', 'id': 23, 'synset': 'artichoke.n.02', 'synonyms': ['artichoke'], 'def': 'a thistlelike flower head with edible fleshy leaves and heart', 'name': 'artichoke'}, {'frequency': 'f', 'id': 24, 'synset': 'ashcan.n.01', 'synonyms': ['trash_can', 'garbage_can', 'wastebin', 'dustbin', 'trash_barrel', 'trash_bin'], 'def': 'a bin that holds rubbish until it is collected', 'name': 'trash_can'}, {'frequency': 'c', 'id': 25, 'synset': 'ashtray.n.01', 'synonyms': ['ashtray'], 'def': "a receptacle for the ash from smokers' cigars or cigarettes", 'name': 'ashtray'}, {'frequency': 'c', 'id': 26, 'synset': 'asparagus.n.02', 'synonyms': ['asparagus'], 'def': 'edible young shoots of the asparagus plant', 'name': 'asparagus'}, {'frequency': 'c', 'id': 27, 'synset': 'atomizer.n.01', 'synonyms': ['atomizer', 'atomiser', 'spray', 'sprayer', 'nebulizer', 'nebuliser'], 'def': 'a dispenser that turns a liquid (such as perfume) into a fine mist', 'name': 'atomizer'}, {'frequency': 'c', 'id': 28, 'synset': 'avocado.n.01', 'synonyms': ['avocado'], 'def': 'a pear-shaped fruit with green or blackish skin and rich yellowish pulp enclosing a single large seed', 'name': 'avocado'}, {'frequency': 'c', 'id': 29, 'synset': 'award.n.02', 'synonyms': ['award', 'accolade'], 'def': 'a tangible symbol signifying approval or distinction', 'name': 'award'}, {'frequency': 'f', 'id': 30, 'synset': 'awning.n.01', 'synonyms': ['awning'], 'def': 'a canopy made of canvas to shelter people or things from rain or sun', 'name': 'awning'}, {'frequency': 'r', 'id': 31, 'synset': 'ax.n.01', 'synonyms': ['ax', 'axe'], 'def': 'an edge tool with a heavy bladed head mounted across a handle', 'name': 'ax'}, {'frequency': 'f', 'id': 32, 'synset': 'baby_buggy.n.01', 'synonyms': ['baby_buggy', 'baby_carriage', 'perambulator', 'pram', 'stroller'], 'def': 'a small vehicle with four wheels in which a baby or child is pushed around', 'name': 'baby_buggy'}, {'frequency': 'c', 'id': 33, 'synset': 'backboard.n.01', 'synonyms': ['basketball_backboard'], 'def': 'a raised vertical board with basket attached; used to play basketball', 'name': 'basketball_backboard'}, {'frequency': 'f', 'id': 34, 'synset': 'backpack.n.01', 'synonyms': ['backpack', 'knapsack', 'packsack', 'rucksack', 'haversack'], 'def': 'a bag carried by a strap on your back or shoulder', 'name': 'backpack'}, {'frequency': 'f', 'id': 35, 'synset': 'bag.n.04', 'synonyms': ['handbag', 'purse', 'pocketbook'], 'def': 'a container used for carrying money and small personal items or accessories', 'name': 'handbag'}, {'frequency': 'f', 'id': 36, 'synset': 'bag.n.06', 'synonyms': ['suitcase', 'baggage', 'luggage'], 'def': 'cases used to carry belongings when traveling', 'name': 'suitcase'}, {'frequency': 'c', 'id': 37, 'synset': 'bagel.n.01', 'synonyms': ['bagel', 'beigel'], 'def': 'glazed yeast-raised doughnut-shaped roll with hard crust', 'name': 'bagel'}, {'frequency': 'r', 'id': 38, 'synset': 'bagpipe.n.01', 'synonyms': ['bagpipe'], 'def': 'a tubular wind instrument; the player blows air into a bag and squeezes it out', 'name': 'bagpipe'}, {'frequency': 'r', 'id': 39, 'synset': 'baguet.n.01', 'synonyms': ['baguet', 'baguette'], 'def': 'narrow French stick loaf', 'name': 'baguet'}, {'frequency': 'r', 'id': 40, 'synset': 'bait.n.02', 'synonyms': ['bait', 'lure'], 'def': 'something used to lure fish or other animals into danger so they can be trapped or killed', 'name': 'bait'}, {'frequency': 'f', 'id': 41, 'synset': 'ball.n.06', 'synonyms': ['ball'], 'def': 'a spherical object used as a plaything', 'name': 'ball'}, {'frequency': 'r', 'id': 42, 'synset': 'ballet_skirt.n.01', 'synonyms': ['ballet_skirt', 'tutu'], 'def': 'very short skirt worn by ballerinas', 'name': 'ballet_skirt'}, {'frequency': 'f', 'id': 43, 'synset': 'balloon.n.01', 'synonyms': ['balloon'], 'def': 'large tough nonrigid bag filled with gas or heated air', 'name': 'balloon'}, {'frequency': 'c', 'id': 44, 'synset': 'bamboo.n.02', 'synonyms': ['bamboo'], 'def': 'woody tropical grass having hollow woody stems', 'name': 'bamboo'}, {'frequency': 'f', 'id': 45, 'synset': 'banana.n.02', 'synonyms': ['banana'], 'def': 'elongated crescent-shaped yellow fruit with soft sweet flesh', 'name': 'banana'}, {'frequency': 'r', 'id': 46, 'synset': 'band_aid.n.01', 'synonyms': ['Band_Aid'], 'def': 'trade name for an adhesive bandage to cover small cuts or blisters', 'name': 'Band_Aid'}, {'frequency': 'c', 'id': 47, 'synset': 'bandage.n.01', 'synonyms': ['bandage'], 'def': 'a piece of soft material that covers and protects an injured part of the body', 'name': 'bandage'}, {'frequency': 'c', 'id': 48, 'synset': 'bandanna.n.01', 'synonyms': ['bandanna', 'bandana'], 'def': 'large and brightly colored handkerchief; often used as a neckerchief', 'name': 'bandanna'}, {'frequency': 'r', 'id': 49, 'synset': 'banjo.n.01', 'synonyms': ['banjo'], 'def': 'a stringed instrument of the guitar family with a long neck and circular body', 'name': 'banjo'}, {'frequency': 'f', 'id': 50, 'synset': 'banner.n.01', 'synonyms': ['banner', 'streamer'], 'def': 'long strip of cloth or paper used for decoration or advertising', 'name': 'banner'}, {'frequency': 'r', 'id': 51, 'synset': 'barbell.n.01', 'synonyms': ['barbell'], 'def': 'a bar to which heavy discs are attached at each end; used in weightlifting', 'name': 'barbell'}, {'frequency': 'r', 'id': 52, 'synset': 'barge.n.01', 'synonyms': ['barge'], 'def': 'a flatbottom boat for carrying heavy loads (especially on canals)', 'name': 'barge'}, {'frequency': 'f', 'id': 53, 'synset': 'barrel.n.02', 'synonyms': ['barrel', 'cask'], 'def': 'a cylindrical container that holds liquids', 'name': 'barrel'}, {'frequency': 'c', 'id': 54, 'synset': 'barrette.n.01', 'synonyms': ['barrette'], 'def': "a pin for holding women's hair in place", 'name': 'barrette'}, {'frequency': 'c', 'id': 55, 'synset': 'barrow.n.03', 'synonyms': ['barrow', 'garden_cart', 'lawn_cart', 'wheelbarrow'], 'def': 'a cart for carrying small loads; has handles and one or more wheels', 'name': 'barrow'}, {'frequency': 'f', 'id': 56, 'synset': 'base.n.03', 'synonyms': ['baseball_base'], 'def': 'a place that the runner must touch before scoring', 'name': 'baseball_base'}, {'frequency': 'f', 'id': 57, 'synset': 'baseball.n.02', 'synonyms': ['baseball'], 'def': 'a ball used in playing baseball', 'name': 'baseball'}, {'frequency': 'f', 'id': 58, 'synset': 'baseball_bat.n.01', 'synonyms': ['baseball_bat'], 'def': 'an implement used in baseball by the batter', 'name': 'baseball_bat'}, {'frequency': 'f', 'id': 59, 'synset': 'baseball_cap.n.01', 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'def': 'a cap with a bill', 'name': 'baseball_cap'}, {'frequency': 'f', 'id': 60, 'synset': 'baseball_glove.n.01', 'synonyms': ['baseball_glove', 'baseball_mitt'], 'def': 'the handwear used by fielders in playing baseball', 'name': 'baseball_glove'}, {'frequency': 'f', 'id': 61, 'synset': 'basket.n.01', 'synonyms': ['basket', 'handbasket'], 'def': 'a container that is usually woven and has handles', 'name': 'basket'}, {'frequency': 'c', 'id': 62, 'synset': 'basket.n.03', 'synonyms': ['basketball_hoop'], 'def': 'metal hoop supporting a net through which players try to throw the basketball', 'name': 'basketball_hoop'}, {'frequency': 'c', 'id': 63, 'synset': 'basketball.n.02', 'synonyms': ['basketball'], 'def': 'an inflated ball used in playing basketball', 'name': 'basketball'}, {'frequency': 'r', 'id': 64, 'synset': 'bass_horn.n.01', 'synonyms': ['bass_horn', 'sousaphone', 'tuba'], 'def': 'the lowest brass wind instrument', 'name': 'bass_horn'}, {'frequency': 'r', 'id': 65, 'synset': 'bat.n.01', 'synonyms': ['bat_(animal)'], 'def': 'nocturnal mouselike mammal with forelimbs modified to form membranous wings', 'name': 'bat_(animal)'}, {'frequency': 'f', 'id': 66, 'synset': 'bath_mat.n.01', 'synonyms': ['bath_mat'], 'def': 'a heavy towel or mat to stand on while drying yourself after a bath', 'name': 'bath_mat'}, {'frequency': 'f', 'id': 67, 'synset': 'bath_towel.n.01', 'synonyms': ['bath_towel'], 'def': 'a large towel; to dry yourself after a bath', 'name': 'bath_towel'}, {'frequency': 'c', 'id': 68, 'synset': 'bathrobe.n.01', 'synonyms': ['bathrobe'], 'def': 'a loose-fitting robe of towelling; worn after a bath or swim', 'name': 'bathrobe'}, {'frequency': 'f', 'id': 69, 'synset': 'bathtub.n.01', 'synonyms': ['bathtub', 'bathing_tub'], 'def': 'a large open container that you fill with water and use to wash the body', 'name': 'bathtub'}, {'frequency': 'r', 'id': 70, 'synset': 'batter.n.02', 'synonyms': ['batter_(food)'], 'def': 'a liquid or semiliquid mixture, as of flour, eggs, and milk, used in cooking', 'name': 'batter_(food)'}, {'frequency': 'c', 'id': 71, 'synset': 'battery.n.02', 'synonyms': ['battery'], 'def': 'a portable device that produces electricity', 'name': 'battery'}, {'frequency': 'r', 'id': 72, 'synset': 'beach_ball.n.01', 'synonyms': ['beachball'], 'def': 'large and light ball; for play at the seaside', 'name': 'beachball'}, {'frequency': 'c', 'id': 73, 'synset': 'bead.n.01', 'synonyms': ['bead'], 'def': 'a small ball with a hole through the middle used for ornamentation, jewellery, etc.', 'name': 'bead'}, {'frequency': 'r', 'id': 74, 'synset': 'beaker.n.01', 'synonyms': ['beaker'], 'def': 'a flatbottomed jar made of glass or plastic; used for chemistry', 'name': 'beaker'}, {'frequency': 'c', 'id': 75, 'synset': 'bean_curd.n.01', 'synonyms': ['bean_curd', 'tofu'], 'def': 'cheeselike food made of curdled soybean milk', 'name': 'bean_curd'}, {'frequency': 'c', 'id': 76, 'synset': 'beanbag.n.01', 'synonyms': ['beanbag'], 'def': 'a bag filled with dried beans or similar items; used in games or to sit on', 'name': 'beanbag'}, {'frequency': 'f', 'id': 77, 'synset': 'beanie.n.01', 'synonyms': ['beanie', 'beany'], 'def': 'a small skullcap; formerly worn by schoolboys and college freshmen', 'name': 'beanie'}, {'frequency': 'f', 'id': 78, 'synset': 'bear.n.01', 'synonyms': ['bear'], 'def': 'large carnivorous or omnivorous mammals with shaggy coats and claws', 'name': 'bear'}, {'frequency': 'f', 'id': 79, 'synset': 'bed.n.01', 'synonyms': ['bed'], 'def': 'a piece of furniture that provides a place to sleep', 'name': 'bed'}, {'frequency': 'c', 'id': 80, 'synset': 'bedspread.n.01', 'synonyms': ['bedspread', 'bedcover', 'bed_covering', 'counterpane', 'spread'], 'def': 'decorative cover for a bed', 'name': 'bedspread'}, {'frequency': 'f', 'id': 81, 'synset': 'beef.n.01', 'synonyms': ['cow'], 'def': 'cattle that are reared for their meat', 'name': 'cow'}, {'frequency': 'c', 'id': 82, 'synset': 'beef.n.02', 'synonyms': ['beef_(food)', 'boeuf_(food)'], 'def': 'meat from an adult domestic bovine', 'name': 'beef_(food)'}, {'frequency': 'r', 'id': 83, 'synset': 'beeper.n.01', 'synonyms': ['beeper', 'pager'], 'def': 'an device that beeps when the person carrying it is being paged', 'name': 'beeper'}, {'frequency': 'f', 'id': 84, 'synset': 'beer_bottle.n.01', 'synonyms': ['beer_bottle'], 'def': 'a bottle that holds beer', 'name': 'beer_bottle'}, {'frequency': 'c', 'id': 85, 'synset': 'beer_can.n.01', 'synonyms': ['beer_can'], 'def': 'a can that holds beer', 'name': 'beer_can'}, {'frequency': 'r', 'id': 86, 'synset': 'beetle.n.01', 'synonyms': ['beetle'], 'def': 'insect with hard wing covers', 'name': 'beetle'}, {'frequency': 'f', 'id': 87, 'synset': 'bell.n.01', 'synonyms': ['bell'], 'def': 'a hollow device made of metal that makes a ringing sound when struck', 'name': 'bell'}, {'frequency': 'f', 'id': 88, 'synset': 'bell_pepper.n.02', 'synonyms': ['bell_pepper', 'capsicum'], 'def': 'large bell-shaped sweet pepper in green or red or yellow or orange or black varieties', 'name': 'bell_pepper'}, {'frequency': 'f', 'id': 89, 'synset': 'belt.n.02', 'synonyms': ['belt'], 'def': 'a band to tie or buckle around the body (usually at the waist)', 'name': 'belt'}, {'frequency': 'f', 'id': 90, 'synset': 'belt_buckle.n.01', 'synonyms': ['belt_buckle'], 'def': 'the buckle used to fasten a belt', 'name': 'belt_buckle'}, {'frequency': 'f', 'id': 91, 'synset': 'bench.n.01', 'synonyms': ['bench'], 'def': 'a long seat for more than one person', 'name': 'bench'}, {'frequency': 'c', 'id': 92, 'synset': 'beret.n.01', 'synonyms': ['beret'], 'def': 'a cap with no brim or bill; made of soft cloth', 'name': 'beret'}, {'frequency': 'c', 'id': 93, 'synset': 'bib.n.02', 'synonyms': ['bib'], 'def': 'a napkin tied under the chin of a child while eating', 'name': 'bib'}, {'frequency': 'r', 'id': 94, 'synset': 'bible.n.01', 'synonyms': ['Bible'], 'def': 'the sacred writings of the Christian religions', 'name': 'Bible'}, {'frequency': 'f', 'id': 95, 'synset': 'bicycle.n.01', 'synonyms': ['bicycle', 'bike_(bicycle)'], 'def': 'a wheeled vehicle that has two wheels and is moved by foot pedals', 'name': 'bicycle'}, {'frequency': 'f', 'id': 96, 'synset': 'bill.n.09', 'synonyms': ['visor', 'vizor'], 'def': 'a brim that projects to the front to shade the eyes', 'name': 'visor'}, {'frequency': 'c', 'id': 97, 'synset': 'binder.n.03', 'synonyms': ['binder', 'ring-binder'], 'def': 'holds loose papers or magazines', 'name': 'binder'}, {'frequency': 'c', 'id': 98, 'synset': 'binoculars.n.01', 'synonyms': ['binoculars', 'field_glasses', 'opera_glasses'], 'def': 'an optical instrument designed for simultaneous use by both eyes', 'name': 'binoculars'}, {'frequency': 'f', 'id': 99, 'synset': 'bird.n.01', 'synonyms': ['bird'], 'def': 'animal characterized by feathers and wings', 'name': 'bird'}, {'frequency': 'r', 'id': 100, 'synset': 'bird_feeder.n.01', 'synonyms': ['birdfeeder'], 'def': 'an outdoor device that supplies food for wild birds', 'name': 'birdfeeder'}, {'frequency': 'r', 'id': 101, 'synset': 'birdbath.n.01', 'synonyms': ['birdbath'], 'def': 'an ornamental basin (usually in a garden) for birds to bathe in', 'name': 'birdbath'}, {'frequency': 'c', 'id': 102, 'synset': 'birdcage.n.01', 'synonyms': ['birdcage'], 'def': 'a cage in which a bird can be kept', 'name': 'birdcage'}, {'frequency': 'c', 'id': 103, 'synset': 'birdhouse.n.01', 'synonyms': ['birdhouse'], 'def': 'a shelter for birds', 'name': 'birdhouse'}, {'frequency': 'f', 'id': 104, 'synset': 'birthday_cake.n.01', 'synonyms': ['birthday_cake'], 'def': 'decorated cake served at a birthday party', 'name': 'birthday_cake'}, {'frequency': 'r', 'id': 105, 'synset': 'birthday_card.n.01', 'synonyms': ['birthday_card'], 'def': 'a card expressing a birthday greeting', 'name': 'birthday_card'}, {'frequency': 'r', 'id': 106, 'synset': 'biscuit.n.01', 'synonyms': ['biscuit_(bread)'], 'def': 'small round bread leavened with baking-powder or soda', 'name': 'biscuit_(bread)'}, {'frequency': 'r', 'id': 107, 'synset': 'black_flag.n.01', 'synonyms': ['pirate_flag'], 'def': 'a flag usually bearing a white skull and crossbones on a black background', 'name': 'pirate_flag'}, {'frequency': 'c', 'id': 108, 'synset': 'black_sheep.n.02', 'synonyms': ['black_sheep'], 'def': 'sheep with a black coat', 'name': 'black_sheep'}, {'frequency': 'c', 'id': 109, 'synset': 'blackboard.n.01', 'synonyms': ['blackboard', 'chalkboard'], 'def': 'sheet of slate; for writing with chalk', 'name': 'blackboard'}, {'frequency': 'f', 'id': 110, 'synset': 'blanket.n.01', 'synonyms': ['blanket'], 'def': 'bedding that keeps a person warm in bed', 'name': 'blanket'}, {'frequency': 'c', 'id': 111, 'synset': 'blazer.n.01', 'synonyms': ['blazer', 'sport_jacket', 'sport_coat', 'sports_jacket', 'sports_coat'], 'def': 'lightweight jacket; often striped in the colors of a club or school', 'name': 'blazer'}, {'frequency': 'f', 'id': 112, 'synset': 'blender.n.01', 'synonyms': ['blender', 'liquidizer', 'liquidiser'], 'def': 'an electrically powered mixer that mix or chop or liquefy foods', 'name': 'blender'}, {'frequency': 'r', 'id': 113, 'synset': 'blimp.n.02', 'synonyms': ['blimp'], 'def': 'a small nonrigid airship used for observation or as a barrage balloon', 'name': 'blimp'}, {'frequency': 'c', 'id': 114, 'synset': 'blinker.n.01', 'synonyms': ['blinker', 'flasher'], 'def': 'a light that flashes on and off; used as a signal or to send messages', 'name': 'blinker'}, {'frequency': 'c', 'id': 115, 'synset': 'blueberry.n.02', 'synonyms': ['blueberry'], 'def': 'sweet edible dark-blue berries of blueberry plants', 'name': 'blueberry'}, {'frequency': 'r', 'id': 116, 'synset': 'boar.n.02', 'synonyms': ['boar'], 'def': 'an uncastrated male hog', 'name': 'boar'}, {'frequency': 'r', 'id': 117, 'synset': 'board.n.09', 'synonyms': ['gameboard'], 'def': 'a flat portable surface (usually rectangular) designed for board games', 'name': 'gameboard'}, {'frequency': 'f', 'id': 118, 'synset': 'boat.n.01', 'synonyms': ['boat', 'ship_(boat)'], 'def': 'a vessel for travel on water', 'name': 'boat'}, {'frequency': 'c', 'id': 119, 'synset': 'bobbin.n.01', 'synonyms': ['bobbin', 'spool', 'reel'], 'def': 'a thing around which thread/tape/film or other flexible materials can be wound', 'name': 'bobbin'}, {'frequency': 'r', 'id': 120, 'synset': 'bobby_pin.n.01', 'synonyms': ['bobby_pin', 'hairgrip'], 'def': 'a flat wire hairpin used to hold bobbed hair in place', 'name': 'bobby_pin'}, {'frequency': 'c', 'id': 121, 'synset': 'boiled_egg.n.01', 'synonyms': ['boiled_egg', 'coddled_egg'], 'def': 'egg cooked briefly in the shell in gently boiling water', 'name': 'boiled_egg'}, {'frequency': 'r', 'id': 122, 'synset': 'bolo_tie.n.01', 'synonyms': ['bolo_tie', 'bolo', 'bola_tie', 'bola'], 'def': 'a cord fastened around the neck with an ornamental clasp and worn as a necktie', 'name': 'bolo_tie'}, {'frequency': 'c', 'id': 123, 'synset': 'bolt.n.03', 'synonyms': ['deadbolt'], 'def': 'the part of a lock that is engaged or withdrawn with a key', 'name': 'deadbolt'}, {'frequency': 'f', 'id': 124, 'synset': 'bolt.n.06', 'synonyms': ['bolt'], 'def': 'a screw that screws into a nut to form a fastener', 'name': 'bolt'}, {'frequency': 'r', 'id': 125, 'synset': 'bonnet.n.01', 'synonyms': ['bonnet'], 'def': 'a hat tied under the chin', 'name': 'bonnet'}, {'frequency': 'f', 'id': 126, 'synset': 'book.n.01', 'synonyms': ['book'], 'def': 'a written work or composition that has been published', 'name': 'book'}, {'frequency': 'r', 'id': 127, 'synset': 'book_bag.n.01', 'synonyms': ['book_bag'], 'def': 'a bag in which students carry their books', 'name': 'book_bag'}, {'frequency': 'c', 'id': 128, 'synset': 'bookcase.n.01', 'synonyms': ['bookcase'], 'def': 'a piece of furniture with shelves for storing books', 'name': 'bookcase'}, {'frequency': 'c', 'id': 129, 'synset': 'booklet.n.01', 'synonyms': ['booklet', 'brochure', 'leaflet', 'pamphlet'], 'def': 'a small book usually having a paper cover', 'name': 'booklet'}, {'frequency': 'r', 'id': 130, 'synset': 'bookmark.n.01', 'synonyms': ['bookmark', 'bookmarker'], 'def': 'a marker (a piece of paper or ribbon) placed between the pages of a book', 'name': 'bookmark'}, {'frequency': 'r', 'id': 131, 'synset': 'boom.n.04', 'synonyms': ['boom_microphone', 'microphone_boom'], 'def': 'a pole carrying an overhead microphone projected over a film or tv set', 'name': 'boom_microphone'}, {'frequency': 'f', 'id': 132, 'synset': 'boot.n.01', 'synonyms': ['boot'], 'def': 'footwear that covers the whole foot and lower leg', 'name': 'boot'}, {'frequency': 'f', 'id': 133, 'synset': 'bottle.n.01', 'synonyms': ['bottle'], 'def': 'a glass or plastic vessel used for storing drinks or other liquids', 'name': 'bottle'}, {'frequency': 'c', 'id': 134, 'synset': 'bottle_opener.n.01', 'synonyms': ['bottle_opener'], 'def': 'an opener for removing caps or corks from bottles', 'name': 'bottle_opener'}, {'frequency': 'c', 'id': 135, 'synset': 'bouquet.n.01', 'synonyms': ['bouquet'], 'def': 'an arrangement of flowers that is usually given as a present', 'name': 'bouquet'}, {'frequency': 'r', 'id': 136, 'synset': 'bow.n.04', 'synonyms': ['bow_(weapon)'], 'def': 'a weapon for shooting arrows', 'name': 'bow_(weapon)'}, {'frequency': 'f', 'id': 137, 'synset': 'bow.n.08', 'synonyms': ['bow_(decorative_ribbons)'], 'def': 'a decorative interlacing of ribbons', 'name': 'bow_(decorative_ribbons)'}, {'frequency': 'f', 'id': 138, 'synset': 'bow_tie.n.01', 'synonyms': ['bow-tie', 'bowtie'], 'def': "a man's tie that ties in a bow", 'name': 'bow-tie'}, {'frequency': 'f', 'id': 139, 'synset': 'bowl.n.03', 'synonyms': ['bowl'], 'def': 'a dish that is round and open at the top for serving foods', 'name': 'bowl'}, {'frequency': 'r', 'id': 140, 'synset': 'bowl.n.08', 'synonyms': ['pipe_bowl'], 'def': 'a small round container that is open at the top for holding tobacco', 'name': 'pipe_bowl'}, {'frequency': 'c', 'id': 141, 'synset': 'bowler_hat.n.01', 'synonyms': ['bowler_hat', 'bowler', 'derby_hat', 'derby', 'plug_hat'], 'def': 'a felt hat that is round and hard with a narrow brim', 'name': 'bowler_hat'}, {'frequency': 'r', 'id': 142, 'synset': 'bowling_ball.n.01', 'synonyms': ['bowling_ball'], 'def': 'a large ball with finger holes used in the sport of bowling', 'name': 'bowling_ball'}, {'frequency': 'r', 'id': 143, 'synset': 'bowling_pin.n.01', 'synonyms': ['bowling_pin'], 'def': 'a club-shaped wooden object used in bowling', 'name': 'bowling_pin'}, {'frequency': 'r', 'id': 144, 'synset': 'boxing_glove.n.01', 'synonyms': ['boxing_glove'], 'def': 'large glove coverings the fists of a fighter worn for the sport of boxing', 'name': 'boxing_glove'}, {'frequency': 'c', 'id': 145, 'synset': 'brace.n.06', 'synonyms': ['suspenders'], 'def': 'elastic straps that hold trousers up (usually used in the plural)', 'name': 'suspenders'}, {'frequency': 'f', 'id': 146, 'synset': 'bracelet.n.02', 'synonyms': ['bracelet', 'bangle'], 'def': 'jewelry worn around the wrist for decoration', 'name': 'bracelet'}, {'frequency': 'r', 'id': 147, 'synset': 'brass.n.07', 'synonyms': ['brass_plaque'], 'def': 'a memorial made of brass', 'name': 'brass_plaque'}, {'frequency': 'c', 'id': 148, 'synset': 'brassiere.n.01', 'synonyms': ['brassiere', 'bra', 'bandeau'], 'def': 'an undergarment worn by women to support their breasts', 'name': 'brassiere'}, {'frequency': 'c', 'id': 149, 'synset': 'bread-bin.n.01', 'synonyms': ['bread-bin', 'breadbox'], 'def': 'a container used to keep bread or cake in', 'name': 'bread-bin'}, {'frequency': 'r', 'id': 150, 'synset': 'breechcloth.n.01', 'synonyms': ['breechcloth', 'breechclout', 'loincloth'], 'def': 'a garment that provides covering for the loins', 'name': 'breechcloth'}, {'frequency': 'c', 'id': 151, 'synset': 'bridal_gown.n.01', 'synonyms': ['bridal_gown', 'wedding_gown', 'wedding_dress'], 'def': 'a gown worn by the bride at a wedding', 'name': 'bridal_gown'}, {'frequency': 'c', 'id': 152, 'synset': 'briefcase.n.01', 'synonyms': ['briefcase'], 'def': 'a case with a handle; for carrying papers or files or books', 'name': 'briefcase'}, {'frequency': 'c', 'id': 153, 'synset': 'bristle_brush.n.01', 'synonyms': ['bristle_brush'], 'def': 'a brush that is made with the short stiff hairs of an animal or plant', 'name': 'bristle_brush'}, {'frequency': 'f', 'id': 154, 'synset': 'broccoli.n.01', 'synonyms': ['broccoli'], 'def': 'plant with dense clusters of tight green flower buds', 'name': 'broccoli'}, {'frequency': 'r', 'id': 155, 'synset': 'brooch.n.01', 'synonyms': ['broach'], 'def': 'a decorative pin worn by women', 'name': 'broach'}, {'frequency': 'c', 'id': 156, 'synset': 'broom.n.01', 'synonyms': ['broom'], 'def': 'bundle of straws or twigs attached to a long handle; used for cleaning', 'name': 'broom'}, {'frequency': 'c', 'id': 157, 'synset': 'brownie.n.03', 'synonyms': ['brownie'], 'def': 'square or bar of very rich chocolate cake usually with nuts', 'name': 'brownie'}, {'frequency': 'c', 'id': 158, 'synset': 'brussels_sprouts.n.01', 'synonyms': ['brussels_sprouts'], 'def': 'the small edible cabbage-like buds growing along a stalk', 'name': 'brussels_sprouts'}, {'frequency': 'r', 'id': 159, 'synset': 'bubble_gum.n.01', 'synonyms': ['bubble_gum'], 'def': 'a kind of chewing gum that can be blown into bubbles', 'name': 'bubble_gum'}, {'frequency': 'f', 'id': 160, 'synset': 'bucket.n.01', 'synonyms': ['bucket', 'pail'], 'def': 'a roughly cylindrical vessel that is open at the top', 'name': 'bucket'}, {'frequency': 'r', 'id': 161, 'synset': 'buggy.n.01', 'synonyms': ['horse_buggy'], 'def': 'a small lightweight carriage; drawn by a single horse', 'name': 'horse_buggy'}, {'frequency': 'c', 'id': 162, 'synset': 'bull.n.11', 'synonyms': ['bull'], 'def': 'mature male cow', 'name': 'bull'}, {'frequency': 'r', 'id': 163, 'synset': 'bulldog.n.01', 'synonyms': ['bulldog'], 'def': 'a thickset short-haired dog with a large head and strong undershot lower jaw', 'name': 'bulldog'}, {'frequency': 'r', 'id': 164, 'synset': 'bulldozer.n.01', 'synonyms': ['bulldozer', 'dozer'], 'def': 'large powerful tractor; a large blade in front flattens areas of ground', 'name': 'bulldozer'}, {'frequency': 'c', 'id': 165, 'synset': 'bullet_train.n.01', 'synonyms': ['bullet_train'], 'def': 'a high-speed passenger train', 'name': 'bullet_train'}, {'frequency': 'c', 'id': 166, 'synset': 'bulletin_board.n.02', 'synonyms': ['bulletin_board', 'notice_board'], 'def': 'a board that hangs on a wall; displays announcements', 'name': 'bulletin_board'}, {'frequency': 'r', 'id': 167, 'synset': 'bulletproof_vest.n.01', 'synonyms': ['bulletproof_vest'], 'def': 'a vest capable of resisting the impact of a bullet', 'name': 'bulletproof_vest'}, {'frequency': 'c', 'id': 168, 'synset': 'bullhorn.n.01', 'synonyms': ['bullhorn', 'megaphone'], 'def': 'a portable loudspeaker with built-in microphone and amplifier', 'name': 'bullhorn'}, {'frequency': 'r', 'id': 169, 'synset': 'bully_beef.n.01', 'synonyms': ['corned_beef', 'corn_beef'], 'def': 'beef cured or pickled in brine', 'name': 'corned_beef'}, {'frequency': 'f', 'id': 170, 'synset': 'bun.n.01', 'synonyms': ['bun', 'roll'], 'def': 'small rounded bread either plain or sweet', 'name': 'bun'}, {'frequency': 'c', 'id': 171, 'synset': 'bunk_bed.n.01', 'synonyms': ['bunk_bed'], 'def': 'beds built one above the other', 'name': 'bunk_bed'}, {'frequency': 'f', 'id': 172, 'synset': 'buoy.n.01', 'synonyms': ['buoy'], 'def': 'a float attached by rope to the seabed to mark channels in a harbor or underwater hazards', 'name': 'buoy'}, {'frequency': 'r', 'id': 173, 'synset': 'burrito.n.01', 'synonyms': ['burrito'], 'def': 'a flour tortilla folded around a filling', 'name': 'burrito'}, {'frequency': 'f', 'id': 174, 'synset': 'bus.n.01', 'synonyms': ['bus_(vehicle)', 'autobus', 'charabanc', 'double-decker', 'motorbus', 'motorcoach'], 'def': 'a vehicle carrying many passengers; used for public transport', 'name': 'bus_(vehicle)'}, {'frequency': 'c', 'id': 175, 'synset': 'business_card.n.01', 'synonyms': ['business_card'], 'def': "a card on which are printed the person's name and business affiliation", 'name': 'business_card'}, {'frequency': 'c', 'id': 176, 'synset': 'butcher_knife.n.01', 'synonyms': ['butcher_knife'], 'def': 'a large sharp knife for cutting or trimming meat', 'name': 'butcher_knife'}, {'frequency': 'c', 'id': 177, 'synset': 'butter.n.01', 'synonyms': ['butter'], 'def': 'an edible emulsion of fat globules made by churning milk or cream; for cooking and table use', 'name': 'butter'}, {'frequency': 'c', 'id': 178, 'synset': 'butterfly.n.01', 'synonyms': ['butterfly'], 'def': 'insect typically having a slender body with knobbed antennae and broad colorful wings', 'name': 'butterfly'}, {'frequency': 'f', 'id': 179, 'synset': 'button.n.01', 'synonyms': ['button'], 'def': 'a round fastener sewn to shirts and coats etc to fit through buttonholes', 'name': 'button'}, {'frequency': 'f', 'id': 180, 'synset': 'cab.n.03', 'synonyms': ['cab_(taxi)', 'taxi', 'taxicab'], 'def': 'a car that takes passengers where they want to go in exchange for money', 'name': 'cab_(taxi)'}, {'frequency': 'r', 'id': 181, 'synset': 'cabana.n.01', 'synonyms': ['cabana'], 'def': 'a small tent used as a dressing room beside the sea or a swimming pool', 'name': 'cabana'}, {'frequency': 'r', 'id': 182, 'synset': 'cabin_car.n.01', 'synonyms': ['cabin_car', 'caboose'], 'def': 'a car on a freight train for use of the train crew; usually the last car on the train', 'name': 'cabin_car'}, {'frequency': 'f', 'id': 183, 'synset': 'cabinet.n.01', 'synonyms': ['cabinet'], 'def': 'a piece of furniture resembling a cupboard with doors and shelves and drawers', 'name': 'cabinet'}, {'frequency': 'r', 'id': 184, 'synset': 'cabinet.n.03', 'synonyms': ['locker', 'storage_locker'], 'def': 'a storage compartment for clothes and valuables; usually it has a lock', 'name': 'locker'}, {'frequency': 'f', 'id': 185, 'synset': 'cake.n.03', 'synonyms': ['cake'], 'def': 'baked goods made from or based on a mixture of flour, sugar, eggs, and fat', 'name': 'cake'}, {'frequency': 'c', 'id': 186, 'synset': 'calculator.n.02', 'synonyms': ['calculator'], 'def': 'a small machine that is used for mathematical calculations', 'name': 'calculator'}, {'frequency': 'f', 'id': 187, 'synset': 'calendar.n.02', 'synonyms': ['calendar'], 'def': 'a list or register of events (appointments/social events/court cases, etc)', 'name': 'calendar'}, {'frequency': 'c', 'id': 188, 'synset': 'calf.n.01', 'synonyms': ['calf'], 'def': 'young of domestic cattle', 'name': 'calf'}, {'frequency': 'c', 'id': 189, 'synset': 'camcorder.n.01', 'synonyms': ['camcorder'], 'def': 'a portable television camera and videocassette recorder', 'name': 'camcorder'}, {'frequency': 'c', 'id': 190, 'synset': 'camel.n.01', 'synonyms': ['camel'], 'def': 'cud-chewing mammal used as a draft or saddle animal in desert regions', 'name': 'camel'}, {'frequency': 'f', 'id': 191, 'synset': 'camera.n.01', 'synonyms': ['camera'], 'def': 'equipment for taking photographs', 'name': 'camera'}, {'frequency': 'c', 'id': 192, 'synset': 'camera_lens.n.01', 'synonyms': ['camera_lens'], 'def': 'a lens that focuses the image in a camera', 'name': 'camera_lens'}, {'frequency': 'c', 'id': 193, 'synset': 'camper.n.02', 'synonyms': ['camper_(vehicle)', 'camping_bus', 'motor_home'], 'def': 'a recreational vehicle equipped for camping out while traveling', 'name': 'camper_(vehicle)'}, {'frequency': 'f', 'id': 194, 'synset': 'can.n.01', 'synonyms': ['can', 'tin_can'], 'def': 'airtight sealed metal container for food or drink or paint etc.', 'name': 'can'}, {'frequency': 'c', 'id': 195, 'synset': 'can_opener.n.01', 'synonyms': ['can_opener', 'tin_opener'], 'def': 'a device for cutting cans open', 'name': 'can_opener'}, {'frequency': 'r', 'id': 196, 'synset': 'candelabrum.n.01', 'synonyms': ['candelabrum', 'candelabra'], 'def': 'branched candlestick; ornamental; has several lights', 'name': 'candelabrum'}, {'frequency': 'f', 'id': 197, 'synset': 'candle.n.01', 'synonyms': ['candle', 'candlestick'], 'def': 'stick of wax with a wick in the middle', 'name': 'candle'}, {'frequency': 'f', 'id': 198, 'synset': 'candlestick.n.01', 'synonyms': ['candle_holder'], 'def': 'a holder with sockets for candles', 'name': 'candle_holder'}, {'frequency': 'r', 'id': 199, 'synset': 'candy_bar.n.01', 'synonyms': ['candy_bar'], 'def': 'a candy shaped as a bar', 'name': 'candy_bar'}, {'frequency': 'c', 'id': 200, 'synset': 'candy_cane.n.01', 'synonyms': ['candy_cane'], 'def': 'a hard candy in the shape of a rod (usually with stripes)', 'name': 'candy_cane'}, {'frequency': 'c', 'id': 201, 'synset': 'cane.n.01', 'synonyms': ['walking_cane'], 'def': 'a stick that people can lean on to help them walk', 'name': 'walking_cane'}, {'frequency': 'c', 'id': 202, 'synset': 'canister.n.02', 'synonyms': ['canister', 'cannister'], 'def': 'metal container for storing dry foods such as tea or flour', 'name': 'canister'}, {'frequency': 'r', 'id': 203, 'synset': 'cannon.n.02', 'synonyms': ['cannon'], 'def': 'heavy gun fired from a tank', 'name': 'cannon'}, {'frequency': 'c', 'id': 204, 'synset': 'canoe.n.01', 'synonyms': ['canoe'], 'def': 'small and light boat; pointed at both ends; propelled with a paddle', 'name': 'canoe'}, {'frequency': 'r', 'id': 205, 'synset': 'cantaloup.n.02', 'synonyms': ['cantaloup', 'cantaloupe'], 'def': 'the fruit of a cantaloup vine; small to medium-sized melon with yellowish flesh', 'name': 'cantaloup'}, {'frequency': 'r', 'id': 206, 'synset': 'canteen.n.01', 'synonyms': ['canteen'], 'def': 'a flask for carrying water; used by soldiers or travelers', 'name': 'canteen'}, {'frequency': 'c', 'id': 207, 'synset': 'cap.n.01', 'synonyms': ['cap_(headwear)'], 'def': 'a tight-fitting headwear', 'name': 'cap_(headwear)'}, {'frequency': 'f', 'id': 208, 'synset': 'cap.n.02', 'synonyms': ['bottle_cap', 'cap_(container_lid)'], 'def': 'a top (as for a bottle)', 'name': 'bottle_cap'}, {'frequency': 'r', 'id': 209, 'synset': 'cape.n.02', 'synonyms': ['cape'], 'def': 'a sleeveless garment like a cloak but shorter', 'name': 'cape'}, {'frequency': 'c', 'id': 210, 'synset': 'cappuccino.n.01', 'synonyms': ['cappuccino', 'coffee_cappuccino'], 'def': 'equal parts of espresso and steamed milk', 'name': 'cappuccino'}, {'frequency': 'f', 'id': 211, 'synset': 'car.n.01', 'synonyms': ['car_(automobile)', 'auto_(automobile)', 'automobile'], 'def': 'a motor vehicle with four wheels', 'name': 'car_(automobile)'}, {'frequency': 'f', 'id': 212, 'synset': 'car.n.02', 'synonyms': ['railcar_(part_of_a_train)', 'railway_car_(part_of_a_train)', 'railroad_car_(part_of_a_train)'], 'def': 'a wheeled vehicle adapted to the rails of railroad', 'name': 'railcar_(part_of_a_train)'}, {'frequency': 'r', 'id': 213, 'synset': 'car.n.04', 'synonyms': ['elevator_car'], 'def': 'where passengers ride up and down', 'name': 'elevator_car'}, {'frequency': 'r', 'id': 214, 'synset': 'car_battery.n.01', 'synonyms': ['car_battery', 'automobile_battery'], 'def': 'a battery in a motor vehicle', 'name': 'car_battery'}, {'frequency': 'c', 'id': 215, 'synset': 'card.n.02', 'synonyms': ['identity_card'], 'def': 'a card certifying the identity of the bearer', 'name': 'identity_card'}, {'frequency': 'c', 'id': 216, 'synset': 'card.n.03', 'synonyms': ['card'], 'def': 'a rectangular piece of paper used to send messages (e.g. greetings or pictures)', 'name': 'card'}, {'frequency': 'r', 'id': 217, 'synset': 'cardigan.n.01', 'synonyms': ['cardigan'], 'def': 'knitted jacket that is fastened up the front with buttons or a zipper', 'name': 'cardigan'}, {'frequency': 'r', 'id': 218, 'synset': 'cargo_ship.n.01', 'synonyms': ['cargo_ship', 'cargo_vessel'], 'def': 'a ship designed to carry cargo', 'name': 'cargo_ship'}, {'frequency': 'r', 'id': 219, 'synset': 'carnation.n.01', 'synonyms': ['carnation'], 'def': 'plant with pink to purple-red spice-scented usually double flowers', 'name': 'carnation'}, {'frequency': 'c', 'id': 220, 'synset': 'carriage.n.02', 'synonyms': ['horse_carriage'], 'def': 'a vehicle with wheels drawn by one or more horses', 'name': 'horse_carriage'}, {'frequency': 'f', 'id': 221, 'synset': 'carrot.n.01', 'synonyms': ['carrot'], 'def': 'deep orange edible root of the cultivated carrot plant', 'name': 'carrot'}, {'frequency': 'c', 'id': 222, 'synset': 'carryall.n.01', 'synonyms': ['tote_bag'], 'def': 'a capacious bag or basket', 'name': 'tote_bag'}, {'frequency': 'c', 'id': 223, 'synset': 'cart.n.01', 'synonyms': ['cart'], 'def': 'a heavy open wagon usually having two wheels and drawn by an animal', 'name': 'cart'}, {'frequency': 'c', 'id': 224, 'synset': 'carton.n.02', 'synonyms': ['carton'], 'def': 'a box made of cardboard; opens by flaps on top', 'name': 'carton'}, {'frequency': 'c', 'id': 225, 'synset': 'cash_register.n.01', 'synonyms': ['cash_register', 'register_(for_cash_transactions)'], 'def': 'a cashbox with an adding machine to register transactions', 'name': 'cash_register'}, {'frequency': 'r', 'id': 226, 'synset': 'casserole.n.01', 'synonyms': ['casserole'], 'def': 'food cooked and served in a casserole', 'name': 'casserole'}, {'frequency': 'r', 'id': 227, 'synset': 'cassette.n.01', 'synonyms': ['cassette'], 'def': 'a container that holds a magnetic tape used for recording or playing sound or video', 'name': 'cassette'}, {'frequency': 'c', 'id': 228, 'synset': 'cast.n.05', 'synonyms': ['cast', 'plaster_cast', 'plaster_bandage'], 'def': 'bandage consisting of a firm covering that immobilizes broken bones while they heal', 'name': 'cast'}, {'frequency': 'f', 'id': 229, 'synset': 'cat.n.01', 'synonyms': ['cat'], 'def': 'a domestic house cat', 'name': 'cat'}, {'frequency': 'c', 'id': 230, 'synset': 'cauliflower.n.02', 'synonyms': ['cauliflower'], 'def': 'edible compact head of white undeveloped flowers', 'name': 'cauliflower'}, {'frequency': 'r', 'id': 231, 'synset': 'caviar.n.01', 'synonyms': ['caviar', 'caviare'], 'def': "salted roe of sturgeon or other large fish; usually served as an hors d'oeuvre", 'name': 'caviar'}, {'frequency': 'c', 'id': 232, 'synset': 'cayenne.n.02', 'synonyms': ['cayenne_(spice)', 'cayenne_pepper_(spice)', 'red_pepper_(spice)'], 'def': 'ground pods and seeds of pungent red peppers of the genus Capsicum', 'name': 'cayenne_(spice)'}, {'frequency': 'c', 'id': 233, 'synset': 'cd_player.n.01', 'synonyms': ['CD_player'], 'def': 'electronic equipment for playing compact discs (CDs)', 'name': 'CD_player'}, {'frequency': 'c', 'id': 234, 'synset': 'celery.n.01', 'synonyms': ['celery'], 'def': 'widely cultivated herb with aromatic leaf stalks that are eaten raw or cooked', 'name': 'celery'}, {'frequency': 'f', 'id': 235, 'synset': 'cellular_telephone.n.01', 'synonyms': ['cellular_telephone', 'cellular_phone', 'cellphone', 'mobile_phone', 'smart_phone'], 'def': 'a hand-held mobile telephone', 'name': 'cellular_telephone'}, {'frequency': 'r', 'id': 236, 'synset': 'chain_mail.n.01', 'synonyms': ['chain_mail', 'ring_mail', 'chain_armor', 'chain_armour', 'ring_armor', 'ring_armour'], 'def': '(Middle Ages) flexible armor made of interlinked metal rings', 'name': 'chain_mail'}, {'frequency': 'f', 'id': 237, 'synset': 'chair.n.01', 'synonyms': ['chair'], 'def': 'a seat for one person, with a support for the back', 'name': 'chair'}, {'frequency': 'r', 'id': 238, 'synset': 'chaise_longue.n.01', 'synonyms': ['chaise_longue', 'chaise', 'daybed'], 'def': 'a long chair; for reclining', 'name': 'chaise_longue'}, {'frequency': 'r', 'id': 239, 'synset': 'champagne.n.01', 'synonyms': ['champagne'], 'def': 'a white sparkling wine produced in Champagne or resembling that produced there', 'name': 'champagne'}, {'frequency': 'f', 'id': 240, 'synset': 'chandelier.n.01', 'synonyms': ['chandelier'], 'def': 'branched lighting fixture; often ornate; hangs from the ceiling', 'name': 'chandelier'}, {'frequency': 'r', 'id': 241, 'synset': 'chap.n.04', 'synonyms': ['chap'], 'def': 'leather leggings without a seat; worn over trousers by cowboys to protect their legs', 'name': 'chap'}, {'frequency': 'r', 'id': 242, 'synset': 'checkbook.n.01', 'synonyms': ['checkbook', 'chequebook'], 'def': 'a book issued to holders of checking accounts', 'name': 'checkbook'}, {'frequency': 'r', 'id': 243, 'synset': 'checkerboard.n.01', 'synonyms': ['checkerboard'], 'def': 'a board having 64 squares of two alternating colors', 'name': 'checkerboard'}, {'frequency': 'c', 'id': 244, 'synset': 'cherry.n.03', 'synonyms': ['cherry'], 'def': 'a red fruit with a single hard stone', 'name': 'cherry'}, {'frequency': 'r', 'id': 245, 'synset': 'chessboard.n.01', 'synonyms': ['chessboard'], 'def': 'a checkerboard used to play chess', 'name': 'chessboard'}, {'frequency': 'r', 'id': 246, 'synset': 'chest_of_drawers.n.01', 'synonyms': ['chest_of_drawers_(furniture)', 'bureau_(furniture)', 'chest_(furniture)'], 'def': 'furniture with drawers for keeping clothes', 'name': 'chest_of_drawers_(furniture)'}, {'frequency': 'c', 'id': 247, 'synset': 'chicken.n.02', 'synonyms': ['chicken_(animal)'], 'def': 'a domestic fowl bred for flesh or eggs', 'name': 'chicken_(animal)'}, {'frequency': 'c', 'id': 248, 'synset': 'chicken_wire.n.01', 'synonyms': ['chicken_wire'], 'def': 'a galvanized wire network with a hexagonal mesh; used to build fences', 'name': 'chicken_wire'}, {'frequency': 'r', 'id': 249, 'synset': 'chickpea.n.01', 'synonyms': ['chickpea', 'garbanzo'], 'def': 'the seed of the chickpea plant; usually dried', 'name': 'chickpea'}, {'frequency': 'r', 'id': 250, 'synset': 'chihuahua.n.03', 'synonyms': ['Chihuahua'], 'def': 'an old breed of tiny short-haired dog with protruding eyes from Mexico', 'name': 'Chihuahua'}, {'frequency': 'r', 'id': 251, 'synset': 'chili.n.02', 'synonyms': ['chili_(vegetable)', 'chili_pepper_(vegetable)', 'chilli_(vegetable)', 'chilly_(vegetable)', 'chile_(vegetable)'], 'def': 'very hot and finely tapering pepper of special pungency', 'name': 'chili_(vegetable)'}, {'frequency': 'r', 'id': 252, 'synset': 'chime.n.01', 'synonyms': ['chime', 'gong'], 'def': 'an instrument consisting of a set of bells that are struck with a hammer', 'name': 'chime'}, {'frequency': 'r', 'id': 253, 'synset': 'chinaware.n.01', 'synonyms': ['chinaware'], 'def': 'dishware made of high quality porcelain', 'name': 'chinaware'}, {'frequency': 'c', 'id': 254, 'synset': 'chip.n.04', 'synonyms': ['crisp_(potato_chip)', 'potato_chip'], 'def': 'a thin crisp slice of potato fried in deep fat', 'name': 'crisp_(potato_chip)'}, {'frequency': 'r', 'id': 255, 'synset': 'chip.n.06', 'synonyms': ['poker_chip'], 'def': 'a small disk-shaped counter used to represent money when gambling', 'name': 'poker_chip'}, {'frequency': 'c', 'id': 256, 'synset': 'chocolate_bar.n.01', 'synonyms': ['chocolate_bar'], 'def': 'a bar of chocolate candy', 'name': 'chocolate_bar'}, {'frequency': 'c', 'id': 257, 'synset': 'chocolate_cake.n.01', 'synonyms': ['chocolate_cake'], 'def': 'cake containing chocolate', 'name': 'chocolate_cake'}, {'frequency': 'r', 'id': 258, 'synset': 'chocolate_milk.n.01', 'synonyms': ['chocolate_milk'], 'def': 'milk flavored with chocolate syrup', 'name': 'chocolate_milk'}, {'frequency': 'r', 'id': 259, 'synset': 'chocolate_mousse.n.01', 'synonyms': ['chocolate_mousse'], 'def': 'dessert mousse made with chocolate', 'name': 'chocolate_mousse'}, {'frequency': 'f', 'id': 260, 'synset': 'choker.n.03', 'synonyms': ['choker', 'collar', 'neckband'], 'def': 'necklace that fits tightly around the neck', 'name': 'choker'}, {'frequency': 'f', 'id': 261, 'synset': 'chopping_board.n.01', 'synonyms': ['chopping_board', 'cutting_board', 'chopping_block'], 'def': 'a wooden board where meats or vegetables can be cut', 'name': 'chopping_board'}, {'frequency': 'c', 'id': 262, 'synset': 'chopstick.n.01', 'synonyms': ['chopstick'], 'def': 'one of a pair of slender sticks used as oriental tableware to eat food with', 'name': 'chopstick'}, {'frequency': 'f', 'id': 263, 'synset': 'christmas_tree.n.05', 'synonyms': ['Christmas_tree'], 'def': 'an ornamented evergreen used as a Christmas decoration', 'name': 'Christmas_tree'}, {'frequency': 'c', 'id': 264, 'synset': 'chute.n.02', 'synonyms': ['slide'], 'def': 'sloping channel through which things can descend', 'name': 'slide'}, {'frequency': 'r', 'id': 265, 'synset': 'cider.n.01', 'synonyms': ['cider', 'cyder'], 'def': 'a beverage made from juice pressed from apples', 'name': 'cider'}, {'frequency': 'r', 'id': 266, 'synset': 'cigar_box.n.01', 'synonyms': ['cigar_box'], 'def': 'a box for holding cigars', 'name': 'cigar_box'}, {'frequency': 'c', 'id': 267, 'synset': 'cigarette.n.01', 'synonyms': ['cigarette'], 'def': 'finely ground tobacco wrapped in paper; for smoking', 'name': 'cigarette'}, {'frequency': 'c', 'id': 268, 'synset': 'cigarette_case.n.01', 'synonyms': ['cigarette_case', 'cigarette_pack'], 'def': 'a small flat case for holding cigarettes', 'name': 'cigarette_case'}, {'frequency': 'f', 'id': 269, 'synset': 'cistern.n.02', 'synonyms': ['cistern', 'water_tank'], 'def': 'a tank that holds the water used to flush a toilet', 'name': 'cistern'}, {'frequency': 'r', 'id': 270, 'synset': 'clarinet.n.01', 'synonyms': ['clarinet'], 'def': 'a single-reed instrument with a straight tube', 'name': 'clarinet'}, {'frequency': 'r', 'id': 271, 'synset': 'clasp.n.01', 'synonyms': ['clasp'], 'def': 'a fastener (as a buckle or hook) that is used to hold two things together', 'name': 'clasp'}, {'frequency': 'c', 'id': 272, 'synset': 'cleansing_agent.n.01', 'synonyms': ['cleansing_agent', 'cleanser', 'cleaner'], 'def': 'a preparation used in cleaning something', 'name': 'cleansing_agent'}, {'frequency': 'r', 'id': 273, 'synset': 'clementine.n.01', 'synonyms': ['clementine'], 'def': 'a variety of mandarin orange', 'name': 'clementine'}, {'frequency': 'c', 'id': 274, 'synset': 'clip.n.03', 'synonyms': ['clip'], 'def': 'any of various small fasteners used to hold loose articles together', 'name': 'clip'}, {'frequency': 'c', 'id': 275, 'synset': 'clipboard.n.01', 'synonyms': ['clipboard'], 'def': 'a small writing board with a clip at the top for holding papers', 'name': 'clipboard'}, {'frequency': 'f', 'id': 276, 'synset': 'clock.n.01', 'synonyms': ['clock', 'timepiece', 'timekeeper'], 'def': 'a timepiece that shows the time of day', 'name': 'clock'}, {'frequency': 'f', 'id': 277, 'synset': 'clock_tower.n.01', 'synonyms': ['clock_tower'], 'def': 'a tower with a large clock visible high up on an outside face', 'name': 'clock_tower'}, {'frequency': 'c', 'id': 278, 'synset': 'clothes_hamper.n.01', 'synonyms': ['clothes_hamper', 'laundry_basket', 'clothes_basket'], 'def': 'a hamper that holds dirty clothes to be washed or wet clothes to be dried', 'name': 'clothes_hamper'}, {'frequency': 'c', 'id': 279, 'synset': 'clothespin.n.01', 'synonyms': ['clothespin', 'clothes_peg'], 'def': 'wood or plastic fastener; for holding clothes on a clothesline', 'name': 'clothespin'}, {'frequency': 'r', 'id': 280, 'synset': 'clutch_bag.n.01', 'synonyms': ['clutch_bag'], 'def': "a woman's strapless purse that is carried in the hand", 'name': 'clutch_bag'}, {'frequency': 'f', 'id': 281, 'synset': 'coaster.n.03', 'synonyms': ['coaster'], 'def': 'a covering (plate or mat) that protects the surface of a table', 'name': 'coaster'}, {'frequency': 'f', 'id': 282, 'synset': 'coat.n.01', 'synonyms': ['coat'], 'def': 'an outer garment that has sleeves and covers the body from shoulder down', 'name': 'coat'}, {'frequency': 'c', 'id': 283, 'synset': 'coat_hanger.n.01', 'synonyms': ['coat_hanger', 'clothes_hanger', 'dress_hanger'], 'def': "a hanger that is shaped like a person's shoulders", 'name': 'coat_hanger'}, {'frequency': 'r', 'id': 284, 'synset': 'coatrack.n.01', 'synonyms': ['coatrack', 'hatrack'], 'def': 'a rack with hooks for temporarily holding coats and hats', 'name': 'coatrack'}, {'frequency': 'c', 'id': 285, 'synset': 'cock.n.04', 'synonyms': ['cock', 'rooster'], 'def': 'adult male chicken', 'name': 'cock'}, {'frequency': 'c', 'id': 286, 'synset': 'coconut.n.02', 'synonyms': ['coconut', 'cocoanut'], 'def': 'large hard-shelled brown oval nut with a fibrous husk', 'name': 'coconut'}, {'frequency': 'r', 'id': 287, 'synset': 'coffee_filter.n.01', 'synonyms': ['coffee_filter'], 'def': 'filter (usually of paper) that passes the coffee and retains the coffee grounds', 'name': 'coffee_filter'}, {'frequency': 'f', 'id': 288, 'synset': 'coffee_maker.n.01', 'synonyms': ['coffee_maker', 'coffee_machine'], 'def': 'a kitchen appliance for brewing coffee automatically', 'name': 'coffee_maker'}, {'frequency': 'f', 'id': 289, 'synset': 'coffee_table.n.01', 'synonyms': ['coffee_table', 'cocktail_table'], 'def': 'low table where magazines can be placed and coffee or cocktails are served', 'name': 'coffee_table'}, {'frequency': 'c', 'id': 290, 'synset': 'coffeepot.n.01', 'synonyms': ['coffeepot'], 'def': 'tall pot in which coffee is brewed', 'name': 'coffeepot'}, {'frequency': 'r', 'id': 291, 'synset': 'coil.n.05', 'synonyms': ['coil'], 'def': 'tubing that is wound in a spiral', 'name': 'coil'}, {'frequency': 'c', 'id': 292, 'synset': 'coin.n.01', 'synonyms': ['coin'], 'def': 'a flat metal piece (usually a disc) used as money', 'name': 'coin'}, {'frequency': 'r', 'id': 293, 'synset': 'colander.n.01', 'synonyms': ['colander', 'cullender'], 'def': 'bowl-shaped strainer; used to wash or drain foods', 'name': 'colander'}, {'frequency': 'c', 'id': 294, 'synset': 'coleslaw.n.01', 'synonyms': ['coleslaw', 'slaw'], 'def': 'basically shredded cabbage', 'name': 'coleslaw'}, {'frequency': 'r', 'id': 295, 'synset': 'coloring_material.n.01', 'synonyms': ['coloring_material', 'colouring_material'], 'def': 'any material used for its color', 'name': 'coloring_material'}, {'frequency': 'r', 'id': 296, 'synset': 'combination_lock.n.01', 'synonyms': ['combination_lock'], 'def': 'lock that can be opened only by turning dials in a special sequence', 'name': 'combination_lock'}, {'frequency': 'c', 'id': 297, 'synset': 'comforter.n.04', 'synonyms': ['pacifier', 'teething_ring'], 'def': 'device used for an infant to suck or bite on', 'name': 'pacifier'}, {'frequency': 'r', 'id': 298, 'synset': 'comic_book.n.01', 'synonyms': ['comic_book'], 'def': 'a magazine devoted to comic strips', 'name': 'comic_book'}, {'frequency': 'f', 'id': 299, 'synset': 'computer_keyboard.n.01', 'synonyms': ['computer_keyboard', 'keyboard_(computer)'], 'def': 'a keyboard that is a data input device for computers', 'name': 'computer_keyboard'}, {'frequency': 'r', 'id': 300, 'synset': 'concrete_mixer.n.01', 'synonyms': ['concrete_mixer', 'cement_mixer'], 'def': 'a machine with a large revolving drum in which cement/concrete is mixed', 'name': 'concrete_mixer'}, {'frequency': 'f', 'id': 301, 'synset': 'cone.n.01', 'synonyms': ['cone', 'traffic_cone'], 'def': 'a cone-shaped object used to direct traffic', 'name': 'cone'}, {'frequency': 'f', 'id': 302, 'synset': 'control.n.09', 'synonyms': ['control', 'controller'], 'def': 'a mechanism that controls the operation of a machine', 'name': 'control'}, {'frequency': 'r', 'id': 303, 'synset': 'convertible.n.01', 'synonyms': ['convertible_(automobile)'], 'def': 'a car that has top that can be folded or removed', 'name': 'convertible_(automobile)'}, {'frequency': 'r', 'id': 304, 'synset': 'convertible.n.03', 'synonyms': ['sofa_bed'], 'def': 'a sofa that can be converted into a bed', 'name': 'sofa_bed'}, {'frequency': 'c', 'id': 305, 'synset': 'cookie.n.01', 'synonyms': ['cookie', 'cooky', 'biscuit_(cookie)'], 'def': "any of various small flat sweet cakes (`biscuit' is the British term)", 'name': 'cookie'}, {'frequency': 'r', 'id': 306, 'synset': 'cookie_jar.n.01', 'synonyms': ['cookie_jar', 'cooky_jar'], 'def': 'a jar in which cookies are kept (and sometimes money is hidden)', 'name': 'cookie_jar'}, {'frequency': 'r', 'id': 307, 'synset': 'cooking_utensil.n.01', 'synonyms': ['cooking_utensil'], 'def': 'a kitchen utensil made of material that does not melt easily; used for cooking', 'name': 'cooking_utensil'}, {'frequency': 'f', 'id': 308, 'synset': 'cooler.n.01', 'synonyms': ['cooler_(for_food)', 'ice_chest'], 'def': 'an insulated box for storing food often with ice', 'name': 'cooler_(for_food)'}, {'frequency': 'c', 'id': 309, 'synset': 'cork.n.04', 'synonyms': ['cork_(bottle_plug)', 'bottle_cork'], 'def': 'the plug in the mouth of a bottle (especially a wine bottle)', 'name': 'cork_(bottle_plug)'}, {'frequency': 'r', 'id': 310, 'synset': 'corkboard.n.01', 'synonyms': ['corkboard'], 'def': 'a sheet consisting of cork granules', 'name': 'corkboard'}, {'frequency': 'r', 'id': 311, 'synset': 'corkscrew.n.01', 'synonyms': ['corkscrew', 'bottle_screw'], 'def': 'a bottle opener that pulls corks', 'name': 'corkscrew'}, {'frequency': 'c', 'id': 312, 'synset': 'corn.n.03', 'synonyms': ['edible_corn', 'corn', 'maize'], 'def': 'ears of corn that can be prepared and served for human food', 'name': 'edible_corn'}, {'frequency': 'r', 'id': 313, 'synset': 'cornbread.n.01', 'synonyms': ['cornbread'], 'def': 'bread made primarily of cornmeal', 'name': 'cornbread'}, {'frequency': 'c', 'id': 314, 'synset': 'cornet.n.01', 'synonyms': ['cornet', 'horn', 'trumpet'], 'def': 'a brass musical instrument with a narrow tube and a flared bell and many valves', 'name': 'cornet'}, {'frequency': 'c', 'id': 315, 'synset': 'cornice.n.01', 'synonyms': ['cornice', 'valance', 'valance_board', 'pelmet'], 'def': 'a decorative framework to conceal curtain fixtures at the top of a window casing', 'name': 'cornice'}, {'frequency': 'r', 'id': 316, 'synset': 'cornmeal.n.01', 'synonyms': ['cornmeal'], 'def': 'coarsely ground corn', 'name': 'cornmeal'}, {'frequency': 'r', 'id': 317, 'synset': 'corset.n.01', 'synonyms': ['corset', 'girdle'], 'def': "a woman's close-fitting foundation garment", 'name': 'corset'}, {'frequency': 'r', 'id': 318, 'synset': 'cos.n.02', 'synonyms': ['romaine_lettuce'], 'def': 'lettuce with long dark-green leaves in a loosely packed elongated head', 'name': 'romaine_lettuce'}, {'frequency': 'c', 'id': 319, 'synset': 'costume.n.04', 'synonyms': ['costume'], 'def': 'the attire characteristic of a country or a time or a social class', 'name': 'costume'}, {'frequency': 'r', 'id': 320, 'synset': 'cougar.n.01', 'synonyms': ['cougar', 'puma', 'catamount', 'mountain_lion', 'panther'], 'def': 'large American feline resembling a lion', 'name': 'cougar'}, {'frequency': 'r', 'id': 321, 'synset': 'coverall.n.01', 'synonyms': ['coverall'], 'def': 'a loose-fitting protective garment that is worn over other clothing', 'name': 'coverall'}, {'frequency': 'r', 'id': 322, 'synset': 'cowbell.n.01', 'synonyms': ['cowbell'], 'def': 'a bell hung around the neck of cow so that the cow can be easily located', 'name': 'cowbell'}, {'frequency': 'f', 'id': 323, 'synset': 'cowboy_hat.n.01', 'synonyms': ['cowboy_hat', 'ten-gallon_hat'], 'def': 'a hat with a wide brim and a soft crown; worn by American ranch hands', 'name': 'cowboy_hat'}, {'frequency': 'r', 'id': 324, 'synset': 'crab.n.01', 'synonyms': ['crab_(animal)'], 'def': 'decapod having eyes on short stalks and a broad flattened shell and pincers', 'name': 'crab_(animal)'}, {'frequency': 'c', 'id': 325, 'synset': 'cracker.n.01', 'synonyms': ['cracker'], 'def': 'a thin crisp wafer', 'name': 'cracker'}, {'frequency': 'r', 'id': 326, 'synset': 'crape.n.01', 'synonyms': ['crape', 'crepe', 'French_pancake'], 'def': 'small very thin pancake', 'name': 'crape'}, {'frequency': 'f', 'id': 327, 'synset': 'crate.n.01', 'synonyms': ['crate'], 'def': 'a rugged box (usually made of wood); used for shipping', 'name': 'crate'}, {'frequency': 'r', 'id': 328, 'synset': 'crayon.n.01', 'synonyms': ['crayon', 'wax_crayon'], 'def': 'writing or drawing implement made of a colored stick of composition wax', 'name': 'crayon'}, {'frequency': 'r', 'id': 329, 'synset': 'cream_pitcher.n.01', 'synonyms': ['cream_pitcher'], 'def': 'a small pitcher for serving cream', 'name': 'cream_pitcher'}, {'frequency': 'r', 'id': 330, 'synset': 'credit_card.n.01', 'synonyms': ['credit_card', 'charge_card', 'debit_card'], 'def': 'a card, usually plastic, used to pay for goods and services', 'name': 'credit_card'}, {'frequency': 'c', 'id': 331, 'synset': 'crescent_roll.n.01', 'synonyms': ['crescent_roll', 'croissant'], 'def': 'very rich flaky crescent-shaped roll', 'name': 'crescent_roll'}, {'frequency': 'c', 'id': 332, 'synset': 'crib.n.01', 'synonyms': ['crib', 'cot'], 'def': 'baby bed with high sides made of slats', 'name': 'crib'}, {'frequency': 'c', 'id': 333, 'synset': 'crock.n.03', 'synonyms': ['crock_pot', 'earthenware_jar'], 'def': 'an earthen jar (made of baked clay)', 'name': 'crock_pot'}, {'frequency': 'f', 'id': 334, 'synset': 'crossbar.n.01', 'synonyms': ['crossbar'], 'def': 'a horizontal bar that goes across something', 'name': 'crossbar'}, {'frequency': 'r', 'id': 335, 'synset': 'crouton.n.01', 'synonyms': ['crouton'], 'def': 'a small piece of toasted or fried bread; served in soup or salads', 'name': 'crouton'}, {'frequency': 'r', 'id': 336, 'synset': 'crow.n.01', 'synonyms': ['crow'], 'def': 'black birds having a raucous call', 'name': 'crow'}, {'frequency': 'c', 'id': 337, 'synset': 'crown.n.04', 'synonyms': ['crown'], 'def': 'an ornamental jeweled headdress signifying sovereignty', 'name': 'crown'}, {'frequency': 'c', 'id': 338, 'synset': 'crucifix.n.01', 'synonyms': ['crucifix'], 'def': 'representation of the cross on which Jesus died', 'name': 'crucifix'}, {'frequency': 'c', 'id': 339, 'synset': 'cruise_ship.n.01', 'synonyms': ['cruise_ship', 'cruise_liner'], 'def': 'a passenger ship used commercially for pleasure cruises', 'name': 'cruise_ship'}, {'frequency': 'c', 'id': 340, 'synset': 'cruiser.n.01', 'synonyms': ['police_cruiser', 'patrol_car', 'police_car', 'squad_car'], 'def': 'a car in which policemen cruise the streets', 'name': 'police_cruiser'}, {'frequency': 'c', 'id': 341, 'synset': 'crumb.n.03', 'synonyms': ['crumb'], 'def': 'small piece of e.g. bread or cake', 'name': 'crumb'}, {'frequency': 'r', 'id': 342, 'synset': 'crutch.n.01', 'synonyms': ['crutch'], 'def': 'a wooden or metal staff that fits under the armpit and reaches to the ground', 'name': 'crutch'}, {'frequency': 'c', 'id': 343, 'synset': 'cub.n.03', 'synonyms': ['cub_(animal)'], 'def': 'the young of certain carnivorous mammals such as the bear or wolf or lion', 'name': 'cub_(animal)'}, {'frequency': 'r', 'id': 344, 'synset': 'cube.n.05', 'synonyms': ['cube', 'square_block'], 'def': 'a block in the (approximate) shape of a cube', 'name': 'cube'}, {'frequency': 'f', 'id': 345, 'synset': 'cucumber.n.02', 'synonyms': ['cucumber', 'cuke'], 'def': 'cylindrical green fruit with thin green rind and white flesh eaten as a vegetable', 'name': 'cucumber'}, {'frequency': 'c', 'id': 346, 'synset': 'cufflink.n.01', 'synonyms': ['cufflink'], 'def': 'jewelry consisting of linked buttons used to fasten the cuffs of a shirt', 'name': 'cufflink'}, {'frequency': 'f', 'id': 347, 'synset': 'cup.n.01', 'synonyms': ['cup'], 'def': 'a small open container usually used for drinking; usually has a handle', 'name': 'cup'}, {'frequency': 'c', 'id': 348, 'synset': 'cup.n.08', 'synonyms': ['trophy_cup'], 'def': 'a metal vessel with handles that is awarded as a trophy to a competition winner', 'name': 'trophy_cup'}, {'frequency': 'c', 'id': 349, 'synset': 'cupcake.n.01', 'synonyms': ['cupcake'], 'def': 'small cake baked in a muffin tin', 'name': 'cupcake'}, {'frequency': 'r', 'id': 350, 'synset': 'curler.n.01', 'synonyms': ['hair_curler', 'hair_roller', 'hair_crimper'], 'def': 'a cylindrical tube around which the hair is wound to curl it', 'name': 'hair_curler'}, {'frequency': 'r', 'id': 351, 'synset': 'curling_iron.n.01', 'synonyms': ['curling_iron'], 'def': 'a cylindrical home appliance that heats hair that has been curled around it', 'name': 'curling_iron'}, {'frequency': 'f', 'id': 352, 'synset': 'curtain.n.01', 'synonyms': ['curtain', 'drapery'], 'def': 'hanging cloth used as a blind (especially for a window)', 'name': 'curtain'}, {'frequency': 'f', 'id': 353, 'synset': 'cushion.n.03', 'synonyms': ['cushion'], 'def': 'a soft bag filled with air or padding such as feathers or foam rubber', 'name': 'cushion'}, {'frequency': 'r', 'id': 354, 'synset': 'custard.n.01', 'synonyms': ['custard'], 'def': 'sweetened mixture of milk and eggs baked or boiled or frozen', 'name': 'custard'}, {'frequency': 'c', 'id': 355, 'synset': 'cutter.n.06', 'synonyms': ['cutting_tool'], 'def': 'a cutting implement; a tool for cutting', 'name': 'cutting_tool'}, {'frequency': 'r', 'id': 356, 'synset': 'cylinder.n.04', 'synonyms': ['cylinder'], 'def': 'a cylindrical container', 'name': 'cylinder'}, {'frequency': 'r', 'id': 357, 'synset': 'cymbal.n.01', 'synonyms': ['cymbal'], 'def': 'a percussion instrument consisting of a concave brass disk', 'name': 'cymbal'}, {'frequency': 'r', 'id': 358, 'synset': 'dachshund.n.01', 'synonyms': ['dachshund', 'dachsie', 'badger_dog'], 'def': 'small long-bodied short-legged breed of dog having a short sleek coat and long drooping ears', 'name': 'dachshund'}, {'frequency': 'r', 'id': 359, 'synset': 'dagger.n.01', 'synonyms': ['dagger'], 'def': 'a short knife with a pointed blade used for piercing or stabbing', 'name': 'dagger'}, {'frequency': 'r', 'id': 360, 'synset': 'dartboard.n.01', 'synonyms': ['dartboard'], 'def': 'a circular board of wood or cork used as the target in the game of darts', 'name': 'dartboard'}, {'frequency': 'r', 'id': 361, 'synset': 'date.n.08', 'synonyms': ['date_(fruit)'], 'def': 'sweet edible fruit of the date palm with a single long woody seed', 'name': 'date_(fruit)'}, {'frequency': 'f', 'id': 362, 'synset': 'deck_chair.n.01', 'synonyms': ['deck_chair', 'beach_chair'], 'def': 'a folding chair for use outdoors; a wooden frame supports a length of canvas', 'name': 'deck_chair'}, {'frequency': 'c', 'id': 363, 'synset': 'deer.n.01', 'synonyms': ['deer', 'cervid'], 'def': "distinguished from Bovidae by the male's having solid deciduous antlers", 'name': 'deer'}, {'frequency': 'c', 'id': 364, 'synset': 'dental_floss.n.01', 'synonyms': ['dental_floss', 'floss'], 'def': 'a soft thread for cleaning the spaces between the teeth', 'name': 'dental_floss'}, {'frequency': 'f', 'id': 365, 'synset': 'desk.n.01', 'synonyms': ['desk'], 'def': 'a piece of furniture with a writing surface and usually drawers or other compartments', 'name': 'desk'}, {'frequency': 'r', 'id': 366, 'synset': 'detergent.n.01', 'synonyms': ['detergent'], 'def': 'a surface-active chemical widely used in industry and laundering', 'name': 'detergent'}, {'frequency': 'c', 'id': 367, 'synset': 'diaper.n.01', 'synonyms': ['diaper'], 'def': 'garment consisting of a folded cloth drawn up between the legs and fastened at the waist', 'name': 'diaper'}, {'frequency': 'r', 'id': 368, 'synset': 'diary.n.01', 'synonyms': ['diary', 'journal'], 'def': 'a daily written record of (usually personal) experiences and observations', 'name': 'diary'}, {'frequency': 'r', 'id': 369, 'synset': 'die.n.01', 'synonyms': ['die', 'dice'], 'def': 'a small cube with 1 to 6 spots on the six faces; used in gambling', 'name': 'die'}, {'frequency': 'r', 'id': 370, 'synset': 'dinghy.n.01', 'synonyms': ['dinghy', 'dory', 'rowboat'], 'def': 'a small boat of shallow draft with seats and oars with which it is propelled', 'name': 'dinghy'}, {'frequency': 'f', 'id': 371, 'synset': 'dining_table.n.01', 'synonyms': ['dining_table'], 'def': 'a table at which meals are served', 'name': 'dining_table'}, {'frequency': 'r', 'id': 372, 'synset': 'dinner_jacket.n.01', 'synonyms': ['tux', 'tuxedo'], 'def': 'semiformal evening dress for men', 'name': 'tux'}, {'frequency': 'c', 'id': 373, 'synset': 'dish.n.01', 'synonyms': ['dish'], 'def': 'a piece of dishware normally used as a container for holding or serving food', 'name': 'dish'}, {'frequency': 'c', 'id': 374, 'synset': 'dish.n.05', 'synonyms': ['dish_antenna'], 'def': 'directional antenna consisting of a parabolic reflector', 'name': 'dish_antenna'}, {'frequency': 'c', 'id': 375, 'synset': 'dishrag.n.01', 'synonyms': ['dishrag', 'dishcloth'], 'def': 'a cloth for washing dishes', 'name': 'dishrag'}, {'frequency': 'c', 'id': 376, 'synset': 'dishtowel.n.01', 'synonyms': ['dishtowel', 'tea_towel'], 'def': 'a towel for drying dishes', 'name': 'dishtowel'}, {'frequency': 'f', 'id': 377, 'synset': 'dishwasher.n.01', 'synonyms': ['dishwasher', 'dishwashing_machine'], 'def': 'a machine for washing dishes', 'name': 'dishwasher'}, {'frequency': 'r', 'id': 378, 'synset': 'dishwasher_detergent.n.01', 'synonyms': ['dishwasher_detergent', 'dishwashing_detergent', 'dishwashing_liquid'], 'def': 'a low-sudsing detergent designed for use in dishwashers', 'name': 'dishwasher_detergent'}, {'frequency': 'r', 'id': 379, 'synset': 'diskette.n.01', 'synonyms': ['diskette', 'floppy', 'floppy_disk'], 'def': 'a small plastic magnetic disk enclosed in a stiff envelope used to store data', 'name': 'diskette'}, {'frequency': 'c', 'id': 380, 'synset': 'dispenser.n.01', 'synonyms': ['dispenser'], 'def': 'a container so designed that the contents can be used in prescribed amounts', 'name': 'dispenser'}, {'frequency': 'c', 'id': 381, 'synset': 'dixie_cup.n.01', 'synonyms': ['Dixie_cup', 'paper_cup'], 'def': 'a disposable cup made of paper; for holding drinks', 'name': 'Dixie_cup'}, {'frequency': 'f', 'id': 382, 'synset': 'dog.n.01', 'synonyms': ['dog'], 'def': 'a common domesticated dog', 'name': 'dog'}, {'frequency': 'f', 'id': 383, 'synset': 'dog_collar.n.01', 'synonyms': ['dog_collar'], 'def': 'a collar for a dog', 'name': 'dog_collar'}, {'frequency': 'c', 'id': 384, 'synset': 'doll.n.01', 'synonyms': ['doll'], 'def': 'a toy replica of a HUMAN (NOT AN ANIMAL)', 'name': 'doll'}, {'frequency': 'r', 'id': 385, 'synset': 'dollar.n.02', 'synonyms': ['dollar', 'dollar_bill', 'one_dollar_bill'], 'def': 'a piece of paper money worth one dollar', 'name': 'dollar'}, {'frequency': 'r', 'id': 386, 'synset': 'dolphin.n.02', 'synonyms': ['dolphin'], 'def': 'any of various small toothed whales with a beaklike snout; larger than porpoises', 'name': 'dolphin'}, {'frequency': 'c', 'id': 387, 'synset': 'domestic_ass.n.01', 'synonyms': ['domestic_ass', 'donkey'], 'def': 'domestic beast of burden descended from the African wild ass; patient but stubborn', 'name': 'domestic_ass'}, {'frequency': 'r', 'id': 388, 'synset': 'domino.n.03', 'synonyms': ['eye_mask'], 'def': 'a mask covering the upper part of the face but with holes for the eyes', 'name': 'eye_mask'}, {'frequency': 'r', 'id': 389, 'synset': 'doorbell.n.01', 'synonyms': ['doorbell', 'buzzer'], 'def': 'a button at an outer door that gives a ringing or buzzing signal when pushed', 'name': 'doorbell'}, {'frequency': 'f', 'id': 390, 'synset': 'doorknob.n.01', 'synonyms': ['doorknob', 'doorhandle'], 'def': "a knob used to open a door (often called `doorhandle' in Great Britain)", 'name': 'doorknob'}, {'frequency': 'c', 'id': 391, 'synset': 'doormat.n.02', 'synonyms': ['doormat', 'welcome_mat'], 'def': 'a mat placed outside an exterior door for wiping the shoes before entering', 'name': 'doormat'}, {'frequency': 'f', 'id': 392, 'synset': 'doughnut.n.02', 'synonyms': ['doughnut', 'donut'], 'def': 'a small ring-shaped friedcake', 'name': 'doughnut'}, {'frequency': 'r', 'id': 393, 'synset': 'dove.n.01', 'synonyms': ['dove'], 'def': 'any of numerous small pigeons', 'name': 'dove'}, {'frequency': 'r', 'id': 394, 'synset': 'dragonfly.n.01', 'synonyms': ['dragonfly'], 'def': 'slender-bodied non-stinging insect having iridescent wings that are outspread at rest', 'name': 'dragonfly'}, {'frequency': 'f', 'id': 395, 'synset': 'drawer.n.01', 'synonyms': ['drawer'], 'def': 'a boxlike container in a piece of furniture; made so as to slide in and out', 'name': 'drawer'}, {'frequency': 'c', 'id': 396, 'synset': 'drawers.n.01', 'synonyms': ['underdrawers', 'boxers', 'boxershorts'], 'def': 'underpants worn by men', 'name': 'underdrawers'}, {'frequency': 'f', 'id': 397, 'synset': 'dress.n.01', 'synonyms': ['dress', 'frock'], 'def': 'a one-piece garment for a woman; has skirt and bodice', 'name': 'dress'}, {'frequency': 'c', 'id': 398, 'synset': 'dress_hat.n.01', 'synonyms': ['dress_hat', 'high_hat', 'opera_hat', 'silk_hat', 'top_hat'], 'def': "a man's hat with a tall crown; usually covered with silk or with beaver fur", 'name': 'dress_hat'}, {'frequency': 'c', 'id': 399, 'synset': 'dress_suit.n.01', 'synonyms': ['dress_suit'], 'def': 'formalwear consisting of full evening dress for men', 'name': 'dress_suit'}, {'frequency': 'c', 'id': 400, 'synset': 'dresser.n.05', 'synonyms': ['dresser'], 'def': 'a cabinet with shelves', 'name': 'dresser'}, {'frequency': 'c', 'id': 401, 'synset': 'drill.n.01', 'synonyms': ['drill'], 'def': 'a tool with a sharp rotating point for making holes in hard materials', 'name': 'drill'}, {'frequency': 'r', 'id': 402, 'synset': 'drinking_fountain.n.01', 'synonyms': ['drinking_fountain'], 'def': 'a public fountain to provide a jet of drinking water', 'name': 'drinking_fountain'}, {'frequency': 'r', 'id': 403, 'synset': 'drone.n.04', 'synonyms': ['drone'], 'def': 'an aircraft without a pilot that is operated by remote control', 'name': 'drone'}, {'frequency': 'r', 'id': 404, 'synset': 'dropper.n.01', 'synonyms': ['dropper', 'eye_dropper'], 'def': 'pipet consisting of a small tube with a vacuum bulb at one end for drawing liquid in and releasing it a drop at a time', 'name': 'dropper'}, {'frequency': 'c', 'id': 405, 'synset': 'drum.n.01', 'synonyms': ['drum_(musical_instrument)'], 'def': 'a musical percussion instrument; usually consists of a hollow cylinder with a membrane stretched across each end', 'name': 'drum_(musical_instrument)'}, {'frequency': 'r', 'id': 406, 'synset': 'drumstick.n.02', 'synonyms': ['drumstick'], 'def': 'a stick used for playing a drum', 'name': 'drumstick'}, {'frequency': 'f', 'id': 407, 'synset': 'duck.n.01', 'synonyms': ['duck'], 'def': 'small web-footed broad-billed swimming bird', 'name': 'duck'}, {'frequency': 'r', 'id': 408, 'synset': 'duckling.n.02', 'synonyms': ['duckling'], 'def': 'young duck', 'name': 'duckling'}, {'frequency': 'c', 'id': 409, 'synset': 'duct_tape.n.01', 'synonyms': ['duct_tape'], 'def': 'a wide silvery adhesive tape', 'name': 'duct_tape'}, {'frequency': 'f', 'id': 410, 'synset': 'duffel_bag.n.01', 'synonyms': ['duffel_bag', 'duffle_bag', 'duffel', 'duffle'], 'def': 'a large cylindrical bag of heavy cloth', 'name': 'duffel_bag'}, {'frequency': 'r', 'id': 411, 'synset': 'dumbbell.n.01', 'synonyms': ['dumbbell'], 'def': 'an exercising weight with two ball-like ends connected by a short handle', 'name': 'dumbbell'}, {'frequency': 'c', 'id': 412, 'synset': 'dumpster.n.01', 'synonyms': ['dumpster'], 'def': 'a container designed to receive and transport and dump waste', 'name': 'dumpster'}, {'frequency': 'r', 'id': 413, 'synset': 'dustpan.n.02', 'synonyms': ['dustpan'], 'def': 'a short-handled receptacle into which dust can be swept', 'name': 'dustpan'}, {'frequency': 'r', 'id': 414, 'synset': 'dutch_oven.n.02', 'synonyms': ['Dutch_oven'], 'def': 'iron or earthenware cooking pot; used for stews', 'name': 'Dutch_oven'}, {'frequency': 'c', 'id': 415, 'synset': 'eagle.n.01', 'synonyms': ['eagle'], 'def': 'large birds of prey noted for their broad wings and strong soaring flight', 'name': 'eagle'}, {'frequency': 'f', 'id': 416, 'synset': 'earphone.n.01', 'synonyms': ['earphone', 'earpiece', 'headphone'], 'def': 'device for listening to audio that is held over or inserted into the ear', 'name': 'earphone'}, {'frequency': 'r', 'id': 417, 'synset': 'earplug.n.01', 'synonyms': ['earplug'], 'def': 'a soft plug that is inserted into the ear canal to block sound', 'name': 'earplug'}, {'frequency': 'f', 'id': 418, 'synset': 'earring.n.01', 'synonyms': ['earring'], 'def': 'jewelry to ornament the ear', 'name': 'earring'}, {'frequency': 'c', 'id': 419, 'synset': 'easel.n.01', 'synonyms': ['easel'], 'def': "an upright tripod for displaying something (usually an artist's canvas)", 'name': 'easel'}, {'frequency': 'r', 'id': 420, 'synset': 'eclair.n.01', 'synonyms': ['eclair'], 'def': 'oblong cream puff', 'name': 'eclair'}, {'frequency': 'r', 'id': 421, 'synset': 'eel.n.01', 'synonyms': ['eel'], 'def': 'an elongate fish with fatty flesh', 'name': 'eel'}, {'frequency': 'f', 'id': 422, 'synset': 'egg.n.02', 'synonyms': ['egg', 'eggs'], 'def': 'oval reproductive body of a fowl (especially a hen) used as food', 'name': 'egg'}, {'frequency': 'r', 'id': 423, 'synset': 'egg_roll.n.01', 'synonyms': ['egg_roll', 'spring_roll'], 'def': 'minced vegetables and meat wrapped in a pancake and fried', 'name': 'egg_roll'}, {'frequency': 'c', 'id': 424, 'synset': 'egg_yolk.n.01', 'synonyms': ['egg_yolk', 'yolk_(egg)'], 'def': 'the yellow spherical part of an egg', 'name': 'egg_yolk'}, {'frequency': 'c', 'id': 425, 'synset': 'eggbeater.n.02', 'synonyms': ['eggbeater', 'eggwhisk'], 'def': 'a mixer for beating eggs or whipping cream', 'name': 'eggbeater'}, {'frequency': 'c', 'id': 426, 'synset': 'eggplant.n.01', 'synonyms': ['eggplant', 'aubergine'], 'def': 'egg-shaped vegetable having a shiny skin typically dark purple', 'name': 'eggplant'}, {'frequency': 'r', 'id': 427, 'synset': 'electric_chair.n.01', 'synonyms': ['electric_chair'], 'def': 'a chair-shaped instrument of execution by electrocution', 'name': 'electric_chair'}, {'frequency': 'f', 'id': 428, 'synset': 'electric_refrigerator.n.01', 'synonyms': ['refrigerator'], 'def': 'a refrigerator in which the coolant is pumped around by an electric motor', 'name': 'refrigerator'}, {'frequency': 'f', 'id': 429, 'synset': 'elephant.n.01', 'synonyms': ['elephant'], 'def': 'a common elephant', 'name': 'elephant'}, {'frequency': 'r', 'id': 430, 'synset': 'elk.n.01', 'synonyms': ['elk', 'moose'], 'def': 'large northern deer with enormous flattened antlers in the male', 'name': 'elk'}, {'frequency': 'c', 'id': 431, 'synset': 'envelope.n.01', 'synonyms': ['envelope'], 'def': 'a flat (usually rectangular) container for a letter, thin package, etc.', 'name': 'envelope'}, {'frequency': 'c', 'id': 432, 'synset': 'eraser.n.01', 'synonyms': ['eraser'], 'def': 'an implement used to erase something', 'name': 'eraser'}, {'frequency': 'r', 'id': 433, 'synset': 'escargot.n.01', 'synonyms': ['escargot'], 'def': 'edible snail usually served in the shell with a sauce of melted butter and garlic', 'name': 'escargot'}, {'frequency': 'r', 'id': 434, 'synset': 'eyepatch.n.01', 'synonyms': ['eyepatch'], 'def': 'a protective cloth covering for an injured eye', 'name': 'eyepatch'}, {'frequency': 'r', 'id': 435, 'synset': 'falcon.n.01', 'synonyms': ['falcon'], 'def': 'birds of prey having long pointed powerful wings adapted for swift flight', 'name': 'falcon'}, {'frequency': 'f', 'id': 436, 'synset': 'fan.n.01', 'synonyms': ['fan'], 'def': 'a device for creating a current of air by movement of a surface or surfaces', 'name': 'fan'}, {'frequency': 'f', 'id': 437, 'synset': 'faucet.n.01', 'synonyms': ['faucet', 'spigot', 'tap'], 'def': 'a regulator for controlling the flow of a liquid from a reservoir', 'name': 'faucet'}, {'frequency': 'r', 'id': 438, 'synset': 'fedora.n.01', 'synonyms': ['fedora'], 'def': 'a hat made of felt with a creased crown', 'name': 'fedora'}, {'frequency': 'r', 'id': 439, 'synset': 'ferret.n.02', 'synonyms': ['ferret'], 'def': 'domesticated albino variety of the European polecat bred for hunting rats and rabbits', 'name': 'ferret'}, {'frequency': 'c', 'id': 440, 'synset': 'ferris_wheel.n.01', 'synonyms': ['Ferris_wheel'], 'def': 'a large wheel with suspended seats that remain upright as the wheel rotates', 'name': 'Ferris_wheel'}, {'frequency': 'r', 'id': 441, 'synset': 'ferry.n.01', 'synonyms': ['ferry', 'ferryboat'], 'def': 'a boat that transports people or vehicles across a body of water and operates on a regular schedule', 'name': 'ferry'}, {'frequency': 'r', 'id': 442, 'synset': 'fig.n.04', 'synonyms': ['fig_(fruit)'], 'def': 'fleshy sweet pear-shaped yellowish or purple fruit eaten fresh or preserved or dried', 'name': 'fig_(fruit)'}, {'frequency': 'c', 'id': 443, 'synset': 'fighter.n.02', 'synonyms': ['fighter_jet', 'fighter_aircraft', 'attack_aircraft'], 'def': 'a high-speed military or naval airplane designed to destroy enemy targets', 'name': 'fighter_jet'}, {'frequency': 'f', 'id': 444, 'synset': 'figurine.n.01', 'synonyms': ['figurine'], 'def': 'a small carved or molded figure', 'name': 'figurine'}, {'frequency': 'c', 'id': 445, 'synset': 'file.n.03', 'synonyms': ['file_cabinet', 'filing_cabinet'], 'def': 'office furniture consisting of a container for keeping papers in order', 'name': 'file_cabinet'}, {'frequency': 'r', 'id': 446, 'synset': 'file.n.04', 'synonyms': ['file_(tool)'], 'def': 'a steel hand tool with small sharp teeth on some or all of its surfaces; used for smoothing wood or metal', 'name': 'file_(tool)'}, {'frequency': 'f', 'id': 447, 'synset': 'fire_alarm.n.02', 'synonyms': ['fire_alarm', 'smoke_alarm'], 'def': 'an alarm that is tripped off by fire or smoke', 'name': 'fire_alarm'}, {'frequency': 'c', 'id': 448, 'synset': 'fire_engine.n.01', 'synonyms': ['fire_engine', 'fire_truck'], 'def': 'large trucks that carry firefighters and equipment to the site of a fire', 'name': 'fire_engine'}, {'frequency': 'c', 'id': 449, 'synset': 'fire_extinguisher.n.01', 'synonyms': ['fire_extinguisher', 'extinguisher'], 'def': 'a manually operated device for extinguishing small fires', 'name': 'fire_extinguisher'}, {'frequency': 'c', 'id': 450, 'synset': 'fire_hose.n.01', 'synonyms': ['fire_hose'], 'def': 'a large hose that carries water from a fire hydrant to the site of the fire', 'name': 'fire_hose'}, {'frequency': 'f', 'id': 451, 'synset': 'fireplace.n.01', 'synonyms': ['fireplace'], 'def': 'an open recess in a wall at the base of a chimney where a fire can be built', 'name': 'fireplace'}, {'frequency': 'f', 'id': 452, 'synset': 'fireplug.n.01', 'synonyms': ['fireplug', 'fire_hydrant', 'hydrant'], 'def': 'an upright hydrant for drawing water to use in fighting a fire', 'name': 'fireplug'}, {'frequency': 'c', 'id': 453, 'synset': 'fish.n.01', 'synonyms': ['fish'], 'def': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills', 'name': 'fish'}, {'frequency': 'r', 'id': 454, 'synset': 'fish.n.02', 'synonyms': ['fish_(food)'], 'def': 'the flesh of fish used as food', 'name': 'fish_(food)'}, {'frequency': 'r', 'id': 455, 'synset': 'fishbowl.n.02', 'synonyms': ['fishbowl', 'goldfish_bowl'], 'def': 'a transparent bowl in which small fish are kept', 'name': 'fishbowl'}, {'frequency': 'r', 'id': 456, 'synset': 'fishing_boat.n.01', 'synonyms': ['fishing_boat', 'fishing_vessel'], 'def': 'a vessel for fishing', 'name': 'fishing_boat'}, {'frequency': 'c', 'id': 457, 'synset': 'fishing_rod.n.01', 'synonyms': ['fishing_rod', 'fishing_pole'], 'def': 'a rod that is used in fishing to extend the fishing line', 'name': 'fishing_rod'}, {'frequency': 'f', 'id': 458, 'synset': 'flag.n.01', 'synonyms': ['flag'], 'def': 'emblem usually consisting of a rectangular piece of cloth of distinctive design (do not include pole)', 'name': 'flag'}, {'frequency': 'f', 'id': 459, 'synset': 'flagpole.n.02', 'synonyms': ['flagpole', 'flagstaff'], 'def': 'a tall staff or pole on which a flag is raised', 'name': 'flagpole'}, {'frequency': 'c', 'id': 460, 'synset': 'flamingo.n.01', 'synonyms': ['flamingo'], 'def': 'large pink web-footed bird with down-bent bill', 'name': 'flamingo'}, {'frequency': 'c', 'id': 461, 'synset': 'flannel.n.01', 'synonyms': ['flannel'], 'def': 'a soft light woolen fabric; used for clothing', 'name': 'flannel'}, {'frequency': 'r', 'id': 462, 'synset': 'flash.n.10', 'synonyms': ['flash', 'flashbulb'], 'def': 'a lamp for providing momentary light to take a photograph', 'name': 'flash'}, {'frequency': 'c', 'id': 463, 'synset': 'flashlight.n.01', 'synonyms': ['flashlight', 'torch'], 'def': 'a small portable battery-powered electric lamp', 'name': 'flashlight'}, {'frequency': 'r', 'id': 464, 'synset': 'fleece.n.03', 'synonyms': ['fleece'], 'def': 'a soft bulky fabric with deep pile; used chiefly for clothing', 'name': 'fleece'}, {'frequency': 'f', 'id': 465, 'synset': 'flip-flop.n.02', 'synonyms': ['flip-flop_(sandal)'], 'def': 'a backless sandal held to the foot by a thong between two toes', 'name': 'flip-flop_(sandal)'}, {'frequency': 'c', 'id': 466, 'synset': 'flipper.n.01', 'synonyms': ['flipper_(footwear)', 'fin_(footwear)'], 'def': 'a shoe to aid a person in swimming', 'name': 'flipper_(footwear)'}, {'frequency': 'f', 'id': 467, 'synset': 'flower_arrangement.n.01', 'synonyms': ['flower_arrangement', 'floral_arrangement'], 'def': 'a decorative arrangement of flowers', 'name': 'flower_arrangement'}, {'frequency': 'c', 'id': 468, 'synset': 'flute.n.02', 'synonyms': ['flute_glass', 'champagne_flute'], 'def': 'a tall narrow wineglass', 'name': 'flute_glass'}, {'frequency': 'r', 'id': 469, 'synset': 'foal.n.01', 'synonyms': ['foal'], 'def': 'a young horse', 'name': 'foal'}, {'frequency': 'c', 'id': 470, 'synset': 'folding_chair.n.01', 'synonyms': ['folding_chair'], 'def': 'a chair that can be folded flat for storage', 'name': 'folding_chair'}, {'frequency': 'c', 'id': 471, 'synset': 'food_processor.n.01', 'synonyms': ['food_processor'], 'def': 'a kitchen appliance for shredding, blending, chopping, or slicing food', 'name': 'food_processor'}, {'frequency': 'c', 'id': 472, 'synset': 'football.n.02', 'synonyms': ['football_(American)'], 'def': 'the inflated oblong ball used in playing American football', 'name': 'football_(American)'}, {'frequency': 'r', 'id': 473, 'synset': 'football_helmet.n.01', 'synonyms': ['football_helmet'], 'def': 'a padded helmet with a face mask to protect the head of football players', 'name': 'football_helmet'}, {'frequency': 'c', 'id': 474, 'synset': 'footstool.n.01', 'synonyms': ['footstool', 'footrest'], 'def': 'a low seat or a stool to rest the feet of a seated person', 'name': 'footstool'}, {'frequency': 'f', 'id': 475, 'synset': 'fork.n.01', 'synonyms': ['fork'], 'def': 'cutlery used for serving and eating food', 'name': 'fork'}, {'frequency': 'r', 'id': 476, 'synset': 'forklift.n.01', 'synonyms': ['forklift'], 'def': 'an industrial vehicle with a power operated fork in front that can be inserted under loads to lift and move them', 'name': 'forklift'}, {'frequency': 'r', 'id': 477, 'synset': 'freight_car.n.01', 'synonyms': ['freight_car'], 'def': 'a railway car that carries freight', 'name': 'freight_car'}, {'frequency': 'r', 'id': 478, 'synset': 'french_toast.n.01', 'synonyms': ['French_toast'], 'def': 'bread slice dipped in egg and milk and fried', 'name': 'French_toast'}, {'frequency': 'c', 'id': 479, 'synset': 'freshener.n.01', 'synonyms': ['freshener', 'air_freshener'], 'def': 'anything that freshens', 'name': 'freshener'}, {'frequency': 'f', 'id': 480, 'synset': 'frisbee.n.01', 'synonyms': ['frisbee'], 'def': 'a light, plastic disk propelled with a flip of the wrist for recreation or competition', 'name': 'frisbee'}, {'frequency': 'c', 'id': 481, 'synset': 'frog.n.01', 'synonyms': ['frog', 'toad', 'toad_frog'], 'def': 'a tailless stout-bodied amphibians with long hind limbs for leaping', 'name': 'frog'}, {'frequency': 'c', 'id': 482, 'synset': 'fruit_juice.n.01', 'synonyms': ['fruit_juice'], 'def': 'drink produced by squeezing or crushing fruit', 'name': 'fruit_juice'}, {'frequency': 'r', 'id': 483, 'synset': 'fruit_salad.n.01', 'synonyms': ['fruit_salad'], 'def': 'salad composed of fruits', 'name': 'fruit_salad'}, {'frequency': 'c', 'id': 484, 'synset': 'frying_pan.n.01', 'synonyms': ['frying_pan', 'frypan', 'skillet'], 'def': 'a pan used for frying foods', 'name': 'frying_pan'}, {'frequency': 'r', 'id': 485, 'synset': 'fudge.n.01', 'synonyms': ['fudge'], 'def': 'soft creamy candy', 'name': 'fudge'}, {'frequency': 'r', 'id': 486, 'synset': 'funnel.n.02', 'synonyms': ['funnel'], 'def': 'a cone-shaped utensil used to channel a substance into a container with a small mouth', 'name': 'funnel'}, {'frequency': 'c', 'id': 487, 'synset': 'futon.n.01', 'synonyms': ['futon'], 'def': 'a pad that is used for sleeping on the floor or on a raised frame', 'name': 'futon'}, {'frequency': 'r', 'id': 488, 'synset': 'gag.n.02', 'synonyms': ['gag', 'muzzle'], 'def': "restraint put into a person's mouth to prevent speaking or shouting", 'name': 'gag'}, {'frequency': 'r', 'id': 489, 'synset': 'garbage.n.03', 'synonyms': ['garbage'], 'def': 'a receptacle where waste can be discarded', 'name': 'garbage'}, {'frequency': 'c', 'id': 490, 'synset': 'garbage_truck.n.01', 'synonyms': ['garbage_truck'], 'def': 'a truck for collecting domestic refuse', 'name': 'garbage_truck'}, {'frequency': 'c', 'id': 491, 'synset': 'garden_hose.n.01', 'synonyms': ['garden_hose'], 'def': 'a hose used for watering a lawn or garden', 'name': 'garden_hose'}, {'frequency': 'c', 'id': 492, 'synset': 'gargle.n.01', 'synonyms': ['gargle', 'mouthwash'], 'def': 'a medicated solution used for gargling and rinsing the mouth', 'name': 'gargle'}, {'frequency': 'r', 'id': 493, 'synset': 'gargoyle.n.02', 'synonyms': ['gargoyle'], 'def': 'an ornament consisting of a grotesquely carved figure of a person or animal', 'name': 'gargoyle'}, {'frequency': 'c', 'id': 494, 'synset': 'garlic.n.02', 'synonyms': ['garlic', 'ail'], 'def': 'aromatic bulb used as seasoning', 'name': 'garlic'}, {'frequency': 'r', 'id': 495, 'synset': 'gasmask.n.01', 'synonyms': ['gasmask', 'respirator', 'gas_helmet'], 'def': 'a protective face mask with a filter', 'name': 'gasmask'}, {'frequency': 'r', 'id': 496, 'synset': 'gazelle.n.01', 'synonyms': ['gazelle'], 'def': 'small swift graceful antelope of Africa and Asia having lustrous eyes', 'name': 'gazelle'}, {'frequency': 'c', 'id': 497, 'synset': 'gelatin.n.02', 'synonyms': ['gelatin', 'jelly'], 'def': 'an edible jelly made with gelatin and used as a dessert or salad base or a coating for foods', 'name': 'gelatin'}, {'frequency': 'r', 'id': 498, 'synset': 'gem.n.02', 'synonyms': ['gemstone'], 'def': 'a crystalline rock that can be cut and polished for jewelry', 'name': 'gemstone'}, {'frequency': 'c', 'id': 499, 'synset': 'giant_panda.n.01', 'synonyms': ['giant_panda', 'panda', 'panda_bear'], 'def': 'large black-and-white herbivorous mammal of bamboo forests of China and Tibet', 'name': 'giant_panda'}, {'frequency': 'c', 'id': 500, 'synset': 'gift_wrap.n.01', 'synonyms': ['gift_wrap'], 'def': 'attractive wrapping paper suitable for wrapping gifts', 'name': 'gift_wrap'}, {'frequency': 'c', 'id': 501, 'synset': 'ginger.n.03', 'synonyms': ['ginger', 'gingerroot'], 'def': 'the root of the common ginger plant; used fresh as a seasoning', 'name': 'ginger'}, {'frequency': 'f', 'id': 502, 'synset': 'giraffe.n.01', 'synonyms': ['giraffe'], 'def': 'tall animal having a spotted coat and small horns and very long neck and legs', 'name': 'giraffe'}, {'frequency': 'c', 'id': 503, 'synset': 'girdle.n.02', 'synonyms': ['cincture', 'sash', 'waistband', 'waistcloth'], 'def': 'a band of material around the waist that strengthens a skirt or trousers', 'name': 'cincture'}, {'frequency': 'f', 'id': 504, 'synset': 'glass.n.02', 'synonyms': ['glass_(drink_container)', 'drinking_glass'], 'def': 'a container for holding liquids while drinking', 'name': 'glass_(drink_container)'}, {'frequency': 'c', 'id': 505, 'synset': 'globe.n.03', 'synonyms': ['globe'], 'def': 'a sphere on which a map (especially of the earth) is represented', 'name': 'globe'}, {'frequency': 'f', 'id': 506, 'synset': 'glove.n.02', 'synonyms': ['glove'], 'def': 'handwear covering the hand', 'name': 'glove'}, {'frequency': 'c', 'id': 507, 'synset': 'goat.n.01', 'synonyms': ['goat'], 'def': 'a common goat', 'name': 'goat'}, {'frequency': 'f', 'id': 508, 'synset': 'goggles.n.01', 'synonyms': ['goggles'], 'def': 'tight-fitting spectacles worn to protect the eyes', 'name': 'goggles'}, {'frequency': 'r', 'id': 509, 'synset': 'goldfish.n.01', 'synonyms': ['goldfish'], 'def': 'small golden or orange-red freshwater fishes used as pond or aquarium pets', 'name': 'goldfish'}, {'frequency': 'r', 'id': 510, 'synset': 'golf_club.n.02', 'synonyms': ['golf_club', 'golf-club'], 'def': 'golf equipment used by a golfer to hit a golf ball', 'name': 'golf_club'}, {'frequency': 'c', 'id': 511, 'synset': 'golfcart.n.01', 'synonyms': ['golfcart'], 'def': 'a small motor vehicle in which golfers can ride between shots', 'name': 'golfcart'}, {'frequency': 'r', 'id': 512, 'synset': 'gondola.n.02', 'synonyms': ['gondola_(boat)'], 'def': 'long narrow flat-bottomed boat propelled by sculling; traditionally used on canals of Venice', 'name': 'gondola_(boat)'}, {'frequency': 'c', 'id': 513, 'synset': 'goose.n.01', 'synonyms': ['goose'], 'def': 'loud, web-footed long-necked aquatic birds usually larger than ducks', 'name': 'goose'}, {'frequency': 'r', 'id': 514, 'synset': 'gorilla.n.01', 'synonyms': ['gorilla'], 'def': 'largest ape', 'name': 'gorilla'}, {'frequency': 'r', 'id': 515, 'synset': 'gourd.n.02', 'synonyms': ['gourd'], 'def': 'any of numerous inedible fruits with hard rinds', 'name': 'gourd'}, {'frequency': 'r', 'id': 516, 'synset': 'gown.n.04', 'synonyms': ['surgical_gown', 'scrubs_(surgical_clothing)'], 'def': 'protective garment worn by surgeons during operations', 'name': 'surgical_gown'}, {'frequency': 'f', 'id': 517, 'synset': 'grape.n.01', 'synonyms': ['grape'], 'def': 'any of various juicy fruit with green or purple skins; grow in clusters', 'name': 'grape'}, {'frequency': 'r', 'id': 518, 'synset': 'grasshopper.n.01', 'synonyms': ['grasshopper'], 'def': 'plant-eating insect with hind legs adapted for leaping', 'name': 'grasshopper'}, {'frequency': 'c', 'id': 519, 'synset': 'grater.n.01', 'synonyms': ['grater'], 'def': 'utensil with sharp perforations for shredding foods (as vegetables or cheese)', 'name': 'grater'}, {'frequency': 'c', 'id': 520, 'synset': 'gravestone.n.01', 'synonyms': ['gravestone', 'headstone', 'tombstone'], 'def': 'a stone that is used to mark a grave', 'name': 'gravestone'}, {'frequency': 'r', 'id': 521, 'synset': 'gravy_boat.n.01', 'synonyms': ['gravy_boat', 'gravy_holder'], 'def': 'a dish (often boat-shaped) for serving gravy or sauce', 'name': 'gravy_boat'}, {'frequency': 'c', 'id': 522, 'synset': 'green_bean.n.02', 'synonyms': ['green_bean'], 'def': 'a common bean plant cultivated for its slender green edible pods', 'name': 'green_bean'}, {'frequency': 'c', 'id': 523, 'synset': 'green_onion.n.01', 'synonyms': ['green_onion', 'spring_onion', 'scallion'], 'def': 'a young onion before the bulb has enlarged', 'name': 'green_onion'}, {'frequency': 'r', 'id': 524, 'synset': 'griddle.n.01', 'synonyms': ['griddle'], 'def': 'cooking utensil consisting of a flat heated surface on which food is cooked', 'name': 'griddle'}, {'frequency': 'r', 'id': 525, 'synset': 'grillroom.n.01', 'synonyms': ['grillroom', 'grill_(restaurant)'], 'def': 'a restaurant where food is cooked on a grill', 'name': 'grillroom'}, {'frequency': 'r', 'id': 526, 'synset': 'grinder.n.04', 'synonyms': ['grinder_(tool)'], 'def': 'a machine tool that polishes metal', 'name': 'grinder_(tool)'}, {'frequency': 'r', 'id': 527, 'synset': 'grits.n.01', 'synonyms': ['grits', 'hominy_grits'], 'def': 'coarsely ground corn boiled as a breakfast dish', 'name': 'grits'}, {'frequency': 'c', 'id': 528, 'synset': 'grizzly.n.01', 'synonyms': ['grizzly', 'grizzly_bear'], 'def': 'powerful brownish-yellow bear of the uplands of western North America', 'name': 'grizzly'}, {'frequency': 'c', 'id': 529, 'synset': 'grocery_bag.n.01', 'synonyms': ['grocery_bag'], 'def': "a sack for holding customer's groceries", 'name': 'grocery_bag'}, {'frequency': 'r', 'id': 530, 'synset': 'guacamole.n.01', 'synonyms': ['guacamole'], 'def': 'a dip made of mashed avocado mixed with chopped onions and other seasonings', 'name': 'guacamole'}, {'frequency': 'f', 'id': 531, 'synset': 'guitar.n.01', 'synonyms': ['guitar'], 'def': 'a stringed instrument usually having six strings; played by strumming or plucking', 'name': 'guitar'}, {'frequency': 'c', 'id': 532, 'synset': 'gull.n.02', 'synonyms': ['gull', 'seagull'], 'def': 'mostly white aquatic bird having long pointed wings and short legs', 'name': 'gull'}, {'frequency': 'c', 'id': 533, 'synset': 'gun.n.01', 'synonyms': ['gun'], 'def': 'a weapon that discharges a bullet at high velocity from a metal tube', 'name': 'gun'}, {'frequency': 'r', 'id': 534, 'synset': 'hair_spray.n.01', 'synonyms': ['hair_spray'], 'def': 'substance sprayed on the hair to hold it in place', 'name': 'hair_spray'}, {'frequency': 'c', 'id': 535, 'synset': 'hairbrush.n.01', 'synonyms': ['hairbrush'], 'def': "a brush used to groom a person's hair", 'name': 'hairbrush'}, {'frequency': 'c', 'id': 536, 'synset': 'hairnet.n.01', 'synonyms': ['hairnet'], 'def': 'a small net that someone wears over their hair to keep it in place', 'name': 'hairnet'}, {'frequency': 'c', 'id': 537, 'synset': 'hairpin.n.01', 'synonyms': ['hairpin'], 'def': "a double pronged pin used to hold women's hair in place", 'name': 'hairpin'}, {'frequency': 'f', 'id': 538, 'synset': 'ham.n.01', 'synonyms': ['ham', 'jambon', 'gammon'], 'def': 'meat cut from the thigh of a hog (usually smoked)', 'name': 'ham'}, {'frequency': 'c', 'id': 539, 'synset': 'hamburger.n.01', 'synonyms': ['hamburger', 'beefburger', 'burger'], 'def': 'a sandwich consisting of a patty of minced beef served on a bun', 'name': 'hamburger'}, {'frequency': 'c', 'id': 540, 'synset': 'hammer.n.02', 'synonyms': ['hammer'], 'def': 'a hand tool with a heavy head and a handle; used to deliver an impulsive force by striking', 'name': 'hammer'}, {'frequency': 'r', 'id': 541, 'synset': 'hammock.n.02', 'synonyms': ['hammock'], 'def': 'a hanging bed of canvas or rope netting (usually suspended between two trees)', 'name': 'hammock'}, {'frequency': 'r', 'id': 542, 'synset': 'hamper.n.02', 'synonyms': ['hamper'], 'def': 'a basket usually with a cover', 'name': 'hamper'}, {'frequency': 'r', 'id': 543, 'synset': 'hamster.n.01', 'synonyms': ['hamster'], 'def': 'short-tailed burrowing rodent with large cheek pouches', 'name': 'hamster'}, {'frequency': 'c', 'id': 544, 'synset': 'hand_blower.n.01', 'synonyms': ['hair_dryer'], 'def': 'a hand-held electric blower that can blow warm air onto the hair', 'name': 'hair_dryer'}, {'frequency': 'r', 'id': 545, 'synset': 'hand_glass.n.01', 'synonyms': ['hand_glass', 'hand_mirror'], 'def': 'a mirror intended to be held in the hand', 'name': 'hand_glass'}, {'frequency': 'f', 'id': 546, 'synset': 'hand_towel.n.01', 'synonyms': ['hand_towel', 'face_towel'], 'def': 'a small towel used to dry the hands or face', 'name': 'hand_towel'}, {'frequency': 'c', 'id': 547, 'synset': 'handcart.n.01', 'synonyms': ['handcart', 'pushcart', 'hand_truck'], 'def': 'wheeled vehicle that can be pushed by a person', 'name': 'handcart'}, {'frequency': 'r', 'id': 548, 'synset': 'handcuff.n.01', 'synonyms': ['handcuff'], 'def': 'shackle that consists of a metal loop that can be locked around the wrist', 'name': 'handcuff'}, {'frequency': 'c', 'id': 549, 'synset': 'handkerchief.n.01', 'synonyms': ['handkerchief'], 'def': 'a square piece of cloth used for wiping the eyes or nose or as a costume accessory', 'name': 'handkerchief'}, {'frequency': 'f', 'id': 550, 'synset': 'handle.n.01', 'synonyms': ['handle', 'grip', 'handgrip'], 'def': 'the appendage to an object that is designed to be held in order to use or move it', 'name': 'handle'}, {'frequency': 'r', 'id': 551, 'synset': 'handsaw.n.01', 'synonyms': ['handsaw', "carpenter's_saw"], 'def': 'a saw used with one hand for cutting wood', 'name': 'handsaw'}, {'frequency': 'r', 'id': 552, 'synset': 'hardback.n.01', 'synonyms': ['hardback_book', 'hardcover_book'], 'def': 'a book with cardboard or cloth or leather covers', 'name': 'hardback_book'}, {'frequency': 'r', 'id': 553, 'synset': 'harmonium.n.01', 'synonyms': ['harmonium', 'organ_(musical_instrument)', 'reed_organ_(musical_instrument)'], 'def': 'a free-reed instrument in which air is forced through the reeds by bellows', 'name': 'harmonium'}, {'frequency': 'f', 'id': 554, 'synset': 'hat.n.01', 'synonyms': ['hat'], 'def': 'headwear that protects the head from bad weather, sun, or worn for fashion', 'name': 'hat'}, {'frequency': 'r', 'id': 555, 'synset': 'hatbox.n.01', 'synonyms': ['hatbox'], 'def': 'a round piece of luggage for carrying hats', 'name': 'hatbox'}, {'frequency': 'r', 'id': 556, 'synset': 'hatch.n.03', 'synonyms': ['hatch'], 'def': 'a movable barrier covering a hatchway', 'name': 'hatch'}, {'frequency': 'c', 'id': 557, 'synset': 'head_covering.n.01', 'synonyms': ['veil'], 'def': 'a garment that covers the head and face', 'name': 'veil'}, {'frequency': 'f', 'id': 558, 'synset': 'headband.n.01', 'synonyms': ['headband'], 'def': 'a band worn around or over the head', 'name': 'headband'}, {'frequency': 'f', 'id': 559, 'synset': 'headboard.n.01', 'synonyms': ['headboard'], 'def': 'a vertical board or panel forming the head of a bedstead', 'name': 'headboard'}, {'frequency': 'f', 'id': 560, 'synset': 'headlight.n.01', 'synonyms': ['headlight', 'headlamp'], 'def': 'a powerful light with reflector; attached to the front of an automobile or locomotive', 'name': 'headlight'}, {'frequency': 'c', 'id': 561, 'synset': 'headscarf.n.01', 'synonyms': ['headscarf'], 'def': 'a kerchief worn over the head and tied under the chin', 'name': 'headscarf'}, {'frequency': 'r', 'id': 562, 'synset': 'headset.n.01', 'synonyms': ['headset'], 'def': 'receiver consisting of a pair of headphones', 'name': 'headset'}, {'frequency': 'c', 'id': 563, 'synset': 'headstall.n.01', 'synonyms': ['headstall_(for_horses)', 'headpiece_(for_horses)'], 'def': "the band that is the part of a bridle that fits around a horse's head", 'name': 'headstall_(for_horses)'}, {'frequency': 'r', 'id': 564, 'synset': 'hearing_aid.n.02', 'synonyms': ['hearing_aid'], 'def': 'an acoustic device used to direct sound to the ear of a hearing-impaired person', 'name': 'hearing_aid'}, {'frequency': 'c', 'id': 565, 'synset': 'heart.n.02', 'synonyms': ['heart'], 'def': 'a muscular organ; its contractions move the blood through the body', 'name': 'heart'}, {'frequency': 'c', 'id': 566, 'synset': 'heater.n.01', 'synonyms': ['heater', 'warmer'], 'def': 'device that heats water or supplies warmth to a room', 'name': 'heater'}, {'frequency': 'c', 'id': 567, 'synset': 'helicopter.n.01', 'synonyms': ['helicopter'], 'def': 'an aircraft without wings that obtains its lift from the rotation of overhead blades', 'name': 'helicopter'}, {'frequency': 'f', 'id': 568, 'synset': 'helmet.n.02', 'synonyms': ['helmet'], 'def': 'a protective headgear made of hard material to resist blows', 'name': 'helmet'}, {'frequency': 'r', 'id': 569, 'synset': 'heron.n.02', 'synonyms': ['heron'], 'def': 'grey or white wading bird with long neck and long legs and (usually) long bill', 'name': 'heron'}, {'frequency': 'c', 'id': 570, 'synset': 'highchair.n.01', 'synonyms': ['highchair', 'feeding_chair'], 'def': 'a chair for feeding a very young child', 'name': 'highchair'}, {'frequency': 'f', 'id': 571, 'synset': 'hinge.n.01', 'synonyms': ['hinge'], 'def': 'a joint that holds two parts together so that one can swing relative to the other', 'name': 'hinge'}, {'frequency': 'r', 'id': 572, 'synset': 'hippopotamus.n.01', 'synonyms': ['hippopotamus'], 'def': 'massive thick-skinned animal living in or around rivers of tropical Africa', 'name': 'hippopotamus'}, {'frequency': 'r', 'id': 573, 'synset': 'hockey_stick.n.01', 'synonyms': ['hockey_stick'], 'def': 'sports implement consisting of a stick used by hockey players to move the puck', 'name': 'hockey_stick'}, {'frequency': 'c', 'id': 574, 'synset': 'hog.n.03', 'synonyms': ['hog', 'pig'], 'def': 'domestic swine', 'name': 'hog'}, {'frequency': 'f', 'id': 575, 'synset': 'home_plate.n.01', 'synonyms': ['home_plate_(baseball)', 'home_base_(baseball)'], 'def': '(baseball) a rubber slab where the batter stands; it must be touched by a base runner in order to score', 'name': 'home_plate_(baseball)'}, {'frequency': 'c', 'id': 576, 'synset': 'honey.n.01', 'synonyms': ['honey'], 'def': 'a sweet yellow liquid produced by bees', 'name': 'honey'}, {'frequency': 'f', 'id': 577, 'synset': 'hood.n.06', 'synonyms': ['fume_hood', 'exhaust_hood'], 'def': 'metal covering leading to a vent that exhausts smoke or fumes', 'name': 'fume_hood'}, {'frequency': 'f', 'id': 578, 'synset': 'hook.n.05', 'synonyms': ['hook'], 'def': 'a curved or bent implement for suspending or pulling something', 'name': 'hook'}, {'frequency': 'f', 'id': 579, 'synset': 'horse.n.01', 'synonyms': ['horse'], 'def': 'a common horse', 'name': 'horse'}, {'frequency': 'f', 'id': 580, 'synset': 'hose.n.03', 'synonyms': ['hose', 'hosepipe'], 'def': 'a flexible pipe for conveying a liquid or gas', 'name': 'hose'}, {'frequency': 'r', 'id': 581, 'synset': 'hot-air_balloon.n.01', 'synonyms': ['hot-air_balloon'], 'def': 'balloon for travel through the air in a basket suspended below a large bag of heated air', 'name': 'hot-air_balloon'}, {'frequency': 'r', 'id': 582, 'synset': 'hot_plate.n.01', 'synonyms': ['hotplate'], 'def': 'a portable electric appliance for heating or cooking or keeping food warm', 'name': 'hotplate'}, {'frequency': 'c', 'id': 583, 'synset': 'hot_sauce.n.01', 'synonyms': ['hot_sauce'], 'def': 'a pungent peppery sauce', 'name': 'hot_sauce'}, {'frequency': 'r', 'id': 584, 'synset': 'hourglass.n.01', 'synonyms': ['hourglass'], 'def': 'a sandglass timer that runs for sixty minutes', 'name': 'hourglass'}, {'frequency': 'r', 'id': 585, 'synset': 'houseboat.n.01', 'synonyms': ['houseboat'], 'def': 'a barge that is designed and equipped for use as a dwelling', 'name': 'houseboat'}, {'frequency': 'r', 'id': 586, 'synset': 'hummingbird.n.01', 'synonyms': ['hummingbird'], 'def': 'tiny American bird having brilliant iridescent plumage and long slender bills', 'name': 'hummingbird'}, {'frequency': 'r', 'id': 587, 'synset': 'hummus.n.01', 'synonyms': ['hummus', 'humus', 'hommos', 'hoummos', 'humous'], 'def': 'a thick spread made from mashed chickpeas', 'name': 'hummus'}, {'frequency': 'c', 'id': 588, 'synset': 'ice_bear.n.01', 'synonyms': ['polar_bear'], 'def': 'white bear of Arctic regions', 'name': 'polar_bear'}, {'frequency': 'c', 'id': 589, 'synset': 'ice_cream.n.01', 'synonyms': ['icecream'], 'def': 'frozen dessert containing cream and sugar and flavoring', 'name': 'icecream'}, {'frequency': 'r', 'id': 590, 'synset': 'ice_lolly.n.01', 'synonyms': ['popsicle'], 'def': 'ice cream or water ice on a small wooden stick', 'name': 'popsicle'}, {'frequency': 'c', 'id': 591, 'synset': 'ice_maker.n.01', 'synonyms': ['ice_maker'], 'def': 'an appliance included in some electric refrigerators for making ice cubes', 'name': 'ice_maker'}, {'frequency': 'r', 'id': 592, 'synset': 'ice_pack.n.01', 'synonyms': ['ice_pack', 'ice_bag'], 'def': 'a waterproof bag filled with ice: applied to the body (especially the head) to cool or reduce swelling', 'name': 'ice_pack'}, {'frequency': 'r', 'id': 593, 'synset': 'ice_skate.n.01', 'synonyms': ['ice_skate'], 'def': 'skate consisting of a boot with a steel blade fitted to the sole', 'name': 'ice_skate'}, {'frequency': 'r', 'id': 594, 'synset': 'ice_tea.n.01', 'synonyms': ['ice_tea', 'iced_tea'], 'def': 'strong tea served over ice', 'name': 'ice_tea'}, {'frequency': 'c', 'id': 595, 'synset': 'igniter.n.01', 'synonyms': ['igniter', 'ignitor', 'lighter'], 'def': 'a substance or device used to start a fire', 'name': 'igniter'}, {'frequency': 'r', 'id': 596, 'synset': 'incense.n.01', 'synonyms': ['incense'], 'def': 'a substance that produces a fragrant odor when burned', 'name': 'incense'}, {'frequency': 'r', 'id': 597, 'synset': 'inhaler.n.01', 'synonyms': ['inhaler', 'inhalator'], 'def': 'a dispenser that produces a chemical vapor to be inhaled through mouth or nose', 'name': 'inhaler'}, {'frequency': 'c', 'id': 598, 'synset': 'ipod.n.01', 'synonyms': ['iPod'], 'def': 'a pocket-sized device used to play music files', 'name': 'iPod'}, {'frequency': 'c', 'id': 599, 'synset': 'iron.n.04', 'synonyms': ['iron_(for_clothing)', 'smoothing_iron_(for_clothing)'], 'def': 'home appliance consisting of a flat metal base that is heated and used to smooth cloth', 'name': 'iron_(for_clothing)'}, {'frequency': 'r', 'id': 600, 'synset': 'ironing_board.n.01', 'synonyms': ['ironing_board'], 'def': 'narrow padded board on collapsible supports; used for ironing clothes', 'name': 'ironing_board'}, {'frequency': 'f', 'id': 601, 'synset': 'jacket.n.01', 'synonyms': ['jacket'], 'def': 'a waist-length coat', 'name': 'jacket'}, {'frequency': 'r', 'id': 602, 'synset': 'jam.n.01', 'synonyms': ['jam'], 'def': 'preserve of crushed fruit', 'name': 'jam'}, {'frequency': 'f', 'id': 603, 'synset': 'jean.n.01', 'synonyms': ['jean', 'blue_jean', 'denim'], 'def': '(usually plural) close-fitting trousers of heavy denim for manual work or casual wear', 'name': 'jean'}, {'frequency': 'c', 'id': 604, 'synset': 'jeep.n.01', 'synonyms': ['jeep', 'landrover'], 'def': 'a car suitable for traveling over rough terrain', 'name': 'jeep'}, {'frequency': 'r', 'id': 605, 'synset': 'jelly_bean.n.01', 'synonyms': ['jelly_bean', 'jelly_egg'], 'def': 'sugar-glazed jellied candy', 'name': 'jelly_bean'}, {'frequency': 'f', 'id': 606, 'synset': 'jersey.n.03', 'synonyms': ['jersey', 'T-shirt', 'tee_shirt'], 'def': 'a close-fitting pullover shirt', 'name': 'jersey'}, {'frequency': 'c', 'id': 607, 'synset': 'jet.n.01', 'synonyms': ['jet_plane', 'jet-propelled_plane'], 'def': 'an airplane powered by one or more jet engines', 'name': 'jet_plane'}, {'frequency': 'c', 'id': 608, 'synset': 'jewelry.n.01', 'synonyms': ['jewelry', 'jewellery'], 'def': 'an adornment (as a bracelet or ring or necklace) made of precious metals and set with gems (or imitation gems)', 'name': 'jewelry'}, {'frequency': 'r', 'id': 609, 'synset': 'joystick.n.02', 'synonyms': ['joystick'], 'def': 'a control device for computers consisting of a vertical handle that can move freely in two directions', 'name': 'joystick'}, {'frequency': 'r', 'id': 610, 'synset': 'jump_suit.n.01', 'synonyms': ['jumpsuit'], 'def': "one-piece garment fashioned after a parachutist's uniform", 'name': 'jumpsuit'}, {'frequency': 'c', 'id': 611, 'synset': 'kayak.n.01', 'synonyms': ['kayak'], 'def': 'a small canoe consisting of a light frame made watertight with animal skins', 'name': 'kayak'}, {'frequency': 'r', 'id': 612, 'synset': 'keg.n.02', 'synonyms': ['keg'], 'def': 'small cask or barrel', 'name': 'keg'}, {'frequency': 'r', 'id': 613, 'synset': 'kennel.n.01', 'synonyms': ['kennel', 'doghouse'], 'def': 'outbuilding that serves as a shelter for a dog', 'name': 'kennel'}, {'frequency': 'c', 'id': 614, 'synset': 'kettle.n.01', 'synonyms': ['kettle', 'boiler'], 'def': 'a metal pot for stewing or boiling; usually has a lid', 'name': 'kettle'}, {'frequency': 'f', 'id': 615, 'synset': 'key.n.01', 'synonyms': ['key'], 'def': 'metal instrument used to unlock a lock', 'name': 'key'}, {'frequency': 'r', 'id': 616, 'synset': 'keycard.n.01', 'synonyms': ['keycard'], 'def': 'a plastic card used to gain access typically to a door', 'name': 'keycard'}, {'frequency': 'r', 'id': 617, 'synset': 'kilt.n.01', 'synonyms': ['kilt'], 'def': 'a knee-length pleated tartan skirt worn by men as part of the traditional dress in the Highlands of northern Scotland', 'name': 'kilt'}, {'frequency': 'c', 'id': 618, 'synset': 'kimono.n.01', 'synonyms': ['kimono'], 'def': 'a loose robe; imitated from robes originally worn by Japanese', 'name': 'kimono'}, {'frequency': 'f', 'id': 619, 'synset': 'kitchen_sink.n.01', 'synonyms': ['kitchen_sink'], 'def': 'a sink in a kitchen', 'name': 'kitchen_sink'}, {'frequency': 'c', 'id': 620, 'synset': 'kitchen_table.n.01', 'synonyms': ['kitchen_table'], 'def': 'a table in the kitchen', 'name': 'kitchen_table'}, {'frequency': 'f', 'id': 621, 'synset': 'kite.n.03', 'synonyms': ['kite'], 'def': 'plaything consisting of a light frame covered with tissue paper; flown in wind at end of a string', 'name': 'kite'}, {'frequency': 'c', 'id': 622, 'synset': 'kitten.n.01', 'synonyms': ['kitten', 'kitty'], 'def': 'young domestic cat', 'name': 'kitten'}, {'frequency': 'c', 'id': 623, 'synset': 'kiwi.n.03', 'synonyms': ['kiwi_fruit'], 'def': 'fuzzy brown egg-shaped fruit with slightly tart green flesh', 'name': 'kiwi_fruit'}, {'frequency': 'f', 'id': 624, 'synset': 'knee_pad.n.01', 'synonyms': ['knee_pad'], 'def': 'protective garment consisting of a pad worn by football or baseball or hockey players', 'name': 'knee_pad'}, {'frequency': 'f', 'id': 625, 'synset': 'knife.n.01', 'synonyms': ['knife'], 'def': 'tool with a blade and point used as a cutting instrument', 'name': 'knife'}, {'frequency': 'r', 'id': 626, 'synset': 'knight.n.02', 'synonyms': ['knight_(chess_piece)', 'horse_(chess_piece)'], 'def': 'a chess game piece shaped to resemble the head of a horse', 'name': 'knight_(chess_piece)'}, {'frequency': 'r', 'id': 627, 'synset': 'knitting_needle.n.01', 'synonyms': ['knitting_needle'], 'def': 'needle consisting of a slender rod with pointed ends; usually used in pairs', 'name': 'knitting_needle'}, {'frequency': 'f', 'id': 628, 'synset': 'knob.n.02', 'synonyms': ['knob'], 'def': 'a round handle often found on a door', 'name': 'knob'}, {'frequency': 'r', 'id': 629, 'synset': 'knocker.n.05', 'synonyms': ['knocker_(on_a_door)', 'doorknocker'], 'def': 'a device (usually metal and ornamental) attached by a hinge to a door', 'name': 'knocker_(on_a_door)'}, {'frequency': 'r', 'id': 630, 'synset': 'koala.n.01', 'synonyms': ['koala', 'koala_bear'], 'def': 'sluggish tailless Australian marsupial with grey furry ears and coat', 'name': 'koala'}, {'frequency': 'r', 'id': 631, 'synset': 'lab_coat.n.01', 'synonyms': ['lab_coat', 'laboratory_coat'], 'def': 'a light coat worn to protect clothing from substances used while working in a laboratory', 'name': 'lab_coat'}, {'frequency': 'f', 'id': 632, 'synset': 'ladder.n.01', 'synonyms': ['ladder'], 'def': 'steps consisting of two parallel members connected by rungs', 'name': 'ladder'}, {'frequency': 'c', 'id': 633, 'synset': 'ladle.n.01', 'synonyms': ['ladle'], 'def': 'a spoon-shaped vessel with a long handle frequently used to transfer liquids', 'name': 'ladle'}, {'frequency': 'r', 'id': 634, 'synset': 'ladybug.n.01', 'synonyms': ['ladybug', 'ladybeetle', 'ladybird_beetle'], 'def': 'small round bright-colored and spotted beetle, typically red and black', 'name': 'ladybug'}, {'frequency': 'c', 'id': 635, 'synset': 'lamb.n.01', 'synonyms': ['lamb_(animal)'], 'def': 'young sheep', 'name': 'lamb_(animal)'}, {'frequency': 'r', 'id': 636, 'synset': 'lamb_chop.n.01', 'synonyms': ['lamb-chop', 'lambchop'], 'def': 'chop cut from a lamb', 'name': 'lamb-chop'}, {'frequency': 'f', 'id': 637, 'synset': 'lamp.n.02', 'synonyms': ['lamp'], 'def': 'a piece of furniture holding one or more electric light bulbs', 'name': 'lamp'}, {'frequency': 'f', 'id': 638, 'synset': 'lamppost.n.01', 'synonyms': ['lamppost'], 'def': 'a metal post supporting an outdoor lamp (such as a streetlight)', 'name': 'lamppost'}, {'frequency': 'f', 'id': 639, 'synset': 'lampshade.n.01', 'synonyms': ['lampshade'], 'def': 'a protective ornamental shade used to screen a light bulb from direct view', 'name': 'lampshade'}, {'frequency': 'c', 'id': 640, 'synset': 'lantern.n.01', 'synonyms': ['lantern'], 'def': 'light in a transparent protective case', 'name': 'lantern'}, {'frequency': 'f', 'id': 641, 'synset': 'lanyard.n.02', 'synonyms': ['lanyard', 'laniard'], 'def': 'a cord worn around the neck to hold a knife or whistle, etc.', 'name': 'lanyard'}, {'frequency': 'f', 'id': 642, 'synset': 'laptop.n.01', 'synonyms': ['laptop_computer', 'notebook_computer'], 'def': 'a portable computer small enough to use in your lap', 'name': 'laptop_computer'}, {'frequency': 'r', 'id': 643, 'synset': 'lasagna.n.01', 'synonyms': ['lasagna', 'lasagne'], 'def': 'baked dish of layers of lasagna pasta with sauce and cheese and meat or vegetables', 'name': 'lasagna'}, {'frequency': 'c', 'id': 644, 'synset': 'latch.n.02', 'synonyms': ['latch'], 'def': 'a bar that can be lowered or slid into a groove to fasten a door or gate', 'name': 'latch'}, {'frequency': 'r', 'id': 645, 'synset': 'lawn_mower.n.01', 'synonyms': ['lawn_mower'], 'def': 'garden tool for mowing grass on lawns', 'name': 'lawn_mower'}, {'frequency': 'r', 'id': 646, 'synset': 'leather.n.01', 'synonyms': ['leather'], 'def': 'an animal skin made smooth and flexible by removing the hair and then tanning', 'name': 'leather'}, {'frequency': 'c', 'id': 647, 'synset': 'legging.n.01', 'synonyms': ['legging_(clothing)', 'leging_(clothing)', 'leg_covering'], 'def': 'a garment covering the leg (usually extending from the knee to the ankle)', 'name': 'legging_(clothing)'}, {'frequency': 'c', 'id': 648, 'synset': 'lego.n.01', 'synonyms': ['Lego', 'Lego_set'], 'def': "a child's plastic construction set for making models from blocks", 'name': 'Lego'}, {'frequency': 'f', 'id': 649, 'synset': 'lemon.n.01', 'synonyms': ['lemon'], 'def': 'yellow oval fruit with juicy acidic flesh', 'name': 'lemon'}, {'frequency': 'r', 'id': 650, 'synset': 'lemonade.n.01', 'synonyms': ['lemonade'], 'def': 'sweetened beverage of diluted lemon juice', 'name': 'lemonade'}, {'frequency': 'f', 'id': 651, 'synset': 'lettuce.n.02', 'synonyms': ['lettuce'], 'def': 'leafy plant commonly eaten in salad or on sandwiches', 'name': 'lettuce'}, {'frequency': 'f', 'id': 652, 'synset': 'license_plate.n.01', 'synonyms': ['license_plate', 'numberplate'], 'def': "a plate mounted on the front and back of car and bearing the car's registration number", 'name': 'license_plate'}, {'frequency': 'f', 'id': 653, 'synset': 'life_buoy.n.01', 'synonyms': ['life_buoy', 'lifesaver', 'life_belt', 'life_ring'], 'def': 'a ring-shaped life preserver used to prevent drowning (NOT a life-jacket or vest)', 'name': 'life_buoy'}, {'frequency': 'f', 'id': 654, 'synset': 'life_jacket.n.01', 'synonyms': ['life_jacket', 'life_vest'], 'def': 'life preserver consisting of a sleeveless jacket of buoyant or inflatable design', 'name': 'life_jacket'}, {'frequency': 'f', 'id': 655, 'synset': 'light_bulb.n.01', 'synonyms': ['lightbulb'], 'def': 'glass bulb or tube shaped electric device that emits light (DO NOT MARK LAMPS AS A WHOLE)', 'name': 'lightbulb'}, {'frequency': 'r', 'id': 656, 'synset': 'lightning_rod.n.02', 'synonyms': ['lightning_rod', 'lightning_conductor'], 'def': 'a metallic conductor that is attached to a high point and leads to the ground', 'name': 'lightning_rod'}, {'frequency': 'c', 'id': 657, 'synset': 'lime.n.06', 'synonyms': ['lime'], 'def': 'the green acidic fruit of any of various lime trees', 'name': 'lime'}, {'frequency': 'r', 'id': 658, 'synset': 'limousine.n.01', 'synonyms': ['limousine'], 'def': 'long luxurious car; usually driven by a chauffeur', 'name': 'limousine'}, {'frequency': 'r', 'id': 659, 'synset': 'linen.n.02', 'synonyms': ['linen_paper'], 'def': 'a high-quality paper made of linen fibers or with a linen finish', 'name': 'linen_paper'}, {'frequency': 'c', 'id': 660, 'synset': 'lion.n.01', 'synonyms': ['lion'], 'def': 'large gregarious predatory cat of Africa and India', 'name': 'lion'}, {'frequency': 'c', 'id': 661, 'synset': 'lip_balm.n.01', 'synonyms': ['lip_balm'], 'def': 'a balm applied to the lips', 'name': 'lip_balm'}, {'frequency': 'c', 'id': 662, 'synset': 'lipstick.n.01', 'synonyms': ['lipstick', 'lip_rouge'], 'def': 'makeup that is used to color the lips', 'name': 'lipstick'}, {'frequency': 'r', 'id': 663, 'synset': 'liquor.n.01', 'synonyms': ['liquor', 'spirits', 'hard_liquor', 'liqueur', 'cordial'], 'def': 'an alcoholic beverage that is distilled rather than fermented', 'name': 'liquor'}, {'frequency': 'r', 'id': 664, 'synset': 'lizard.n.01', 'synonyms': ['lizard'], 'def': 'a reptile with usually two pairs of legs and a tapering tail', 'name': 'lizard'}, {'frequency': 'r', 'id': 665, 'synset': 'loafer.n.02', 'synonyms': ['Loafer_(type_of_shoe)'], 'def': 'a low leather step-in shoe', 'name': 'Loafer_(type_of_shoe)'}, {'frequency': 'f', 'id': 666, 'synset': 'log.n.01', 'synonyms': ['log'], 'def': 'a segment of the trunk of a tree when stripped of branches', 'name': 'log'}, {'frequency': 'c', 'id': 667, 'synset': 'lollipop.n.02', 'synonyms': ['lollipop'], 'def': 'hard candy on a stick', 'name': 'lollipop'}, {'frequency': 'c', 'id': 668, 'synset': 'lotion.n.01', 'synonyms': ['lotion'], 'def': 'any of various cosmetic preparations that are applied to the skin', 'name': 'lotion'}, {'frequency': 'f', 'id': 669, 'synset': 'loudspeaker.n.01', 'synonyms': ['speaker_(stero_equipment)'], 'def': 'electronic device that produces sound often as part of a stereo system', 'name': 'speaker_(stero_equipment)'}, {'frequency': 'c', 'id': 670, 'synset': 'love_seat.n.01', 'synonyms': ['loveseat'], 'def': 'small sofa that seats two people', 'name': 'loveseat'}, {'frequency': 'r', 'id': 671, 'synset': 'machine_gun.n.01', 'synonyms': ['machine_gun'], 'def': 'a rapidly firing automatic gun', 'name': 'machine_gun'}, {'frequency': 'f', 'id': 672, 'synset': 'magazine.n.02', 'synonyms': ['magazine'], 'def': 'a paperback periodic publication', 'name': 'magazine'}, {'frequency': 'f', 'id': 673, 'synset': 'magnet.n.01', 'synonyms': ['magnet'], 'def': 'a device that attracts iron and produces a magnetic field', 'name': 'magnet'}, {'frequency': 'r', 'id': 674, 'synset': 'mail_slot.n.01', 'synonyms': ['mail_slot'], 'def': 'a slot (usually in a door) through which mail can be delivered', 'name': 'mail_slot'}, {'frequency': 'c', 'id': 675, 'synset': 'mailbox.n.01', 'synonyms': ['mailbox_(at_home)', 'letter_box_(at_home)'], 'def': 'a private box for delivery of mail', 'name': 'mailbox_(at_home)'}, {'frequency': 'r', 'id': 676, 'synset': 'mallet.n.01', 'synonyms': ['mallet'], 'def': 'a sports implement with a long handle and a hammer-like head used to hit a ball', 'name': 'mallet'}, {'frequency': 'r', 'id': 677, 'synset': 'mammoth.n.01', 'synonyms': ['mammoth'], 'def': 'any of numerous extinct elephants widely distributed in the Pleistocene', 'name': 'mammoth'}, {'frequency': 'c', 'id': 678, 'synset': 'mandarin.n.05', 'synonyms': ['mandarin_orange'], 'def': 'a somewhat flat reddish-orange loose skinned citrus of China', 'name': 'mandarin_orange'}, {'frequency': 'c', 'id': 679, 'synset': 'manger.n.01', 'synonyms': ['manger', 'trough'], 'def': 'a container (usually in a barn or stable) from which cattle or horses feed', 'name': 'manger'}, {'frequency': 'f', 'id': 680, 'synset': 'manhole.n.01', 'synonyms': ['manhole'], 'def': 'a hole (usually with a flush cover) through which a person can gain access to an underground structure', 'name': 'manhole'}, {'frequency': 'c', 'id': 681, 'synset': 'map.n.01', 'synonyms': ['map'], 'def': "a diagrammatic representation of the earth's surface (or part of it)", 'name': 'map'}, {'frequency': 'c', 'id': 682, 'synset': 'marker.n.03', 'synonyms': ['marker'], 'def': 'a writing implement for making a mark', 'name': 'marker'}, {'frequency': 'r', 'id': 683, 'synset': 'martini.n.01', 'synonyms': ['martini'], 'def': 'a cocktail made of gin (or vodka) with dry vermouth', 'name': 'martini'}, {'frequency': 'r', 'id': 684, 'synset': 'mascot.n.01', 'synonyms': ['mascot'], 'def': 'a person or animal that is adopted by a team or other group as a symbolic figure', 'name': 'mascot'}, {'frequency': 'c', 'id': 685, 'synset': 'mashed_potato.n.01', 'synonyms': ['mashed_potato'], 'def': 'potato that has been peeled and boiled and then mashed', 'name': 'mashed_potato'}, {'frequency': 'r', 'id': 686, 'synset': 'masher.n.02', 'synonyms': ['masher'], 'def': 'a kitchen utensil used for mashing (e.g. potatoes)', 'name': 'masher'}, {'frequency': 'f', 'id': 687, 'synset': 'mask.n.04', 'synonyms': ['mask', 'facemask'], 'def': 'a protective covering worn over the face', 'name': 'mask'}, {'frequency': 'f', 'id': 688, 'synset': 'mast.n.01', 'synonyms': ['mast'], 'def': 'a vertical spar for supporting sails', 'name': 'mast'}, {'frequency': 'c', 'id': 689, 'synset': 'mat.n.03', 'synonyms': ['mat_(gym_equipment)', 'gym_mat'], 'def': 'sports equipment consisting of a piece of thick padding on the floor for gymnastics', 'name': 'mat_(gym_equipment)'}, {'frequency': 'r', 'id': 690, 'synset': 'matchbox.n.01', 'synonyms': ['matchbox'], 'def': 'a box for holding matches', 'name': 'matchbox'}, {'frequency': 'f', 'id': 691, 'synset': 'mattress.n.01', 'synonyms': ['mattress'], 'def': 'a thick pad filled with resilient material used as a bed or part of a bed', 'name': 'mattress'}, {'frequency': 'c', 'id': 692, 'synset': 'measuring_cup.n.01', 'synonyms': ['measuring_cup'], 'def': 'graduated cup used to measure liquid or granular ingredients', 'name': 'measuring_cup'}, {'frequency': 'c', 'id': 693, 'synset': 'measuring_stick.n.01', 'synonyms': ['measuring_stick', 'ruler_(measuring_stick)', 'measuring_rod'], 'def': 'measuring instrument having a sequence of marks at regular intervals', 'name': 'measuring_stick'}, {'frequency': 'c', 'id': 694, 'synset': 'meatball.n.01', 'synonyms': ['meatball'], 'def': 'ground meat formed into a ball and fried or simmered in broth', 'name': 'meatball'}, {'frequency': 'c', 'id': 695, 'synset': 'medicine.n.02', 'synonyms': ['medicine'], 'def': 'something that treats or prevents or alleviates the symptoms of disease', 'name': 'medicine'}, {'frequency': 'r', 'id': 696, 'synset': 'melon.n.01', 'synonyms': ['melon'], 'def': 'fruit of the gourd family having a hard rind and sweet juicy flesh', 'name': 'melon'}, {'frequency': 'f', 'id': 697, 'synset': 'microphone.n.01', 'synonyms': ['microphone'], 'def': 'device for converting sound waves into electrical energy', 'name': 'microphone'}, {'frequency': 'r', 'id': 698, 'synset': 'microscope.n.01', 'synonyms': ['microscope'], 'def': 'magnifier of the image of small objects', 'name': 'microscope'}, {'frequency': 'f', 'id': 699, 'synset': 'microwave.n.02', 'synonyms': ['microwave_oven'], 'def': 'kitchen appliance that cooks food by passing an electromagnetic wave through it', 'name': 'microwave_oven'}, {'frequency': 'r', 'id': 700, 'synset': 'milestone.n.01', 'synonyms': ['milestone', 'milepost'], 'def': 'stone post at side of a road to show distances', 'name': 'milestone'}, {'frequency': 'c', 'id': 701, 'synset': 'milk.n.01', 'synonyms': ['milk'], 'def': 'a white nutritious liquid secreted by mammals and used as food by human beings', 'name': 'milk'}, {'frequency': 'f', 'id': 702, 'synset': 'minivan.n.01', 'synonyms': ['minivan'], 'def': 'a small box-shaped passenger van', 'name': 'minivan'}, {'frequency': 'r', 'id': 703, 'synset': 'mint.n.05', 'synonyms': ['mint_candy'], 'def': 'a candy that is flavored with a mint oil', 'name': 'mint_candy'}, {'frequency': 'f', 'id': 704, 'synset': 'mirror.n.01', 'synonyms': ['mirror'], 'def': 'polished surface that forms images by reflecting light', 'name': 'mirror'}, {'frequency': 'c', 'id': 705, 'synset': 'mitten.n.01', 'synonyms': ['mitten'], 'def': 'glove that encases the thumb separately and the other four fingers together', 'name': 'mitten'}, {'frequency': 'c', 'id': 706, 'synset': 'mixer.n.04', 'synonyms': ['mixer_(kitchen_tool)', 'stand_mixer'], 'def': 'a kitchen utensil that is used for mixing foods', 'name': 'mixer_(kitchen_tool)'}, {'frequency': 'c', 'id': 707, 'synset': 'money.n.03', 'synonyms': ['money'], 'def': 'the official currency issued by a government or national bank', 'name': 'money'}, {'frequency': 'f', 'id': 708, 'synset': 'monitor.n.04', 'synonyms': ['monitor_(computer_equipment) computer_monitor'], 'def': 'a computer monitor', 'name': 'monitor_(computer_equipment) computer_monitor'}, {'frequency': 'c', 'id': 709, 'synset': 'monkey.n.01', 'synonyms': ['monkey'], 'def': 'any of various long-tailed primates', 'name': 'monkey'}, {'frequency': 'f', 'id': 710, 'synset': 'motor.n.01', 'synonyms': ['motor'], 'def': 'machine that converts other forms of energy into mechanical energy and so imparts motion', 'name': 'motor'}, {'frequency': 'f', 'id': 711, 'synset': 'motor_scooter.n.01', 'synonyms': ['motor_scooter', 'scooter'], 'def': 'a wheeled vehicle with small wheels and a low-powered engine', 'name': 'motor_scooter'}, {'frequency': 'r', 'id': 712, 'synset': 'motor_vehicle.n.01', 'synonyms': ['motor_vehicle', 'automotive_vehicle'], 'def': 'a self-propelled wheeled vehicle that does not run on rails', 'name': 'motor_vehicle'}, {'frequency': 'r', 'id': 713, 'synset': 'motorboat.n.01', 'synonyms': ['motorboat', 'powerboat'], 'def': 'a boat propelled by an internal-combustion engine', 'name': 'motorboat'}, {'frequency': 'f', 'id': 714, 'synset': 'motorcycle.n.01', 'synonyms': ['motorcycle'], 'def': 'a motor vehicle with two wheels and a strong frame', 'name': 'motorcycle'}, {'frequency': 'f', 'id': 715, 'synset': 'mound.n.01', 'synonyms': ['mound_(baseball)', "pitcher's_mound"], 'def': '(baseball) the slight elevation on which the pitcher stands', 'name': 'mound_(baseball)'}, {'frequency': 'r', 'id': 716, 'synset': 'mouse.n.01', 'synonyms': ['mouse_(animal_rodent)'], 'def': 'a small rodent with pointed snouts and small ears on elongated bodies with slender usually hairless tails', 'name': 'mouse_(animal_rodent)'}, {'frequency': 'f', 'id': 717, 'synset': 'mouse.n.04', 'synonyms': ['mouse_(computer_equipment)', 'computer_mouse'], 'def': 'a computer input device that controls an on-screen pointer', 'name': 'mouse_(computer_equipment)'}, {'frequency': 'f', 'id': 718, 'synset': 'mousepad.n.01', 'synonyms': ['mousepad'], 'def': 'a small portable pad that provides an operating surface for a computer mouse', 'name': 'mousepad'}, {'frequency': 'c', 'id': 719, 'synset': 'muffin.n.01', 'synonyms': ['muffin'], 'def': 'a sweet quick bread baked in a cup-shaped pan', 'name': 'muffin'}, {'frequency': 'f', 'id': 720, 'synset': 'mug.n.04', 'synonyms': ['mug'], 'def': 'with handle and usually cylindrical', 'name': 'mug'}, {'frequency': 'f', 'id': 721, 'synset': 'mushroom.n.02', 'synonyms': ['mushroom'], 'def': 'a common mushroom', 'name': 'mushroom'}, {'frequency': 'r', 'id': 722, 'synset': 'music_stool.n.01', 'synonyms': ['music_stool', 'piano_stool'], 'def': 'a stool for piano players; usually adjustable in height', 'name': 'music_stool'}, {'frequency': 'r', 'id': 723, 'synset': 'musical_instrument.n.01', 'synonyms': ['musical_instrument', 'instrument_(musical)'], 'def': 'any of various devices or contrivances that can be used to produce musical tones or sounds', 'name': 'musical_instrument'}, {'frequency': 'r', 'id': 724, 'synset': 'nailfile.n.01', 'synonyms': ['nailfile'], 'def': 'a small flat file for shaping the nails', 'name': 'nailfile'}, {'frequency': 'r', 'id': 725, 'synset': 'nameplate.n.01', 'synonyms': ['nameplate'], 'def': 'a plate bearing a name', 'name': 'nameplate'}, {'frequency': 'f', 'id': 726, 'synset': 'napkin.n.01', 'synonyms': ['napkin', 'table_napkin', 'serviette'], 'def': 'a small piece of table linen or paper that is used to wipe the mouth and to cover the lap in order to protect clothing', 'name': 'napkin'}, {'frequency': 'r', 'id': 727, 'synset': 'neckerchief.n.01', 'synonyms': ['neckerchief'], 'def': 'a kerchief worn around the neck', 'name': 'neckerchief'}, {'frequency': 'f', 'id': 728, 'synset': 'necklace.n.01', 'synonyms': ['necklace'], 'def': 'jewelry consisting of a cord or chain (often bearing gems) worn about the neck as an ornament', 'name': 'necklace'}, {'frequency': 'f', 'id': 729, 'synset': 'necktie.n.01', 'synonyms': ['necktie', 'tie_(necktie)'], 'def': 'neckwear consisting of a long narrow piece of material worn under a collar and tied in knot at the front', 'name': 'necktie'}, {'frequency': 'r', 'id': 730, 'synset': 'needle.n.03', 'synonyms': ['needle'], 'def': 'a sharp pointed implement (usually metal)', 'name': 'needle'}, {'frequency': 'c', 'id': 731, 'synset': 'nest.n.01', 'synonyms': ['nest'], 'def': 'a structure in which animals lay eggs or give birth to their young', 'name': 'nest'}, {'frequency': 'r', 'id': 732, 'synset': 'newsstand.n.01', 'synonyms': ['newsstand'], 'def': 'a stall where newspapers and other periodicals are sold', 'name': 'newsstand'}, {'frequency': 'c', 'id': 733, 'synset': 'nightwear.n.01', 'synonyms': ['nightshirt', 'nightwear', 'sleepwear', 'nightclothes'], 'def': 'garments designed to be worn in bed', 'name': 'nightshirt'}, {'frequency': 'r', 'id': 734, 'synset': 'nosebag.n.01', 'synonyms': ['nosebag_(for_animals)', 'feedbag'], 'def': 'a canvas bag that is used to feed an animal (such as a horse); covers the muzzle and fastens at the top of the head', 'name': 'nosebag_(for_animals)'}, {'frequency': 'r', 'id': 735, 'synset': 'noseband.n.01', 'synonyms': ['noseband_(for_animals)', 'nosepiece_(for_animals)'], 'def': "a strap that is the part of a bridle that goes over the animal's nose", 'name': 'noseband_(for_animals)'}, {'frequency': 'f', 'id': 736, 'synset': 'notebook.n.01', 'synonyms': ['notebook'], 'def': 'a book with blank pages for recording notes or memoranda', 'name': 'notebook'}, {'frequency': 'c', 'id': 737, 'synset': 'notepad.n.01', 'synonyms': ['notepad'], 'def': 'a pad of paper for keeping notes', 'name': 'notepad'}, {'frequency': 'c', 'id': 738, 'synset': 'nut.n.03', 'synonyms': ['nut'], 'def': 'a small metal block (usually square or hexagonal) with internal screw thread to be fitted onto a bolt', 'name': 'nut'}, {'frequency': 'r', 'id': 739, 'synset': 'nutcracker.n.01', 'synonyms': ['nutcracker'], 'def': 'a hand tool used to crack nuts open', 'name': 'nutcracker'}, {'frequency': 'c', 'id': 740, 'synset': 'oar.n.01', 'synonyms': ['oar'], 'def': 'an implement used to propel or steer a boat', 'name': 'oar'}, {'frequency': 'r', 'id': 741, 'synset': 'octopus.n.01', 'synonyms': ['octopus_(food)'], 'def': 'tentacles of octopus prepared as food', 'name': 'octopus_(food)'}, {'frequency': 'r', 'id': 742, 'synset': 'octopus.n.02', 'synonyms': ['octopus_(animal)'], 'def': 'bottom-living cephalopod having a soft oval body with eight long tentacles', 'name': 'octopus_(animal)'}, {'frequency': 'c', 'id': 743, 'synset': 'oil_lamp.n.01', 'synonyms': ['oil_lamp', 'kerosene_lamp', 'kerosine_lamp'], 'def': 'a lamp that burns oil (as kerosine) for light', 'name': 'oil_lamp'}, {'frequency': 'c', 'id': 744, 'synset': 'olive_oil.n.01', 'synonyms': ['olive_oil'], 'def': 'oil from olives', 'name': 'olive_oil'}, {'frequency': 'r', 'id': 745, 'synset': 'omelet.n.01', 'synonyms': ['omelet', 'omelette'], 'def': 'beaten eggs cooked until just set; may be folded around e.g. ham or cheese or jelly', 'name': 'omelet'}, {'frequency': 'f', 'id': 746, 'synset': 'onion.n.01', 'synonyms': ['onion'], 'def': 'the bulb of an onion plant', 'name': 'onion'}, {'frequency': 'f', 'id': 747, 'synset': 'orange.n.01', 'synonyms': ['orange_(fruit)'], 'def': 'orange (FRUIT of an orange tree)', 'name': 'orange_(fruit)'}, {'frequency': 'c', 'id': 748, 'synset': 'orange_juice.n.01', 'synonyms': ['orange_juice'], 'def': 'bottled or freshly squeezed juice of oranges', 'name': 'orange_juice'}, {'frequency': 'r', 'id': 749, 'synset': 'oregano.n.01', 'synonyms': ['oregano', 'marjoram'], 'def': 'aromatic Eurasian perennial herb used in cooking and baking', 'name': 'oregano'}, {'frequency': 'c', 'id': 750, 'synset': 'ostrich.n.02', 'synonyms': ['ostrich'], 'def': 'fast-running African flightless bird with two-toed feet; largest living bird', 'name': 'ostrich'}, {'frequency': 'c', 'id': 751, 'synset': 'ottoman.n.03', 'synonyms': ['ottoman', 'pouf', 'pouffe', 'hassock'], 'def': 'thick cushion used as a seat', 'name': 'ottoman'}, {'frequency': 'c', 'id': 752, 'synset': 'overall.n.01', 'synonyms': ['overalls_(clothing)'], 'def': 'work clothing consisting of denim trousers usually with a bib and shoulder straps', 'name': 'overalls_(clothing)'}, {'frequency': 'c', 'id': 753, 'synset': 'owl.n.01', 'synonyms': ['owl'], 'def': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes', 'name': 'owl'}, {'frequency': 'c', 'id': 754, 'synset': 'packet.n.03', 'synonyms': ['packet'], 'def': 'a small package or bundle', 'name': 'packet'}, {'frequency': 'r', 'id': 755, 'synset': 'pad.n.03', 'synonyms': ['inkpad', 'inking_pad', 'stamp_pad'], 'def': 'absorbent material saturated with ink used to transfer ink evenly to a rubber stamp', 'name': 'inkpad'}, {'frequency': 'c', 'id': 756, 'synset': 'pad.n.04', 'synonyms': ['pad'], 'def': 'a flat mass of soft material used for protection, stuffing, or comfort', 'name': 'pad'}, {'frequency': 'c', 'id': 757, 'synset': 'paddle.n.04', 'synonyms': ['paddle', 'boat_paddle'], 'def': 'a short light oar used without an oarlock to propel a canoe or small boat', 'name': 'paddle'}, {'frequency': 'c', 'id': 758, 'synset': 'padlock.n.01', 'synonyms': ['padlock'], 'def': 'a detachable, portable lock', 'name': 'padlock'}, {'frequency': 'r', 'id': 759, 'synset': 'paintbox.n.01', 'synonyms': ['paintbox'], 'def': "a box containing a collection of cubes or tubes of artists' paint", 'name': 'paintbox'}, {'frequency': 'c', 'id': 760, 'synset': 'paintbrush.n.01', 'synonyms': ['paintbrush'], 'def': 'a brush used as an applicator to apply paint', 'name': 'paintbrush'}, {'frequency': 'f', 'id': 761, 'synset': 'painting.n.01', 'synonyms': ['painting'], 'def': 'graphic art consisting of an artistic composition made by applying paints to a surface', 'name': 'painting'}, {'frequency': 'c', 'id': 762, 'synset': 'pajama.n.02', 'synonyms': ['pajamas', 'pyjamas'], 'def': 'loose-fitting nightclothes worn for sleeping or lounging', 'name': 'pajamas'}, {'frequency': 'c', 'id': 763, 'synset': 'palette.n.02', 'synonyms': ['palette', 'pallet'], 'def': 'board that provides a flat surface on which artists mix paints and the range of colors used', 'name': 'palette'}, {'frequency': 'f', 'id': 764, 'synset': 'pan.n.01', 'synonyms': ['pan_(for_cooking)', 'cooking_pan'], 'def': 'cooking utensil consisting of a wide metal vessel', 'name': 'pan_(for_cooking)'}, {'frequency': 'r', 'id': 765, 'synset': 'pan.n.03', 'synonyms': ['pan_(metal_container)'], 'def': 'shallow container made of metal', 'name': 'pan_(metal_container)'}, {'frequency': 'c', 'id': 766, 'synset': 'pancake.n.01', 'synonyms': ['pancake'], 'def': 'a flat cake of thin batter fried on both sides on a griddle', 'name': 'pancake'}, {'frequency': 'r', 'id': 767, 'synset': 'pantyhose.n.01', 'synonyms': ['pantyhose'], 'def': "a woman's tights consisting of underpants and stockings", 'name': 'pantyhose'}, {'frequency': 'r', 'id': 768, 'synset': 'papaya.n.02', 'synonyms': ['papaya'], 'def': 'large oval melon-like tropical fruit with yellowish flesh', 'name': 'papaya'}, {'frequency': 'r', 'id': 769, 'synset': 'paper_clip.n.01', 'synonyms': ['paperclip'], 'def': 'a wire or plastic clip for holding sheets of paper together', 'name': 'paperclip'}, {'frequency': 'f', 'id': 770, 'synset': 'paper_plate.n.01', 'synonyms': ['paper_plate'], 'def': 'a disposable plate made of cardboard', 'name': 'paper_plate'}, {'frequency': 'f', 'id': 771, 'synset': 'paper_towel.n.01', 'synonyms': ['paper_towel'], 'def': 'a disposable towel made of absorbent paper', 'name': 'paper_towel'}, {'frequency': 'r', 'id': 772, 'synset': 'paperback_book.n.01', 'synonyms': ['paperback_book', 'paper-back_book', 'softback_book', 'soft-cover_book'], 'def': 'a book with paper covers', 'name': 'paperback_book'}, {'frequency': 'r', 'id': 773, 'synset': 'paperweight.n.01', 'synonyms': ['paperweight'], 'def': 'a weight used to hold down a stack of papers', 'name': 'paperweight'}, {'frequency': 'c', 'id': 774, 'synset': 'parachute.n.01', 'synonyms': ['parachute'], 'def': 'rescue equipment consisting of a device that fills with air and retards your fall', 'name': 'parachute'}, {'frequency': 'r', 'id': 775, 'synset': 'parakeet.n.01', 'synonyms': ['parakeet', 'parrakeet', 'parroket', 'paraquet', 'paroquet', 'parroquet'], 'def': 'any of numerous small slender long-tailed parrots', 'name': 'parakeet'}, {'frequency': 'c', 'id': 776, 'synset': 'parasail.n.01', 'synonyms': ['parasail_(sports)'], 'def': 'parachute that will lift a person up into the air when it is towed by a motorboat or a car', 'name': 'parasail_(sports)'}, {'frequency': 'r', 'id': 777, 'synset': 'parchment.n.01', 'synonyms': ['parchment'], 'def': 'a superior paper resembling sheepskin', 'name': 'parchment'}, {'frequency': 'r', 'id': 778, 'synset': 'parka.n.01', 'synonyms': ['parka', 'anorak'], 'def': "a kind of heavy jacket (`windcheater' is a British term)", 'name': 'parka'}, {'frequency': 'f', 'id': 779, 'synset': 'parking_meter.n.01', 'synonyms': ['parking_meter'], 'def': 'a coin-operated timer located next to a parking space', 'name': 'parking_meter'}, {'frequency': 'c', 'id': 780, 'synset': 'parrot.n.01', 'synonyms': ['parrot'], 'def': 'usually brightly colored tropical birds with short hooked beaks and the ability to mimic sounds', 'name': 'parrot'}, {'frequency': 'c', 'id': 781, 'synset': 'passenger_car.n.01', 'synonyms': ['passenger_car_(part_of_a_train)', 'coach_(part_of_a_train)'], 'def': 'a railcar where passengers ride', 'name': 'passenger_car_(part_of_a_train)'}, {'frequency': 'r', 'id': 782, 'synset': 'passenger_ship.n.01', 'synonyms': ['passenger_ship'], 'def': 'a ship built to carry passengers', 'name': 'passenger_ship'}, {'frequency': 'r', 'id': 783, 'synset': 'passport.n.02', 'synonyms': ['passport'], 'def': 'a document issued by a country to a citizen allowing that person to travel abroad and re-enter the home country', 'name': 'passport'}, {'frequency': 'f', 'id': 784, 'synset': 'pastry.n.02', 'synonyms': ['pastry'], 'def': 'any of various baked foods made of dough or batter', 'name': 'pastry'}, {'frequency': 'r', 'id': 785, 'synset': 'patty.n.01', 'synonyms': ['patty_(food)'], 'def': 'small flat mass of chopped food', 'name': 'patty_(food)'}, {'frequency': 'c', 'id': 786, 'synset': 'pea.n.01', 'synonyms': ['pea_(food)'], 'def': 'seed of a pea plant used for food', 'name': 'pea_(food)'}, {'frequency': 'c', 'id': 787, 'synset': 'peach.n.03', 'synonyms': ['peach'], 'def': 'downy juicy fruit with sweet yellowish or whitish flesh', 'name': 'peach'}, {'frequency': 'c', 'id': 788, 'synset': 'peanut_butter.n.01', 'synonyms': ['peanut_butter'], 'def': 'a spread made from ground peanuts', 'name': 'peanut_butter'}, {'frequency': 'c', 'id': 789, 'synset': 'pear.n.01', 'synonyms': ['pear'], 'def': 'sweet juicy gritty-textured fruit available in many varieties', 'name': 'pear'}, {'frequency': 'r', 'id': 790, 'synset': 'peeler.n.03', 'synonyms': ['peeler_(tool_for_fruit_and_vegetables)'], 'def': 'a device for peeling vegetables or fruits', 'name': 'peeler_(tool_for_fruit_and_vegetables)'}, {'frequency': 'r', 'id': 791, 'synset': 'pegboard.n.01', 'synonyms': ['pegboard'], 'def': 'a board perforated with regularly spaced holes into which pegs can be fitted', 'name': 'pegboard'}, {'frequency': 'c', 'id': 792, 'synset': 'pelican.n.01', 'synonyms': ['pelican'], 'def': 'large long-winged warm-water seabird having a large bill with a distensible pouch for fish', 'name': 'pelican'}, {'frequency': 'f', 'id': 793, 'synset': 'pen.n.01', 'synonyms': ['pen'], 'def': 'a writing implement with a point from which ink flows', 'name': 'pen'}, {'frequency': 'c', 'id': 794, 'synset': 'pencil.n.01', 'synonyms': ['pencil'], 'def': 'a thin cylindrical pointed writing implement made of wood and graphite', 'name': 'pencil'}, {'frequency': 'r', 'id': 795, 'synset': 'pencil_box.n.01', 'synonyms': ['pencil_box', 'pencil_case'], 'def': 'a box for holding pencils', 'name': 'pencil_box'}, {'frequency': 'r', 'id': 796, 'synset': 'pencil_sharpener.n.01', 'synonyms': ['pencil_sharpener'], 'def': 'a rotary implement for sharpening the point on pencils', 'name': 'pencil_sharpener'}, {'frequency': 'r', 'id': 797, 'synset': 'pendulum.n.01', 'synonyms': ['pendulum'], 'def': 'an apparatus consisting of an object mounted so that it swings freely under the influence of gravity', 'name': 'pendulum'}, {'frequency': 'c', 'id': 798, 'synset': 'penguin.n.01', 'synonyms': ['penguin'], 'def': 'short-legged flightless birds of cold southern regions having webbed feet and wings modified as flippers', 'name': 'penguin'}, {'frequency': 'r', 'id': 799, 'synset': 'pennant.n.02', 'synonyms': ['pennant'], 'def': 'a flag longer than it is wide (and often tapering)', 'name': 'pennant'}, {'frequency': 'r', 'id': 800, 'synset': 'penny.n.02', 'synonyms': ['penny_(coin)'], 'def': 'a coin worth one-hundredth of the value of the basic unit', 'name': 'penny_(coin)'}, {'frequency': 'c', 'id': 801, 'synset': 'pepper.n.03', 'synonyms': ['pepper', 'peppercorn'], 'def': 'pungent seasoning from the berry of the common pepper plant; whole or ground', 'name': 'pepper'}, {'frequency': 'c', 'id': 802, 'synset': 'pepper_mill.n.01', 'synonyms': ['pepper_mill', 'pepper_grinder'], 'def': 'a mill for grinding pepper', 'name': 'pepper_mill'}, {'frequency': 'c', 'id': 803, 'synset': 'perfume.n.02', 'synonyms': ['perfume'], 'def': 'a toiletry that emits and diffuses a fragrant odor', 'name': 'perfume'}, {'frequency': 'r', 'id': 804, 'synset': 'persimmon.n.02', 'synonyms': ['persimmon'], 'def': 'orange fruit resembling a plum; edible when fully ripe', 'name': 'persimmon'}, {'frequency': 'f', 'id': 805, 'synset': 'person.n.01', 'synonyms': ['baby', 'child', 'boy', 'girl', 'man', 'woman', 'person', 'human'], 'def': 'a human being', 'name': 'baby'}, {'frequency': 'r', 'id': 806, 'synset': 'pet.n.01', 'synonyms': ['pet'], 'def': 'a domesticated animal kept for companionship or amusement', 'name': 'pet'}, {'frequency': 'r', 'id': 807, 'synset': 'petfood.n.01', 'synonyms': ['petfood', 'pet-food'], 'def': 'food prepared for animal pets', 'name': 'petfood'}, {'frequency': 'r', 'id': 808, 'synset': 'pew.n.01', 'synonyms': ['pew_(church_bench)', 'church_bench'], 'def': 'long bench with backs; used in church by the congregation', 'name': 'pew_(church_bench)'}, {'frequency': 'r', 'id': 809, 'synset': 'phonebook.n.01', 'synonyms': ['phonebook', 'telephone_book', 'telephone_directory'], 'def': 'a directory containing an alphabetical list of telephone subscribers and their telephone numbers', 'name': 'phonebook'}, {'frequency': 'c', 'id': 810, 'synset': 'phonograph_record.n.01', 'synonyms': ['phonograph_record', 'phonograph_recording', 'record_(phonograph_recording)'], 'def': 'sound recording consisting of a typically black disk with a continuous groove', 'name': 'phonograph_record'}, {'frequency': 'c', 'id': 811, 'synset': 'piano.n.01', 'synonyms': ['piano'], 'def': 'a keyboard instrument that is played by depressing keys that cause hammers to strike tuned strings and produce sounds', 'name': 'piano'}, {'frequency': 'f', 'id': 812, 'synset': 'pickle.n.01', 'synonyms': ['pickle'], 'def': 'vegetables (especially cucumbers) preserved in brine or vinegar', 'name': 'pickle'}, {'frequency': 'f', 'id': 813, 'synset': 'pickup.n.01', 'synonyms': ['pickup_truck'], 'def': 'a light truck with an open body and low sides and a tailboard', 'name': 'pickup_truck'}, {'frequency': 'c', 'id': 814, 'synset': 'pie.n.01', 'synonyms': ['pie'], 'def': 'dish baked in pastry-lined pan often with a pastry top', 'name': 'pie'}, {'frequency': 'c', 'id': 815, 'synset': 'pigeon.n.01', 'synonyms': ['pigeon'], 'def': 'wild and domesticated birds having a heavy body and short legs', 'name': 'pigeon'}, {'frequency': 'r', 'id': 816, 'synset': 'piggy_bank.n.01', 'synonyms': ['piggy_bank', 'penny_bank'], 'def': "a child's coin bank (often shaped like a pig)", 'name': 'piggy_bank'}, {'frequency': 'f', 'id': 817, 'synset': 'pillow.n.01', 'synonyms': ['pillow'], 'def': 'a cushion to support the head of a sleeping person', 'name': 'pillow'}, {'frequency': 'r', 'id': 818, 'synset': 'pin.n.09', 'synonyms': ['pin_(non_jewelry)'], 'def': 'a small slender (often pointed) piece of wood or metal used to support or fasten or attach things', 'name': 'pin_(non_jewelry)'}, {'frequency': 'f', 'id': 819, 'synset': 'pineapple.n.02', 'synonyms': ['pineapple'], 'def': 'large sweet fleshy tropical fruit with a tuft of stiff leaves', 'name': 'pineapple'}, {'frequency': 'c', 'id': 820, 'synset': 'pinecone.n.01', 'synonyms': ['pinecone'], 'def': 'the seed-producing cone of a pine tree', 'name': 'pinecone'}, {'frequency': 'r', 'id': 821, 'synset': 'ping-pong_ball.n.01', 'synonyms': ['ping-pong_ball'], 'def': 'light hollow ball used in playing table tennis', 'name': 'ping-pong_ball'}, {'frequency': 'r', 'id': 822, 'synset': 'pinwheel.n.03', 'synonyms': ['pinwheel'], 'def': 'a toy consisting of vanes of colored paper or plastic that is pinned to a stick and spins when it is pointed into the wind', 'name': 'pinwheel'}, {'frequency': 'r', 'id': 823, 'synset': 'pipe.n.01', 'synonyms': ['tobacco_pipe'], 'def': 'a tube with a small bowl at one end; used for smoking tobacco', 'name': 'tobacco_pipe'}, {'frequency': 'f', 'id': 824, 'synset': 'pipe.n.02', 'synonyms': ['pipe', 'piping'], 'def': 'a long tube made of metal or plastic that is used to carry water or oil or gas etc.', 'name': 'pipe'}, {'frequency': 'r', 'id': 825, 'synset': 'pistol.n.01', 'synonyms': ['pistol', 'handgun'], 'def': 'a firearm that is held and fired with one hand', 'name': 'pistol'}, {'frequency': 'r', 'id': 826, 'synset': 'pita.n.01', 'synonyms': ['pita_(bread)', 'pocket_bread'], 'def': 'usually small round bread that can open into a pocket for filling', 'name': 'pita_(bread)'}, {'frequency': 'f', 'id': 827, 'synset': 'pitcher.n.02', 'synonyms': ['pitcher_(vessel_for_liquid)', 'ewer'], 'def': 'an open vessel with a handle and a spout for pouring', 'name': 'pitcher_(vessel_for_liquid)'}, {'frequency': 'r', 'id': 828, 'synset': 'pitchfork.n.01', 'synonyms': ['pitchfork'], 'def': 'a long-handled hand tool with sharp widely spaced prongs for lifting and pitching hay', 'name': 'pitchfork'}, {'frequency': 'f', 'id': 829, 'synset': 'pizza.n.01', 'synonyms': ['pizza'], 'def': 'Italian open pie made of thin bread dough spread with a spiced mixture of e.g. tomato sauce and cheese', 'name': 'pizza'}, {'frequency': 'f', 'id': 830, 'synset': 'place_mat.n.01', 'synonyms': ['place_mat'], 'def': 'a mat placed on a table for an individual place setting', 'name': 'place_mat'}, {'frequency': 'f', 'id': 831, 'synset': 'plate.n.04', 'synonyms': ['plate'], 'def': 'dish on which food is served or from which food is eaten', 'name': 'plate'}, {'frequency': 'c', 'id': 832, 'synset': 'platter.n.01', 'synonyms': ['platter'], 'def': 'a large shallow dish used for serving food', 'name': 'platter'}, {'frequency': 'r', 'id': 833, 'synset': 'playing_card.n.01', 'synonyms': ['playing_card'], 'def': 'one of a pack of cards that are used to play card games', 'name': 'playing_card'}, {'frequency': 'r', 'id': 834, 'synset': 'playpen.n.01', 'synonyms': ['playpen'], 'def': 'a portable enclosure in which babies may be left to play', 'name': 'playpen'}, {'frequency': 'c', 'id': 835, 'synset': 'pliers.n.01', 'synonyms': ['pliers', 'plyers'], 'def': 'a gripping hand tool with two hinged arms and (usually) serrated jaws', 'name': 'pliers'}, {'frequency': 'r', 'id': 836, 'synset': 'plow.n.01', 'synonyms': ['plow_(farm_equipment)', 'plough_(farm_equipment)'], 'def': 'a farm tool having one or more heavy blades to break the soil and cut a furrow prior to sowing', 'name': 'plow_(farm_equipment)'}, {'frequency': 'r', 'id': 837, 'synset': 'pocket_watch.n.01', 'synonyms': ['pocket_watch'], 'def': 'a watch that is carried in a small watch pocket', 'name': 'pocket_watch'}, {'frequency': 'c', 'id': 838, 'synset': 'pocketknife.n.01', 'synonyms': ['pocketknife'], 'def': 'a knife with a blade that folds into the handle; suitable for carrying in the pocket', 'name': 'pocketknife'}, {'frequency': 'c', 'id': 839, 'synset': 'poker.n.01', 'synonyms': ['poker_(fire_stirring_tool)', 'stove_poker', 'fire_hook'], 'def': 'fire iron consisting of a metal rod with a handle; used to stir a fire', 'name': 'poker_(fire_stirring_tool)'}, {'frequency': 'f', 'id': 840, 'synset': 'pole.n.01', 'synonyms': ['pole', 'post'], 'def': 'a long (usually round) rod of wood or metal or plastic', 'name': 'pole'}, {'frequency': 'r', 'id': 841, 'synset': 'police_van.n.01', 'synonyms': ['police_van', 'police_wagon', 'paddy_wagon', 'patrol_wagon'], 'def': 'van used by police to transport prisoners', 'name': 'police_van'}, {'frequency': 'f', 'id': 842, 'synset': 'polo_shirt.n.01', 'synonyms': ['polo_shirt', 'sport_shirt'], 'def': 'a shirt with short sleeves designed for comfort and casual wear', 'name': 'polo_shirt'}, {'frequency': 'r', 'id': 843, 'synset': 'poncho.n.01', 'synonyms': ['poncho'], 'def': 'a blanket-like cloak with a hole in the center for the head', 'name': 'poncho'}, {'frequency': 'c', 'id': 844, 'synset': 'pony.n.05', 'synonyms': ['pony'], 'def': 'any of various breeds of small gentle horses usually less than five feet high at the shoulder', 'name': 'pony'}, {'frequency': 'r', 'id': 845, 'synset': 'pool_table.n.01', 'synonyms': ['pool_table', 'billiard_table', 'snooker_table'], 'def': 'game equipment consisting of a heavy table on which pool is played', 'name': 'pool_table'}, {'frequency': 'f', 'id': 846, 'synset': 'pop.n.02', 'synonyms': ['pop_(soda)', 'soda_(pop)', 'tonic', 'soft_drink'], 'def': 'a sweet drink containing carbonated water and flavoring', 'name': 'pop_(soda)'}, {'frequency': 'r', 'id': 847, 'synset': 'portrait.n.02', 'synonyms': ['portrait', 'portrayal'], 'def': 'any likeness of a person, in any medium', 'name': 'portrait'}, {'frequency': 'c', 'id': 848, 'synset': 'postbox.n.01', 'synonyms': ['postbox_(public)', 'mailbox_(public)'], 'def': 'public box for deposit of mail', 'name': 'postbox_(public)'}, {'frequency': 'c', 'id': 849, 'synset': 'postcard.n.01', 'synonyms': ['postcard', 'postal_card', 'mailing-card'], 'def': 'a card for sending messages by post without an envelope', 'name': 'postcard'}, {'frequency': 'f', 'id': 850, 'synset': 'poster.n.01', 'synonyms': ['poster', 'placard'], 'def': 'a sign posted in a public place as an advertisement', 'name': 'poster'}, {'frequency': 'f', 'id': 851, 'synset': 'pot.n.01', 'synonyms': ['pot'], 'def': 'metal or earthenware cooking vessel that is usually round and deep; often has a handle and lid', 'name': 'pot'}, {'frequency': 'f', 'id': 852, 'synset': 'pot.n.04', 'synonyms': ['flowerpot'], 'def': 'a container in which plants are cultivated', 'name': 'flowerpot'}, {'frequency': 'f', 'id': 853, 'synset': 'potato.n.01', 'synonyms': ['potato'], 'def': 'an edible tuber native to South America', 'name': 'potato'}, {'frequency': 'c', 'id': 854, 'synset': 'potholder.n.01', 'synonyms': ['potholder'], 'def': 'an insulated pad for holding hot pots', 'name': 'potholder'}, {'frequency': 'c', 'id': 855, 'synset': 'pottery.n.01', 'synonyms': ['pottery', 'clayware'], 'def': 'ceramic ware made from clay and baked in a kiln', 'name': 'pottery'}, {'frequency': 'c', 'id': 856, 'synset': 'pouch.n.01', 'synonyms': ['pouch'], 'def': 'a small or medium size container for holding or carrying things', 'name': 'pouch'}, {'frequency': 'r', 'id': 857, 'synset': 'power_shovel.n.01', 'synonyms': ['power_shovel', 'excavator', 'digger'], 'def': 'a machine for excavating', 'name': 'power_shovel'}, {'frequency': 'c', 'id': 858, 'synset': 'prawn.n.01', 'synonyms': ['prawn', 'shrimp'], 'def': 'any of various edible decapod crustaceans', 'name': 'prawn'}, {'frequency': 'f', 'id': 859, 'synset': 'printer.n.03', 'synonyms': ['printer', 'printing_machine'], 'def': 'a machine that prints', 'name': 'printer'}, {'frequency': 'c', 'id': 860, 'synset': 'projectile.n.01', 'synonyms': ['projectile_(weapon)', 'missile'], 'def': 'a weapon that is forcibly thrown or projected at a targets', 'name': 'projectile_(weapon)'}, {'frequency': 'c', 'id': 861, 'synset': 'projector.n.02', 'synonyms': ['projector'], 'def': 'an optical instrument that projects an enlarged image onto a screen', 'name': 'projector'}, {'frequency': 'f', 'id': 862, 'synset': 'propeller.n.01', 'synonyms': ['propeller', 'propellor'], 'def': 'a mechanical device that rotates to push against air or water', 'name': 'propeller'}, {'frequency': 'r', 'id': 863, 'synset': 'prune.n.01', 'synonyms': ['prune'], 'def': 'dried plum', 'name': 'prune'}, {'frequency': 'r', 'id': 864, 'synset': 'pudding.n.01', 'synonyms': ['pudding'], 'def': 'any of various soft thick unsweetened baked dishes', 'name': 'pudding'}, {'frequency': 'r', 'id': 865, 'synset': 'puffer.n.02', 'synonyms': ['puffer_(fish)', 'pufferfish', 'blowfish', 'globefish'], 'def': 'fishes whose elongated spiny body can inflate itself with water or air to form a globe', 'name': 'puffer_(fish)'}, {'frequency': 'r', 'id': 866, 'synset': 'puffin.n.01', 'synonyms': ['puffin'], 'def': 'seabirds having short necks and brightly colored compressed bills', 'name': 'puffin'}, {'frequency': 'r', 'id': 867, 'synset': 'pug.n.01', 'synonyms': ['pug-dog'], 'def': 'small compact smooth-coated breed of Asiatic origin having a tightly curled tail and broad flat wrinkled muzzle', 'name': 'pug-dog'}, {'frequency': 'c', 'id': 868, 'synset': 'pumpkin.n.02', 'synonyms': ['pumpkin'], 'def': 'usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn', 'name': 'pumpkin'}, {'frequency': 'r', 'id': 869, 'synset': 'punch.n.03', 'synonyms': ['puncher'], 'def': 'a tool for making holes or indentations', 'name': 'puncher'}, {'frequency': 'r', 'id': 870, 'synset': 'puppet.n.01', 'synonyms': ['puppet', 'marionette'], 'def': 'a small figure of a person operated from above with strings by a puppeteer', 'name': 'puppet'}, {'frequency': 'r', 'id': 871, 'synset': 'puppy.n.01', 'synonyms': ['puppy'], 'def': 'a young dog', 'name': 'puppy'}, {'frequency': 'r', 'id': 872, 'synset': 'quesadilla.n.01', 'synonyms': ['quesadilla'], 'def': 'a tortilla that is filled with cheese and heated', 'name': 'quesadilla'}, {'frequency': 'r', 'id': 873, 'synset': 'quiche.n.02', 'synonyms': ['quiche'], 'def': 'a tart filled with rich unsweetened custard; often contains other ingredients (as cheese or ham or seafood or vegetables)', 'name': 'quiche'}, {'frequency': 'f', 'id': 874, 'synset': 'quilt.n.01', 'synonyms': ['quilt', 'comforter'], 'def': 'bedding made of two layers of cloth filled with stuffing and stitched together', 'name': 'quilt'}, {'frequency': 'c', 'id': 875, 'synset': 'rabbit.n.01', 'synonyms': ['rabbit'], 'def': 'any of various burrowing animals of the family Leporidae having long ears and short tails', 'name': 'rabbit'}, {'frequency': 'r', 'id': 876, 'synset': 'racer.n.02', 'synonyms': ['race_car', 'racing_car'], 'def': 'a fast car that competes in races', 'name': 'race_car'}, {'frequency': 'c', 'id': 877, 'synset': 'racket.n.04', 'synonyms': ['racket', 'racquet'], 'def': 'a sports implement used to strike a ball in various games', 'name': 'racket'}, {'frequency': 'r', 'id': 878, 'synset': 'radar.n.01', 'synonyms': ['radar'], 'def': 'measuring instrument in which the echo of a pulse of microwave radiation is used to detect and locate distant objects', 'name': 'radar'}, {'frequency': 'c', 'id': 879, 'synset': 'radiator.n.03', 'synonyms': ['radiator'], 'def': 'a mechanism consisting of a metal honeycomb through which hot fluids circulate', 'name': 'radiator'}, {'frequency': 'c', 'id': 880, 'synset': 'radio_receiver.n.01', 'synonyms': ['radio_receiver', 'radio_set', 'radio', 'tuner_(radio)'], 'def': 'an electronic receiver that detects and demodulates and amplifies transmitted radio signals', 'name': 'radio_receiver'}, {'frequency': 'c', 'id': 881, 'synset': 'radish.n.03', 'synonyms': ['radish', 'daikon'], 'def': 'pungent edible root of any of various cultivated radish plants', 'name': 'radish'}, {'frequency': 'c', 'id': 882, 'synset': 'raft.n.01', 'synonyms': ['raft'], 'def': 'a flat float (usually made of logs or planks) that can be used for transport or as a platform for swimmers', 'name': 'raft'}, {'frequency': 'r', 'id': 883, 'synset': 'rag_doll.n.01', 'synonyms': ['rag_doll'], 'def': 'a cloth doll that is stuffed and (usually) painted', 'name': 'rag_doll'}, {'frequency': 'c', 'id': 884, 'synset': 'raincoat.n.01', 'synonyms': ['raincoat', 'waterproof_jacket'], 'def': 'a water-resistant coat', 'name': 'raincoat'}, {'frequency': 'c', 'id': 885, 'synset': 'ram.n.05', 'synonyms': ['ram_(animal)'], 'def': 'uncastrated adult male sheep', 'name': 'ram_(animal)'}, {'frequency': 'c', 'id': 886, 'synset': 'raspberry.n.02', 'synonyms': ['raspberry'], 'def': 'red or black edible aggregate berries usually smaller than the related blackberries', 'name': 'raspberry'}, {'frequency': 'r', 'id': 887, 'synset': 'rat.n.01', 'synonyms': ['rat'], 'def': 'any of various long-tailed rodents similar to but larger than a mouse', 'name': 'rat'}, {'frequency': 'c', 'id': 888, 'synset': 'razorblade.n.01', 'synonyms': ['razorblade'], 'def': 'a blade that has very sharp edge', 'name': 'razorblade'}, {'frequency': 'c', 'id': 889, 'synset': 'reamer.n.01', 'synonyms': ['reamer_(juicer)', 'juicer', 'juice_reamer'], 'def': 'a squeezer with a conical ridged center that is used for squeezing juice from citrus fruit', 'name': 'reamer_(juicer)'}, {'frequency': 'f', 'id': 890, 'synset': 'rearview_mirror.n.01', 'synonyms': ['rearview_mirror'], 'def': 'car mirror that reflects the view out of the rear window', 'name': 'rearview_mirror'}, {'frequency': 'c', 'id': 891, 'synset': 'receipt.n.02', 'synonyms': ['receipt'], 'def': 'an acknowledgment (usually tangible) that payment has been made', 'name': 'receipt'}, {'frequency': 'c', 'id': 892, 'synset': 'recliner.n.01', 'synonyms': ['recliner', 'reclining_chair', 'lounger_(chair)'], 'def': 'an armchair whose back can be lowered and foot can be raised to allow the sitter to recline in it', 'name': 'recliner'}, {'frequency': 'r', 'id': 893, 'synset': 'record_player.n.01', 'synonyms': ['record_player', 'phonograph_(record_player)', 'turntable'], 'def': 'machine in which rotating records cause a stylus to vibrate and the vibrations are amplified acoustically or electronically', 'name': 'record_player'}, {'frequency': 'r', 'id': 894, 'synset': 'red_cabbage.n.02', 'synonyms': ['red_cabbage'], 'def': 'compact head of purplish-red leaves', 'name': 'red_cabbage'}, {'frequency': 'f', 'id': 895, 'synset': 'reflector.n.01', 'synonyms': ['reflector'], 'def': 'device that reflects light, radiation, etc.', 'name': 'reflector'}, {'frequency': 'f', 'id': 896, 'synset': 'remote_control.n.01', 'synonyms': ['remote_control'], 'def': 'a device that can be used to control a machine or apparatus from a distance', 'name': 'remote_control'}, {'frequency': 'c', 'id': 897, 'synset': 'rhinoceros.n.01', 'synonyms': ['rhinoceros'], 'def': 'massive powerful herbivorous odd-toed ungulate of southeast Asia and Africa having very thick skin and one or two horns on the snout', 'name': 'rhinoceros'}, {'frequency': 'r', 'id': 898, 'synset': 'rib.n.03', 'synonyms': ['rib_(food)'], 'def': 'cut of meat including one or more ribs', 'name': 'rib_(food)'}, {'frequency': 'r', 'id': 899, 'synset': 'rifle.n.01', 'synonyms': ['rifle'], 'def': 'a shoulder firearm with a long barrel', 'name': 'rifle'}, {'frequency': 'f', 'id': 900, 'synset': 'ring.n.08', 'synonyms': ['ring'], 'def': 'jewelry consisting of a circlet of precious metal (often set with jewels) worn on the finger', 'name': 'ring'}, {'frequency': 'r', 'id': 901, 'synset': 'river_boat.n.01', 'synonyms': ['river_boat'], 'def': 'a boat used on rivers or to ply a river', 'name': 'river_boat'}, {'frequency': 'r', 'id': 902, 'synset': 'road_map.n.02', 'synonyms': ['road_map'], 'def': '(NOT A ROAD) a MAP showing roads (for automobile travel)', 'name': 'road_map'}, {'frequency': 'c', 'id': 903, 'synset': 'robe.n.01', 'synonyms': ['robe'], 'def': 'any loose flowing garment', 'name': 'robe'}, {'frequency': 'c', 'id': 904, 'synset': 'rocking_chair.n.01', 'synonyms': ['rocking_chair'], 'def': 'a chair mounted on rockers', 'name': 'rocking_chair'}, {'frequency': 'r', 'id': 905, 'synset': 'roller_skate.n.01', 'synonyms': ['roller_skate'], 'def': 'a shoe with pairs of rollers (small hard wheels) fixed to the sole', 'name': 'roller_skate'}, {'frequency': 'r', 'id': 906, 'synset': 'rollerblade.n.01', 'synonyms': ['Rollerblade'], 'def': 'an in-line variant of a roller skate', 'name': 'Rollerblade'}, {'frequency': 'c', 'id': 907, 'synset': 'rolling_pin.n.01', 'synonyms': ['rolling_pin'], 'def': 'utensil consisting of a cylinder (usually of wood) with a handle at each end; used to roll out dough', 'name': 'rolling_pin'}, {'frequency': 'r', 'id': 908, 'synset': 'root_beer.n.01', 'synonyms': ['root_beer'], 'def': 'carbonated drink containing extracts of roots and herbs', 'name': 'root_beer'}, {'frequency': 'c', 'id': 909, 'synset': 'router.n.02', 'synonyms': ['router_(computer_equipment)'], 'def': 'a device that forwards data packets between computer networks', 'name': 'router_(computer_equipment)'}, {'frequency': 'f', 'id': 910, 'synset': 'rubber_band.n.01', 'synonyms': ['rubber_band', 'elastic_band'], 'def': 'a narrow band of elastic rubber used to hold things (such as papers) together', 'name': 'rubber_band'}, {'frequency': 'c', 'id': 911, 'synset': 'runner.n.08', 'synonyms': ['runner_(carpet)'], 'def': 'a long narrow carpet', 'name': 'runner_(carpet)'}, {'frequency': 'f', 'id': 912, 'synset': 'sack.n.01', 'synonyms': ['plastic_bag', 'paper_bag'], 'def': "a bag made of paper or plastic for holding customer's purchases", 'name': 'plastic_bag'}, {'frequency': 'f', 'id': 913, 'synset': 'saddle.n.01', 'synonyms': ['saddle_(on_an_animal)'], 'def': 'a seat for the rider of a horse or camel', 'name': 'saddle_(on_an_animal)'}, {'frequency': 'f', 'id': 914, 'synset': 'saddle_blanket.n.01', 'synonyms': ['saddle_blanket', 'saddlecloth', 'horse_blanket'], 'def': 'stable gear consisting of a blanket placed under the saddle', 'name': 'saddle_blanket'}, {'frequency': 'c', 'id': 915, 'synset': 'saddlebag.n.01', 'synonyms': ['saddlebag'], 'def': 'a large bag (or pair of bags) hung over a saddle', 'name': 'saddlebag'}, {'frequency': 'r', 'id': 916, 'synset': 'safety_pin.n.01', 'synonyms': ['safety_pin'], 'def': 'a pin in the form of a clasp; has a guard so the point of the pin will not stick the user', 'name': 'safety_pin'}, {'frequency': 'c', 'id': 917, 'synset': 'sail.n.01', 'synonyms': ['sail'], 'def': 'a large piece of fabric by means of which wind is used to propel a sailing vessel', 'name': 'sail'}, {'frequency': 'c', 'id': 918, 'synset': 'salad.n.01', 'synonyms': ['salad'], 'def': 'food mixtures either arranged on a plate or tossed and served with a moist dressing; usually consisting of or including greens', 'name': 'salad'}, {'frequency': 'r', 'id': 919, 'synset': 'salad_plate.n.01', 'synonyms': ['salad_plate', 'salad_bowl'], 'def': 'a plate or bowl for individual servings of salad', 'name': 'salad_plate'}, {'frequency': 'r', 'id': 920, 'synset': 'salami.n.01', 'synonyms': ['salami'], 'def': 'highly seasoned fatty sausage of pork and beef usually dried', 'name': 'salami'}, {'frequency': 'r', 'id': 921, 'synset': 'salmon.n.01', 'synonyms': ['salmon_(fish)'], 'def': 'any of various large food and game fishes of northern waters', 'name': 'salmon_(fish)'}, {'frequency': 'r', 'id': 922, 'synset': 'salmon.n.03', 'synonyms': ['salmon_(food)'], 'def': 'flesh of any of various marine or freshwater fish of the family Salmonidae', 'name': 'salmon_(food)'}, {'frequency': 'r', 'id': 923, 'synset': 'salsa.n.01', 'synonyms': ['salsa'], 'def': 'spicy sauce of tomatoes and onions and chili peppers to accompany Mexican foods', 'name': 'salsa'}, {'frequency': 'f', 'id': 924, 'synset': 'saltshaker.n.01', 'synonyms': ['saltshaker'], 'def': 'a shaker with a perforated top for sprinkling salt', 'name': 'saltshaker'}, {'frequency': 'f', 'id': 925, 'synset': 'sandal.n.01', 'synonyms': ['sandal_(type_of_shoe)'], 'def': 'a shoe consisting of a sole fastened by straps to the foot', 'name': 'sandal_(type_of_shoe)'}, {'frequency': 'f', 'id': 926, 'synset': 'sandwich.n.01', 'synonyms': ['sandwich'], 'def': 'two (or more) slices of bread with a filling between them', 'name': 'sandwich'}, {'frequency': 'r', 'id': 927, 'synset': 'satchel.n.01', 'synonyms': ['satchel'], 'def': 'luggage consisting of a small case with a flat bottom and (usually) a shoulder strap', 'name': 'satchel'}, {'frequency': 'r', 'id': 928, 'synset': 'saucepan.n.01', 'synonyms': ['saucepan'], 'def': 'a deep pan with a handle; used for stewing or boiling', 'name': 'saucepan'}, {'frequency': 'f', 'id': 929, 'synset': 'saucer.n.02', 'synonyms': ['saucer'], 'def': 'a small shallow dish for holding a cup at the table', 'name': 'saucer'}, {'frequency': 'f', 'id': 930, 'synset': 'sausage.n.01', 'synonyms': ['sausage'], 'def': 'highly seasoned minced meat stuffed in casings', 'name': 'sausage'}, {'frequency': 'r', 'id': 931, 'synset': 'sawhorse.n.01', 'synonyms': ['sawhorse', 'sawbuck'], 'def': 'a framework for holding wood that is being sawed', 'name': 'sawhorse'}, {'frequency': 'r', 'id': 932, 'synset': 'sax.n.02', 'synonyms': ['saxophone'], 'def': "a wind instrument with a `J'-shaped form typically made of brass", 'name': 'saxophone'}, {'frequency': 'f', 'id': 933, 'synset': 'scale.n.07', 'synonyms': ['scale_(measuring_instrument)'], 'def': 'a measuring instrument for weighing; shows amount of mass', 'name': 'scale_(measuring_instrument)'}, {'frequency': 'r', 'id': 934, 'synset': 'scarecrow.n.01', 'synonyms': ['scarecrow', 'strawman'], 'def': 'an effigy in the shape of a man to frighten birds away from seeds', 'name': 'scarecrow'}, {'frequency': 'f', 'id': 935, 'synset': 'scarf.n.01', 'synonyms': ['scarf'], 'def': 'a garment worn around the head or neck or shoulders for warmth or decoration', 'name': 'scarf'}, {'frequency': 'c', 'id': 936, 'synset': 'school_bus.n.01', 'synonyms': ['school_bus'], 'def': 'a bus used to transport children to or from school', 'name': 'school_bus'}, {'frequency': 'f', 'id': 937, 'synset': 'scissors.n.01', 'synonyms': ['scissors'], 'def': 'a tool having two crossed pivoting blades with looped handles', 'name': 'scissors'}, {'frequency': 'c', 'id': 938, 'synset': 'scoreboard.n.01', 'synonyms': ['scoreboard'], 'def': 'a large board for displaying the score of a contest (and some other information)', 'name': 'scoreboard'}, {'frequency': 'c', 'id': 939, 'synset': 'scrambled_eggs.n.01', 'synonyms': ['scrambled_eggs'], 'def': 'eggs beaten and cooked to a soft firm consistency while stirring', 'name': 'scrambled_eggs'}, {'frequency': 'r', 'id': 940, 'synset': 'scraper.n.01', 'synonyms': ['scraper'], 'def': 'any of various hand tools for scraping', 'name': 'scraper'}, {'frequency': 'r', 'id': 941, 'synset': 'scratcher.n.03', 'synonyms': ['scratcher'], 'def': 'a device used for scratching', 'name': 'scratcher'}, {'frequency': 'c', 'id': 942, 'synset': 'screwdriver.n.01', 'synonyms': ['screwdriver'], 'def': 'a hand tool for driving screws; has a tip that fits into the head of a screw', 'name': 'screwdriver'}, {'frequency': 'c', 'id': 943, 'synset': 'scrub_brush.n.01', 'synonyms': ['scrubbing_brush'], 'def': 'a brush with short stiff bristles for heavy cleaning', 'name': 'scrubbing_brush'}, {'frequency': 'c', 'id': 944, 'synset': 'sculpture.n.01', 'synonyms': ['sculpture'], 'def': 'a three-dimensional work of art', 'name': 'sculpture'}, {'frequency': 'r', 'id': 945, 'synset': 'seabird.n.01', 'synonyms': ['seabird', 'seafowl'], 'def': 'a bird that frequents coastal waters and the open ocean: gulls; pelicans; gannets; cormorants; albatrosses; petrels; etc.', 'name': 'seabird'}, {'frequency': 'r', 'id': 946, 'synset': 'seahorse.n.02', 'synonyms': ['seahorse'], 'def': 'small fish with horse-like heads bent sharply downward and curled tails', 'name': 'seahorse'}, {'frequency': 'r', 'id': 947, 'synset': 'seaplane.n.01', 'synonyms': ['seaplane', 'hydroplane'], 'def': 'an airplane that can land on or take off from water', 'name': 'seaplane'}, {'frequency': 'c', 'id': 948, 'synset': 'seashell.n.01', 'synonyms': ['seashell'], 'def': 'the shell of a marine organism', 'name': 'seashell'}, {'frequency': 'r', 'id': 949, 'synset': 'seedling.n.01', 'synonyms': ['seedling'], 'def': 'young plant or tree grown from a seed', 'name': 'seedling'}, {'frequency': 'c', 'id': 950, 'synset': 'serving_dish.n.01', 'synonyms': ['serving_dish'], 'def': 'a dish used for serving food', 'name': 'serving_dish'}, {'frequency': 'r', 'id': 951, 'synset': 'sewing_machine.n.01', 'synonyms': ['sewing_machine'], 'def': 'a textile machine used as a home appliance for sewing', 'name': 'sewing_machine'}, {'frequency': 'r', 'id': 952, 'synset': 'shaker.n.03', 'synonyms': ['shaker'], 'def': 'a container in which something can be shaken', 'name': 'shaker'}, {'frequency': 'c', 'id': 953, 'synset': 'shampoo.n.01', 'synonyms': ['shampoo'], 'def': 'cleansing agent consisting of soaps or detergents used for washing the hair', 'name': 'shampoo'}, {'frequency': 'r', 'id': 954, 'synset': 'shark.n.01', 'synonyms': ['shark'], 'def': 'typically large carnivorous fishes with sharpe teeth', 'name': 'shark'}, {'frequency': 'r', 'id': 955, 'synset': 'sharpener.n.01', 'synonyms': ['sharpener'], 'def': 'any implement that is used to make something (an edge or a point) sharper', 'name': 'sharpener'}, {'frequency': 'r', 'id': 956, 'synset': 'sharpie.n.03', 'synonyms': ['Sharpie'], 'def': 'a pen with indelible ink that will write on any surface', 'name': 'Sharpie'}, {'frequency': 'r', 'id': 957, 'synset': 'shaver.n.03', 'synonyms': ['shaver_(electric)', 'electric_shaver', 'electric_razor'], 'def': 'a razor powered by an electric motor', 'name': 'shaver_(electric)'}, {'frequency': 'c', 'id': 958, 'synset': 'shaving_cream.n.01', 'synonyms': ['shaving_cream', 'shaving_soap'], 'def': 'toiletry consisting that forms a rich lather for softening the beard before shaving', 'name': 'shaving_cream'}, {'frequency': 'r', 'id': 959, 'synset': 'shawl.n.01', 'synonyms': ['shawl'], 'def': 'cloak consisting of an oblong piece of cloth used to cover the head and shoulders', 'name': 'shawl'}, {'frequency': 'r', 'id': 960, 'synset': 'shears.n.01', 'synonyms': ['shears'], 'def': 'large scissors with strong blades', 'name': 'shears'}, {'frequency': 'f', 'id': 961, 'synset': 'sheep.n.01', 'synonyms': ['sheep'], 'def': 'woolly usually horned ruminant mammal related to the goat', 'name': 'sheep'}, {'frequency': 'r', 'id': 962, 'synset': 'shepherd_dog.n.01', 'synonyms': ['shepherd_dog', 'sheepdog'], 'def': 'any of various usually long-haired breeds of dog reared to herd and guard sheep', 'name': 'shepherd_dog'}, {'frequency': 'r', 'id': 963, 'synset': 'sherbert.n.01', 'synonyms': ['sherbert', 'sherbet'], 'def': 'a frozen dessert made primarily of fruit juice and sugar', 'name': 'sherbert'}, {'frequency': 'r', 'id': 964, 'synset': 'shield.n.02', 'synonyms': ['shield'], 'def': 'armor carried on the arm to intercept blows', 'name': 'shield'}, {'frequency': 'f', 'id': 965, 'synset': 'shirt.n.01', 'synonyms': ['shirt'], 'def': 'a garment worn on the upper half of the body', 'name': 'shirt'}, {'frequency': 'f', 'id': 966, 'synset': 'shoe.n.01', 'synonyms': ['shoe', 'sneaker_(type_of_shoe)', 'tennis_shoe'], 'def': 'common footwear covering the foot', 'name': 'shoe'}, {'frequency': 'c', 'id': 967, 'synset': 'shopping_bag.n.01', 'synonyms': ['shopping_bag'], 'def': 'a bag made of plastic or strong paper (often with handles); used to transport goods after shopping', 'name': 'shopping_bag'}, {'frequency': 'c', 'id': 968, 'synset': 'shopping_cart.n.01', 'synonyms': ['shopping_cart'], 'def': 'a handcart that holds groceries or other goods while shopping', 'name': 'shopping_cart'}, {'frequency': 'f', 'id': 969, 'synset': 'short_pants.n.01', 'synonyms': ['short_pants', 'shorts_(clothing)', 'trunks_(clothing)'], 'def': 'trousers that end at or above the knee', 'name': 'short_pants'}, {'frequency': 'r', 'id': 970, 'synset': 'shot_glass.n.01', 'synonyms': ['shot_glass'], 'def': 'a small glass adequate to hold a single swallow of whiskey', 'name': 'shot_glass'}, {'frequency': 'c', 'id': 971, 'synset': 'shoulder_bag.n.01', 'synonyms': ['shoulder_bag'], 'def': 'a large handbag that can be carried by a strap looped over the shoulder', 'name': 'shoulder_bag'}, {'frequency': 'c', 'id': 972, 'synset': 'shovel.n.01', 'synonyms': ['shovel'], 'def': 'a hand tool for lifting loose material such as snow, dirt, etc.', 'name': 'shovel'}, {'frequency': 'f', 'id': 973, 'synset': 'shower.n.01', 'synonyms': ['shower_head'], 'def': 'a plumbing fixture that sprays water over you', 'name': 'shower_head'}, {'frequency': 'f', 'id': 974, 'synset': 'shower_curtain.n.01', 'synonyms': ['shower_curtain'], 'def': 'a curtain that keeps water from splashing out of the shower area', 'name': 'shower_curtain'}, {'frequency': 'r', 'id': 975, 'synset': 'shredder.n.01', 'synonyms': ['shredder_(for_paper)'], 'def': 'a device that shreds documents', 'name': 'shredder_(for_paper)'}, {'frequency': 'r', 'id': 976, 'synset': 'sieve.n.01', 'synonyms': ['sieve', 'screen_(sieve)'], 'def': 'a strainer for separating lumps from powdered material or grading particles', 'name': 'sieve'}, {'frequency': 'f', 'id': 977, 'synset': 'signboard.n.01', 'synonyms': ['signboard'], 'def': 'structure displaying a board on which advertisements can be posted', 'name': 'signboard'}, {'frequency': 'c', 'id': 978, 'synset': 'silo.n.01', 'synonyms': ['silo'], 'def': 'a cylindrical tower used for storing goods', 'name': 'silo'}, {'frequency': 'f', 'id': 979, 'synset': 'sink.n.01', 'synonyms': ['sink'], 'def': 'plumbing fixture consisting of a water basin fixed to a wall or floor and having a drainpipe', 'name': 'sink'}, {'frequency': 'f', 'id': 980, 'synset': 'skateboard.n.01', 'synonyms': ['skateboard'], 'def': 'a board with wheels that is ridden in a standing or crouching position and propelled by foot', 'name': 'skateboard'}, {'frequency': 'c', 'id': 981, 'synset': 'skewer.n.01', 'synonyms': ['skewer'], 'def': 'a long pin for holding meat in position while it is being roasted', 'name': 'skewer'}, {'frequency': 'f', 'id': 982, 'synset': 'ski.n.01', 'synonyms': ['ski'], 'def': 'sports equipment for skiing on snow', 'name': 'ski'}, {'frequency': 'f', 'id': 983, 'synset': 'ski_boot.n.01', 'synonyms': ['ski_boot'], 'def': 'a stiff boot that is fastened to a ski with a ski binding', 'name': 'ski_boot'}, {'frequency': 'f', 'id': 984, 'synset': 'ski_parka.n.01', 'synonyms': ['ski_parka', 'ski_jacket'], 'def': 'a parka to be worn while skiing', 'name': 'ski_parka'}, {'frequency': 'f', 'id': 985, 'synset': 'ski_pole.n.01', 'synonyms': ['ski_pole'], 'def': 'a pole with metal points used as an aid in skiing', 'name': 'ski_pole'}, {'frequency': 'f', 'id': 986, 'synset': 'skirt.n.02', 'synonyms': ['skirt'], 'def': 'a garment hanging from the waist; worn mainly by girls and women', 'name': 'skirt'}, {'frequency': 'c', 'id': 987, 'synset': 'sled.n.01', 'synonyms': ['sled', 'sledge', 'sleigh'], 'def': 'a vehicle or flat object for transportation over snow by sliding or pulled by dogs, etc.', 'name': 'sled'}, {'frequency': 'c', 'id': 988, 'synset': 'sleeping_bag.n.01', 'synonyms': ['sleeping_bag'], 'def': 'large padded bag designed to be slept in outdoors', 'name': 'sleeping_bag'}, {'frequency': 'r', 'id': 989, 'synset': 'sling.n.05', 'synonyms': ['sling_(bandage)', 'triangular_bandage'], 'def': 'bandage to support an injured forearm; slung over the shoulder or neck', 'name': 'sling_(bandage)'}, {'frequency': 'c', 'id': 990, 'synset': 'slipper.n.01', 'synonyms': ['slipper_(footwear)', 'carpet_slipper_(footwear)'], 'def': 'low footwear that can be slipped on and off easily; usually worn indoors', 'name': 'slipper_(footwear)'}, {'frequency': 'r', 'id': 991, 'synset': 'smoothie.n.02', 'synonyms': ['smoothie'], 'def': 'a thick smooth drink consisting of fresh fruit pureed with ice cream or yoghurt or milk', 'name': 'smoothie'}, {'frequency': 'r', 'id': 992, 'synset': 'snake.n.01', 'synonyms': ['snake', 'serpent'], 'def': 'limbless scaly elongate reptile; some are venomous', 'name': 'snake'}, {'frequency': 'f', 'id': 993, 'synset': 'snowboard.n.01', 'synonyms': ['snowboard'], 'def': 'a board that resembles a broad ski or a small surfboard; used in a standing position to slide down snow-covered slopes', 'name': 'snowboard'}, {'frequency': 'c', 'id': 994, 'synset': 'snowman.n.01', 'synonyms': ['snowman'], 'def': 'a figure of a person made of packed snow', 'name': 'snowman'}, {'frequency': 'c', 'id': 995, 'synset': 'snowmobile.n.01', 'synonyms': ['snowmobile'], 'def': 'tracked vehicle for travel on snow having skis in front', 'name': 'snowmobile'}, {'frequency': 'f', 'id': 996, 'synset': 'soap.n.01', 'synonyms': ['soap'], 'def': 'a cleansing agent made from the salts of vegetable or animal fats', 'name': 'soap'}, {'frequency': 'f', 'id': 997, 'synset': 'soccer_ball.n.01', 'synonyms': ['soccer_ball'], 'def': "an inflated ball used in playing soccer (called `football' outside of the United States)", 'name': 'soccer_ball'}, {'frequency': 'f', 'id': 998, 'synset': 'sock.n.01', 'synonyms': ['sock'], 'def': 'cloth covering for the foot; worn inside the shoe; reaches to between the ankle and the knee', 'name': 'sock'}, {'frequency': 'r', 'id': 999, 'synset': 'soda_fountain.n.02', 'synonyms': ['soda_fountain'], 'def': 'an apparatus for dispensing soda water', 'name': 'soda_fountain'}, {'frequency': 'r', 'id': 1000, 'synset': 'soda_water.n.01', 'synonyms': ['carbonated_water', 'club_soda', 'seltzer', 'sparkling_water'], 'def': 'effervescent beverage artificially charged with carbon dioxide', 'name': 'carbonated_water'}, {'frequency': 'f', 'id': 1001, 'synset': 'sofa.n.01', 'synonyms': ['sofa', 'couch', 'lounge'], 'def': 'an upholstered seat for more than one person', 'name': 'sofa'}, {'frequency': 'r', 'id': 1002, 'synset': 'softball.n.01', 'synonyms': ['softball'], 'def': 'ball used in playing softball', 'name': 'softball'}, {'frequency': 'c', 'id': 1003, 'synset': 'solar_array.n.01', 'synonyms': ['solar_array', 'solar_battery', 'solar_panel'], 'def': 'electrical device consisting of a large array of connected solar cells', 'name': 'solar_array'}, {'frequency': 'r', 'id': 1004, 'synset': 'sombrero.n.02', 'synonyms': ['sombrero'], 'def': 'a straw hat with a tall crown and broad brim; worn in American southwest and in Mexico', 'name': 'sombrero'}, {'frequency': 'c', 'id': 1005, 'synset': 'soup.n.01', 'synonyms': ['soup'], 'def': 'liquid food especially of meat or fish or vegetable stock often containing pieces of solid food', 'name': 'soup'}, {'frequency': 'r', 'id': 1006, 'synset': 'soup_bowl.n.01', 'synonyms': ['soup_bowl'], 'def': 'a bowl for serving soup', 'name': 'soup_bowl'}, {'frequency': 'c', 'id': 1007, 'synset': 'soupspoon.n.01', 'synonyms': ['soupspoon'], 'def': 'a spoon with a rounded bowl for eating soup', 'name': 'soupspoon'}, {'frequency': 'c', 'id': 1008, 'synset': 'sour_cream.n.01', 'synonyms': ['sour_cream', 'soured_cream'], 'def': 'soured light cream', 'name': 'sour_cream'}, {'frequency': 'r', 'id': 1009, 'synset': 'soya_milk.n.01', 'synonyms': ['soya_milk', 'soybean_milk', 'soymilk'], 'def': 'a milk substitute containing soybean flour and water; used in some infant formulas and in making tofu', 'name': 'soya_milk'}, {'frequency': 'r', 'id': 1010, 'synset': 'space_shuttle.n.01', 'synonyms': ['space_shuttle'], 'def': "a reusable spacecraft with wings for a controlled descent through the Earth's atmosphere", 'name': 'space_shuttle'}, {'frequency': 'r', 'id': 1011, 'synset': 'sparkler.n.02', 'synonyms': ['sparkler_(fireworks)'], 'def': 'a firework that burns slowly and throws out a shower of sparks', 'name': 'sparkler_(fireworks)'}, {'frequency': 'f', 'id': 1012, 'synset': 'spatula.n.02', 'synonyms': ['spatula'], 'def': 'a hand tool with a thin flexible blade used to mix or spread soft substances', 'name': 'spatula'}, {'frequency': 'r', 'id': 1013, 'synset': 'spear.n.01', 'synonyms': ['spear', 'lance'], 'def': 'a long pointed rod used as a tool or weapon', 'name': 'spear'}, {'frequency': 'f', 'id': 1014, 'synset': 'spectacles.n.01', 'synonyms': ['spectacles', 'specs', 'eyeglasses', 'glasses'], 'def': 'optical instrument consisting of a frame that holds a pair of lenses for correcting defective vision', 'name': 'spectacles'}, {'frequency': 'c', 'id': 1015, 'synset': 'spice_rack.n.01', 'synonyms': ['spice_rack'], 'def': 'a rack for displaying containers filled with spices', 'name': 'spice_rack'}, {'frequency': 'r', 'id': 1016, 'synset': 'spider.n.01', 'synonyms': ['spider'], 'def': 'predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'name': 'spider'}, {'frequency': 'c', 'id': 1017, 'synset': 'sponge.n.01', 'synonyms': ['sponge'], 'def': 'a porous mass usable to absorb water typically used for cleaning', 'name': 'sponge'}, {'frequency': 'f', 'id': 1018, 'synset': 'spoon.n.01', 'synonyms': ['spoon'], 'def': 'a piece of cutlery with a shallow bowl-shaped container and a handle', 'name': 'spoon'}, {'frequency': 'c', 'id': 1019, 'synset': 'sportswear.n.01', 'synonyms': ['sportswear', 'athletic_wear', 'activewear'], 'def': 'attire worn for sport or for casual wear', 'name': 'sportswear'}, {'frequency': 'c', 'id': 1020, 'synset': 'spotlight.n.02', 'synonyms': ['spotlight'], 'def': 'a lamp that produces a strong beam of light to illuminate a restricted area; used to focus attention of a stage performer', 'name': 'spotlight'}, {'frequency': 'r', 'id': 1021, 'synset': 'squirrel.n.01', 'synonyms': ['squirrel'], 'def': 'a kind of arboreal rodent having a long bushy tail', 'name': 'squirrel'}, {'frequency': 'c', 'id': 1022, 'synset': 'stapler.n.01', 'synonyms': ['stapler_(stapling_machine)'], 'def': 'a machine that inserts staples into sheets of paper in order to fasten them together', 'name': 'stapler_(stapling_machine)'}, {'frequency': 'r', 'id': 1023, 'synset': 'starfish.n.01', 'synonyms': ['starfish', 'sea_star'], 'def': 'echinoderms characterized by five arms extending from a central disk', 'name': 'starfish'}, {'frequency': 'f', 'id': 1024, 'synset': 'statue.n.01', 'synonyms': ['statue_(sculpture)'], 'def': 'a sculpture representing a human or animal', 'name': 'statue_(sculpture)'}, {'frequency': 'c', 'id': 1025, 'synset': 'steak.n.01', 'synonyms': ['steak_(food)'], 'def': 'a slice of meat cut from the fleshy part of an animal or large fish', 'name': 'steak_(food)'}, {'frequency': 'r', 'id': 1026, 'synset': 'steak_knife.n.01', 'synonyms': ['steak_knife'], 'def': 'a sharp table knife used in eating steak', 'name': 'steak_knife'}, {'frequency': 'r', 'id': 1027, 'synset': 'steamer.n.02', 'synonyms': ['steamer_(kitchen_appliance)'], 'def': 'a cooking utensil that can be used to cook food by steaming it', 'name': 'steamer_(kitchen_appliance)'}, {'frequency': 'f', 'id': 1028, 'synset': 'steering_wheel.n.01', 'synonyms': ['steering_wheel'], 'def': 'a handwheel that is used for steering', 'name': 'steering_wheel'}, {'frequency': 'r', 'id': 1029, 'synset': 'stencil.n.01', 'synonyms': ['stencil'], 'def': 'a sheet of material (metal, plastic, etc.) that has been perforated with a pattern; ink or paint can pass through the perforations to create the printed pattern on the surface below', 'name': 'stencil'}, {'frequency': 'r', 'id': 1030, 'synset': 'step_ladder.n.01', 'synonyms': ['stepladder'], 'def': 'a folding portable ladder hinged at the top', 'name': 'stepladder'}, {'frequency': 'c', 'id': 1031, 'synset': 'step_stool.n.01', 'synonyms': ['step_stool'], 'def': 'a stool that has one or two steps that fold under the seat', 'name': 'step_stool'}, {'frequency': 'c', 'id': 1032, 'synset': 'stereo.n.01', 'synonyms': ['stereo_(sound_system)'], 'def': 'electronic device for playing audio', 'name': 'stereo_(sound_system)'}, {'frequency': 'r', 'id': 1033, 'synset': 'stew.n.02', 'synonyms': ['stew'], 'def': 'food prepared by stewing especially meat or fish with vegetables', 'name': 'stew'}, {'frequency': 'r', 'id': 1034, 'synset': 'stirrer.n.02', 'synonyms': ['stirrer'], 'def': 'an implement used for stirring', 'name': 'stirrer'}, {'frequency': 'f', 'id': 1035, 'synset': 'stirrup.n.01', 'synonyms': ['stirrup'], 'def': "support consisting of metal loops into which rider's feet go", 'name': 'stirrup'}, {'frequency': 'c', 'id': 1036, 'synset': 'stocking.n.01', 'synonyms': ['stockings_(leg_wear)'], 'def': 'close-fitting hosiery to cover the foot and leg; come in matched pairs', 'name': 'stockings_(leg_wear)'}, {'frequency': 'f', 'id': 1037, 'synset': 'stool.n.01', 'synonyms': ['stool'], 'def': 'a simple seat without a back or arms', 'name': 'stool'}, {'frequency': 'f', 'id': 1038, 'synset': 'stop_sign.n.01', 'synonyms': ['stop_sign'], 'def': 'a traffic sign to notify drivers that they must come to a complete stop', 'name': 'stop_sign'}, {'frequency': 'f', 'id': 1039, 'synset': 'stoplight.n.01', 'synonyms': ['brake_light'], 'def': 'a red light on the rear of a motor vehicle that signals when the brakes are applied', 'name': 'brake_light'}, {'frequency': 'f', 'id': 1040, 'synset': 'stove.n.01', 'synonyms': ['stove', 'kitchen_stove', 'range_(kitchen_appliance)', 'kitchen_range', 'cooking_stove'], 'def': 'a kitchen appliance used for cooking food', 'name': 'stove'}, {'frequency': 'c', 'id': 1041, 'synset': 'strainer.n.01', 'synonyms': ['strainer'], 'def': 'a filter to retain larger pieces while smaller pieces and liquids pass through', 'name': 'strainer'}, {'frequency': 'f', 'id': 1042, 'synset': 'strap.n.01', 'synonyms': ['strap'], 'def': 'an elongated strip of material for binding things together or holding', 'name': 'strap'}, {'frequency': 'f', 'id': 1043, 'synset': 'straw.n.04', 'synonyms': ['straw_(for_drinking)', 'drinking_straw'], 'def': 'a thin paper or plastic tube used to suck liquids into the mouth', 'name': 'straw_(for_drinking)'}, {'frequency': 'f', 'id': 1044, 'synset': 'strawberry.n.01', 'synonyms': ['strawberry'], 'def': 'sweet fleshy red fruit', 'name': 'strawberry'}, {'frequency': 'f', 'id': 1045, 'synset': 'street_sign.n.01', 'synonyms': ['street_sign'], 'def': 'a sign visible from the street', 'name': 'street_sign'}, {'frequency': 'f', 'id': 1046, 'synset': 'streetlight.n.01', 'synonyms': ['streetlight', 'street_lamp'], 'def': 'a lamp supported on a lamppost; for illuminating a street', 'name': 'streetlight'}, {'frequency': 'r', 'id': 1047, 'synset': 'string_cheese.n.01', 'synonyms': ['string_cheese'], 'def': 'cheese formed in long strings twisted together', 'name': 'string_cheese'}, {'frequency': 'r', 'id': 1048, 'synset': 'stylus.n.02', 'synonyms': ['stylus'], 'def': 'a pointed tool for writing or drawing or engraving', 'name': 'stylus'}, {'frequency': 'r', 'id': 1049, 'synset': 'subwoofer.n.01', 'synonyms': ['subwoofer'], 'def': 'a loudspeaker that is designed to reproduce very low bass frequencies', 'name': 'subwoofer'}, {'frequency': 'r', 'id': 1050, 'synset': 'sugar_bowl.n.01', 'synonyms': ['sugar_bowl'], 'def': 'a dish in which sugar is served', 'name': 'sugar_bowl'}, {'frequency': 'r', 'id': 1051, 'synset': 'sugarcane.n.01', 'synonyms': ['sugarcane_(plant)'], 'def': 'juicy canes whose sap is a source of molasses and commercial sugar; fresh canes are sometimes chewed for the juice', 'name': 'sugarcane_(plant)'}, {'frequency': 'c', 'id': 1052, 'synset': 'suit.n.01', 'synonyms': ['suit_(clothing)'], 'def': 'a set of garments (usually including a jacket and trousers or skirt) for outerwear all of the same fabric and color', 'name': 'suit_(clothing)'}, {'frequency': 'c', 'id': 1053, 'synset': 'sunflower.n.01', 'synonyms': ['sunflower'], 'def': 'any plant of the genus Helianthus having large flower heads with dark disk florets and showy yellow rays', 'name': 'sunflower'}, {'frequency': 'f', 'id': 1054, 'synset': 'sunglasses.n.01', 'synonyms': ['sunglasses'], 'def': 'spectacles that are darkened or polarized to protect the eyes from the glare of the sun', 'name': 'sunglasses'}, {'frequency': 'c', 'id': 1055, 'synset': 'sunhat.n.01', 'synonyms': ['sunhat'], 'def': 'a hat with a broad brim that protects the face from direct exposure to the sun', 'name': 'sunhat'}, {'frequency': 'r', 'id': 1056, 'synset': 'sunscreen.n.01', 'synonyms': ['sunscreen', 'sunblock'], 'def': 'a cream spread on the skin; contains a chemical to filter out ultraviolet light and so protect from sunburn', 'name': 'sunscreen'}, {'frequency': 'f', 'id': 1057, 'synset': 'surfboard.n.01', 'synonyms': ['surfboard'], 'def': 'a narrow buoyant board for riding surf', 'name': 'surfboard'}, {'frequency': 'c', 'id': 1058, 'synset': 'sushi.n.01', 'synonyms': ['sushi'], 'def': 'rice (with raw fish) wrapped in seaweed', 'name': 'sushi'}, {'frequency': 'c', 'id': 1059, 'synset': 'swab.n.02', 'synonyms': ['mop'], 'def': 'cleaning implement consisting of absorbent material fastened to a handle; for cleaning floors', 'name': 'mop'}, {'frequency': 'c', 'id': 1060, 'synset': 'sweat_pants.n.01', 'synonyms': ['sweat_pants'], 'def': 'loose-fitting trousers with elastic cuffs; worn by athletes', 'name': 'sweat_pants'}, {'frequency': 'c', 'id': 1061, 'synset': 'sweatband.n.02', 'synonyms': ['sweatband'], 'def': 'a band of material tied around the forehead or wrist to absorb sweat', 'name': 'sweatband'}, {'frequency': 'f', 'id': 1062, 'synset': 'sweater.n.01', 'synonyms': ['sweater'], 'def': 'a crocheted or knitted garment covering the upper part of the body', 'name': 'sweater'}, {'frequency': 'f', 'id': 1063, 'synset': 'sweatshirt.n.01', 'synonyms': ['sweatshirt'], 'def': 'cotton knit pullover with long sleeves worn during athletic activity', 'name': 'sweatshirt'}, {'frequency': 'c', 'id': 1064, 'synset': 'sweet_potato.n.02', 'synonyms': ['sweet_potato'], 'def': 'the edible tuberous root of the sweet potato vine', 'name': 'sweet_potato'}, {'frequency': 'f', 'id': 1065, 'synset': 'swimsuit.n.01', 'synonyms': ['swimsuit', 'swimwear', 'bathing_suit', 'swimming_costume', 'bathing_costume', 'swimming_trunks', 'bathing_trunks'], 'def': 'garment worn for swimming', 'name': 'swimsuit'}, {'frequency': 'c', 'id': 1066, 'synset': 'sword.n.01', 'synonyms': ['sword'], 'def': 'a cutting or thrusting weapon that has a long metal blade', 'name': 'sword'}, {'frequency': 'r', 'id': 1067, 'synset': 'syringe.n.01', 'synonyms': ['syringe'], 'def': 'a medical instrument used to inject or withdraw fluids', 'name': 'syringe'}, {'frequency': 'r', 'id': 1068, 'synset': 'tabasco.n.02', 'synonyms': ['Tabasco_sauce'], 'def': 'very spicy sauce (trade name Tabasco) made from fully-aged red peppers', 'name': 'Tabasco_sauce'}, {'frequency': 'r', 'id': 1069, 'synset': 'table-tennis_table.n.01', 'synonyms': ['table-tennis_table', 'ping-pong_table'], 'def': 'a table used for playing table tennis', 'name': 'table-tennis_table'}, {'frequency': 'f', 'id': 1070, 'synset': 'table.n.02', 'synonyms': ['table'], 'def': 'a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs', 'name': 'table'}, {'frequency': 'c', 'id': 1071, 'synset': 'table_lamp.n.01', 'synonyms': ['table_lamp'], 'def': 'a lamp that sits on a table', 'name': 'table_lamp'}, {'frequency': 'f', 'id': 1072, 'synset': 'tablecloth.n.01', 'synonyms': ['tablecloth'], 'def': 'a covering spread over a dining table', 'name': 'tablecloth'}, {'frequency': 'r', 'id': 1073, 'synset': 'tachometer.n.01', 'synonyms': ['tachometer'], 'def': 'measuring instrument for indicating speed of rotation', 'name': 'tachometer'}, {'frequency': 'r', 'id': 1074, 'synset': 'taco.n.02', 'synonyms': ['taco'], 'def': 'a small tortilla cupped around a filling', 'name': 'taco'}, {'frequency': 'f', 'id': 1075, 'synset': 'tag.n.02', 'synonyms': ['tag'], 'def': 'a label associated with something for the purpose of identification or information', 'name': 'tag'}, {'frequency': 'f', 'id': 1076, 'synset': 'taillight.n.01', 'synonyms': ['taillight', 'rear_light'], 'def': 'lamp (usually red) mounted at the rear of a motor vehicle', 'name': 'taillight'}, {'frequency': 'r', 'id': 1077, 'synset': 'tambourine.n.01', 'synonyms': ['tambourine'], 'def': 'a shallow drum with a single drumhead and with metallic disks in the sides', 'name': 'tambourine'}, {'frequency': 'r', 'id': 1078, 'synset': 'tank.n.01', 'synonyms': ['army_tank', 'armored_combat_vehicle', 'armoured_combat_vehicle'], 'def': 'an enclosed armored military vehicle; has a cannon and moves on caterpillar treads', 'name': 'army_tank'}, {'frequency': 'c', 'id': 1079, 'synset': 'tank.n.02', 'synonyms': ['tank_(storage_vessel)', 'storage_tank'], 'def': 'a large (usually metallic) vessel for holding gases or liquids', 'name': 'tank_(storage_vessel)'}, {'frequency': 'f', 'id': 1080, 'synset': 'tank_top.n.01', 'synonyms': ['tank_top_(clothing)'], 'def': 'a tight-fitting sleeveless shirt with wide shoulder straps and low neck and no front opening', 'name': 'tank_top_(clothing)'}, {'frequency': 'c', 'id': 1081, 'synset': 'tape.n.01', 'synonyms': ['tape_(sticky_cloth_or_paper)'], 'def': 'a long thin piece of cloth or paper as used for binding or fastening', 'name': 'tape_(sticky_cloth_or_paper)'}, {'frequency': 'c', 'id': 1082, 'synset': 'tape.n.04', 'synonyms': ['tape_measure', 'measuring_tape'], 'def': 'measuring instrument consisting of a narrow strip (cloth or metal) marked in inches or centimeters and used for measuring lengths', 'name': 'tape_measure'}, {'frequency': 'c', 'id': 1083, 'synset': 'tapestry.n.02', 'synonyms': ['tapestry'], 'def': 'a heavy textile with a woven design; used for curtains and upholstery', 'name': 'tapestry'}, {'frequency': 'f', 'id': 1084, 'synset': 'tarpaulin.n.01', 'synonyms': ['tarp'], 'def': 'waterproofed canvas', 'name': 'tarp'}, {'frequency': 'c', 'id': 1085, 'synset': 'tartan.n.01', 'synonyms': ['tartan', 'plaid'], 'def': 'a cloth having a crisscross design', 'name': 'tartan'}, {'frequency': 'c', 'id': 1086, 'synset': 'tassel.n.01', 'synonyms': ['tassel'], 'def': 'adornment consisting of a bunch of cords fastened at one end', 'name': 'tassel'}, {'frequency': 'r', 'id': 1087, 'synset': 'tea_bag.n.01', 'synonyms': ['tea_bag'], 'def': 'a measured amount of tea in a bag for an individual serving of tea', 'name': 'tea_bag'}, {'frequency': 'c', 'id': 1088, 'synset': 'teacup.n.02', 'synonyms': ['teacup'], 'def': 'a cup from which tea is drunk', 'name': 'teacup'}, {'frequency': 'c', 'id': 1089, 'synset': 'teakettle.n.01', 'synonyms': ['teakettle'], 'def': 'kettle for boiling water to make tea', 'name': 'teakettle'}, {'frequency': 'c', 'id': 1090, 'synset': 'teapot.n.01', 'synonyms': ['teapot'], 'def': 'pot for brewing tea; usually has a spout and handle', 'name': 'teapot'}, {'frequency': 'f', 'id': 1091, 'synset': 'teddy.n.01', 'synonyms': ['teddy_bear'], 'def': "plaything consisting of a child's toy bear (usually plush and stuffed with soft materials)", 'name': 'teddy_bear'}, {'frequency': 'f', 'id': 1092, 'synset': 'telephone.n.01', 'synonyms': ['telephone', 'phone', 'telephone_set'], 'def': 'electronic device for communicating by voice over long distances', 'name': 'telephone'}, {'frequency': 'c', 'id': 1093, 'synset': 'telephone_booth.n.01', 'synonyms': ['telephone_booth', 'phone_booth', 'call_box', 'telephone_box', 'telephone_kiosk'], 'def': 'booth for using a telephone', 'name': 'telephone_booth'}, {'frequency': 'f', 'id': 1094, 'synset': 'telephone_pole.n.01', 'synonyms': ['telephone_pole', 'telegraph_pole', 'telegraph_post'], 'def': 'tall pole supporting telephone wires', 'name': 'telephone_pole'}, {'frequency': 'r', 'id': 1095, 'synset': 'telephoto_lens.n.01', 'synonyms': ['telephoto_lens', 'zoom_lens'], 'def': 'a camera lens that magnifies the image', 'name': 'telephoto_lens'}, {'frequency': 'c', 'id': 1096, 'synset': 'television_camera.n.01', 'synonyms': ['television_camera', 'tv_camera'], 'def': 'television equipment for capturing and recording video', 'name': 'television_camera'}, {'frequency': 'f', 'id': 1097, 'synset': 'television_receiver.n.01', 'synonyms': ['television_set', 'tv', 'tv_set'], 'def': 'an electronic device that receives television signals and displays them on a screen', 'name': 'television_set'}, {'frequency': 'f', 'id': 1098, 'synset': 'tennis_ball.n.01', 'synonyms': ['tennis_ball'], 'def': 'ball about the size of a fist used in playing tennis', 'name': 'tennis_ball'}, {'frequency': 'f', 'id': 1099, 'synset': 'tennis_racket.n.01', 'synonyms': ['tennis_racket'], 'def': 'a racket used to play tennis', 'name': 'tennis_racket'}, {'frequency': 'r', 'id': 1100, 'synset': 'tequila.n.01', 'synonyms': ['tequila'], 'def': 'Mexican liquor made from fermented juices of an agave plant', 'name': 'tequila'}, {'frequency': 'c', 'id': 1101, 'synset': 'thermometer.n.01', 'synonyms': ['thermometer'], 'def': 'measuring instrument for measuring temperature', 'name': 'thermometer'}, {'frequency': 'c', 'id': 1102, 'synset': 'thermos.n.01', 'synonyms': ['thermos_bottle'], 'def': 'vacuum flask that preserves temperature of hot or cold drinks', 'name': 'thermos_bottle'}, {'frequency': 'c', 'id': 1103, 'synset': 'thermostat.n.01', 'synonyms': ['thermostat'], 'def': 'a regulator for automatically regulating temperature by starting or stopping the supply of heat', 'name': 'thermostat'}, {'frequency': 'r', 'id': 1104, 'synset': 'thimble.n.02', 'synonyms': ['thimble'], 'def': 'a small metal cap to protect the finger while sewing; can be used as a small container', 'name': 'thimble'}, {'frequency': 'c', 'id': 1105, 'synset': 'thread.n.01', 'synonyms': ['thread', 'yarn'], 'def': 'a fine cord of twisted fibers (of cotton or silk or wool or nylon etc.) used in sewing and weaving', 'name': 'thread'}, {'frequency': 'c', 'id': 1106, 'synset': 'thumbtack.n.01', 'synonyms': ['thumbtack', 'drawing_pin', 'pushpin'], 'def': 'a tack for attaching papers to a bulletin board or drawing board', 'name': 'thumbtack'}, {'frequency': 'c', 'id': 1107, 'synset': 'tiara.n.01', 'synonyms': ['tiara'], 'def': 'a jeweled headdress worn by women on formal occasions', 'name': 'tiara'}, {'frequency': 'c', 'id': 1108, 'synset': 'tiger.n.02', 'synonyms': ['tiger'], 'def': 'large feline of forests in most of Asia having a tawny coat with black stripes', 'name': 'tiger'}, {'frequency': 'c', 'id': 1109, 'synset': 'tights.n.01', 'synonyms': ['tights_(clothing)', 'leotards'], 'def': 'skintight knit hose covering the body from the waist to the feet worn by acrobats and dancers and as stockings by women and girls', 'name': 'tights_(clothing)'}, {'frequency': 'c', 'id': 1110, 'synset': 'timer.n.01', 'synonyms': ['timer', 'stopwatch'], 'def': 'a timepiece that measures a time interval and signals its end', 'name': 'timer'}, {'frequency': 'f', 'id': 1111, 'synset': 'tinfoil.n.01', 'synonyms': ['tinfoil'], 'def': 'foil made of tin or an alloy of tin and lead', 'name': 'tinfoil'}, {'frequency': 'r', 'id': 1112, 'synset': 'tinsel.n.01', 'synonyms': ['tinsel'], 'def': 'a showy decoration that is basically valueless', 'name': 'tinsel'}, {'frequency': 'f', 'id': 1113, 'synset': 'tissue.n.02', 'synonyms': ['tissue_paper'], 'def': 'a soft thin (usually translucent) paper', 'name': 'tissue_paper'}, {'frequency': 'c', 'id': 1114, 'synset': 'toast.n.01', 'synonyms': ['toast_(food)'], 'def': 'slice of bread that has been toasted', 'name': 'toast_(food)'}, {'frequency': 'f', 'id': 1115, 'synset': 'toaster.n.02', 'synonyms': ['toaster'], 'def': 'a kitchen appliance (usually electric) for toasting bread', 'name': 'toaster'}, {'frequency': 'c', 'id': 1116, 'synset': 'toaster_oven.n.01', 'synonyms': ['toaster_oven'], 'def': 'kitchen appliance consisting of a small electric oven for toasting or warming food', 'name': 'toaster_oven'}, {'frequency': 'f', 'id': 1117, 'synset': 'toilet.n.02', 'synonyms': ['toilet'], 'def': 'a plumbing fixture for defecation and urination', 'name': 'toilet'}, {'frequency': 'f', 'id': 1118, 'synset': 'toilet_tissue.n.01', 'synonyms': ['toilet_tissue', 'toilet_paper', 'bathroom_tissue'], 'def': 'a soft thin absorbent paper for use in toilets', 'name': 'toilet_tissue'}, {'frequency': 'f', 'id': 1119, 'synset': 'tomato.n.01', 'synonyms': ['tomato'], 'def': 'mildly acid red or yellow pulpy fruit eaten as a vegetable', 'name': 'tomato'}, {'frequency': 'c', 'id': 1120, 'synset': 'tongs.n.01', 'synonyms': ['tongs'], 'def': 'any of various devices for taking hold of objects; usually have two hinged legs with handles above and pointed hooks below', 'name': 'tongs'}, {'frequency': 'c', 'id': 1121, 'synset': 'toolbox.n.01', 'synonyms': ['toolbox'], 'def': 'a box or chest or cabinet for holding hand tools', 'name': 'toolbox'}, {'frequency': 'f', 'id': 1122, 'synset': 'toothbrush.n.01', 'synonyms': ['toothbrush'], 'def': 'small brush; has long handle; used to clean teeth', 'name': 'toothbrush'}, {'frequency': 'f', 'id': 1123, 'synset': 'toothpaste.n.01', 'synonyms': ['toothpaste'], 'def': 'a dentifrice in the form of a paste', 'name': 'toothpaste'}, {'frequency': 'c', 'id': 1124, 'synset': 'toothpick.n.01', 'synonyms': ['toothpick'], 'def': 'pick consisting of a small strip of wood or plastic; used to pick food from between the teeth', 'name': 'toothpick'}, {'frequency': 'c', 'id': 1125, 'synset': 'top.n.09', 'synonyms': ['cover'], 'def': 'covering for a hole (especially a hole in the top of a container)', 'name': 'cover'}, {'frequency': 'c', 'id': 1126, 'synset': 'tortilla.n.01', 'synonyms': ['tortilla'], 'def': 'thin unleavened pancake made from cornmeal or wheat flour', 'name': 'tortilla'}, {'frequency': 'c', 'id': 1127, 'synset': 'tow_truck.n.01', 'synonyms': ['tow_truck'], 'def': 'a truck equipped to hoist and pull wrecked cars (or to remove cars from no-parking zones)', 'name': 'tow_truck'}, {'frequency': 'f', 'id': 1128, 'synset': 'towel.n.01', 'synonyms': ['towel'], 'def': 'a rectangular piece of absorbent cloth (or paper) for drying or wiping', 'name': 'towel'}, {'frequency': 'f', 'id': 1129, 'synset': 'towel_rack.n.01', 'synonyms': ['towel_rack', 'towel_rail', 'towel_bar'], 'def': 'a rack consisting of one or more bars on which towels can be hung', 'name': 'towel_rack'}, {'frequency': 'f', 'id': 1130, 'synset': 'toy.n.03', 'synonyms': ['toy'], 'def': 'a device regarded as providing amusement', 'name': 'toy'}, {'frequency': 'c', 'id': 1131, 'synset': 'tractor.n.01', 'synonyms': ['tractor_(farm_equipment)'], 'def': 'a wheeled vehicle with large wheels; used in farming and other applications', 'name': 'tractor_(farm_equipment)'}, {'frequency': 'f', 'id': 1132, 'synset': 'traffic_light.n.01', 'synonyms': ['traffic_light'], 'def': 'a device to control vehicle traffic often consisting of three or more lights', 'name': 'traffic_light'}, {'frequency': 'r', 'id': 1133, 'synset': 'trail_bike.n.01', 'synonyms': ['dirt_bike'], 'def': 'a lightweight motorcycle equipped with rugged tires and suspension for off-road use', 'name': 'dirt_bike'}, {'frequency': 'c', 'id': 1134, 'synset': 'trailer_truck.n.01', 'synonyms': ['trailer_truck', 'tractor_trailer', 'trucking_rig', 'articulated_lorry', 'semi_truck'], 'def': 'a truck consisting of a tractor and trailer together', 'name': 'trailer_truck'}, {'frequency': 'f', 'id': 1135, 'synset': 'train.n.01', 'synonyms': ['train_(railroad_vehicle)', 'railroad_train'], 'def': 'public or private transport provided by a line of railway cars coupled together and drawn by a locomotive', 'name': 'train_(railroad_vehicle)'}, {'frequency': 'r', 'id': 1136, 'synset': 'trampoline.n.01', 'synonyms': ['trampoline'], 'def': 'gymnastic apparatus consisting of a strong canvas sheet attached with springs to a metal frame', 'name': 'trampoline'}, {'frequency': 'f', 'id': 1137, 'synset': 'tray.n.01', 'synonyms': ['tray'], 'def': 'an open receptacle for holding or displaying or serving articles or food', 'name': 'tray'}, {'frequency': 'r', 'id': 1138, 'synset': 'tree_house.n.01', 'synonyms': ['tree_house'], 'def': '(NOT A TREE) a PLAYHOUSE built in the branches of a tree', 'name': 'tree_house'}, {'frequency': 'r', 'id': 1139, 'synset': 'trench_coat.n.01', 'synonyms': ['trench_coat'], 'def': 'a military style raincoat; belted with deep pockets', 'name': 'trench_coat'}, {'frequency': 'r', 'id': 1140, 'synset': 'triangle.n.05', 'synonyms': ['triangle_(musical_instrument)'], 'def': 'a percussion instrument consisting of a metal bar bent in the shape of an open triangle', 'name': 'triangle_(musical_instrument)'}, {'frequency': 'r', 'id': 1141, 'synset': 'tricycle.n.01', 'synonyms': ['tricycle'], 'def': 'a vehicle with three wheels that is moved by foot pedals', 'name': 'tricycle'}, {'frequency': 'c', 'id': 1142, 'synset': 'tripod.n.01', 'synonyms': ['tripod'], 'def': 'a three-legged rack used for support', 'name': 'tripod'}, {'frequency': 'f', 'id': 1143, 'synset': 'trouser.n.01', 'synonyms': ['trousers', 'pants_(clothing)'], 'def': 'a garment extending from the waist to the knee or ankle, covering each leg separately', 'name': 'trousers'}, {'frequency': 'f', 'id': 1144, 'synset': 'truck.n.01', 'synonyms': ['truck'], 'def': 'an automotive vehicle suitable for hauling', 'name': 'truck'}, {'frequency': 'r', 'id': 1145, 'synset': 'truffle.n.03', 'synonyms': ['truffle_(chocolate)', 'chocolate_truffle'], 'def': 'creamy chocolate candy', 'name': 'truffle_(chocolate)'}, {'frequency': 'c', 'id': 1146, 'synset': 'trunk.n.02', 'synonyms': ['trunk'], 'def': 'luggage consisting of a large strong case used when traveling or for storage', 'name': 'trunk'}, {'frequency': 'r', 'id': 1147, 'synset': 'tub.n.02', 'synonyms': ['vat'], 'def': 'a large open vessel for holding or storing liquids', 'name': 'vat'}, {'frequency': 'c', 'id': 1148, 'synset': 'turban.n.01', 'synonyms': ['turban'], 'def': 'a traditional headdress consisting of a long scarf wrapped around the head', 'name': 'turban'}, {'frequency': 'r', 'id': 1149, 'synset': 'turkey.n.01', 'synonyms': ['turkey_(bird)'], 'def': 'large gallinaceous bird with fan-shaped tail; widely domesticated for food', 'name': 'turkey_(bird)'}, {'frequency': 'c', 'id': 1150, 'synset': 'turkey.n.04', 'synonyms': ['turkey_(food)'], 'def': 'flesh of large domesticated fowl usually roasted', 'name': 'turkey_(food)'}, {'frequency': 'r', 'id': 1151, 'synset': 'turnip.n.01', 'synonyms': ['turnip'], 'def': 'widely cultivated plant having a large fleshy edible white or yellow root', 'name': 'turnip'}, {'frequency': 'c', 'id': 1152, 'synset': 'turtle.n.02', 'synonyms': ['turtle'], 'def': 'any of various aquatic and land reptiles having a bony shell and flipper-like limbs for swimming', 'name': 'turtle'}, {'frequency': 'r', 'id': 1153, 'synset': 'turtleneck.n.01', 'synonyms': ['turtleneck_(clothing)', 'polo-neck'], 'def': 'a sweater or jersey with a high close-fitting collar', 'name': 'turtleneck_(clothing)'}, {'frequency': 'r', 'id': 1154, 'synset': 'typewriter.n.01', 'synonyms': ['typewriter'], 'def': 'hand-operated character printer for printing written messages one character at a time', 'name': 'typewriter'}, {'frequency': 'f', 'id': 1155, 'synset': 'umbrella.n.01', 'synonyms': ['umbrella'], 'def': 'a lightweight handheld collapsible canopy', 'name': 'umbrella'}, {'frequency': 'c', 'id': 1156, 'synset': 'underwear.n.01', 'synonyms': ['underwear', 'underclothes', 'underclothing', 'underpants'], 'def': 'undergarment worn next to the skin and under the outer garments', 'name': 'underwear'}, {'frequency': 'r', 'id': 1157, 'synset': 'unicycle.n.01', 'synonyms': ['unicycle'], 'def': 'a vehicle with a single wheel that is driven by pedals', 'name': 'unicycle'}, {'frequency': 'c', 'id': 1158, 'synset': 'urinal.n.01', 'synonyms': ['urinal'], 'def': 'a plumbing fixture (usually attached to the wall) used by men to urinate', 'name': 'urinal'}, {'frequency': 'r', 'id': 1159, 'synset': 'urn.n.01', 'synonyms': ['urn'], 'def': 'a large vase that usually has a pedestal or feet', 'name': 'urn'}, {'frequency': 'c', 'id': 1160, 'synset': 'vacuum.n.04', 'synonyms': ['vacuum_cleaner'], 'def': 'an electrical home appliance that cleans by suction', 'name': 'vacuum_cleaner'}, {'frequency': 'c', 'id': 1161, 'synset': 'valve.n.03', 'synonyms': ['valve'], 'def': 'control consisting of a mechanical device for controlling the flow of a fluid', 'name': 'valve'}, {'frequency': 'f', 'id': 1162, 'synset': 'vase.n.01', 'synonyms': ['vase'], 'def': 'an open jar of glass or porcelain used as an ornament or to hold flowers', 'name': 'vase'}, {'frequency': 'c', 'id': 1163, 'synset': 'vending_machine.n.01', 'synonyms': ['vending_machine'], 'def': 'a slot machine for selling goods', 'name': 'vending_machine'}, {'frequency': 'f', 'id': 1164, 'synset': 'vent.n.01', 'synonyms': ['vent', 'blowhole', 'air_vent'], 'def': 'a hole for the escape of gas or air', 'name': 'vent'}, {'frequency': 'c', 'id': 1165, 'synset': 'videotape.n.01', 'synonyms': ['videotape'], 'def': 'a video recording made on magnetic tape', 'name': 'videotape'}, {'frequency': 'r', 'id': 1166, 'synset': 'vinegar.n.01', 'synonyms': ['vinegar'], 'def': 'sour-tasting liquid produced usually by oxidation of the alcohol in wine or cider and used as a condiment or food preservative', 'name': 'vinegar'}, {'frequency': 'r', 'id': 1167, 'synset': 'violin.n.01', 'synonyms': ['violin', 'fiddle'], 'def': 'bowed stringed instrument that is the highest member of the violin family', 'name': 'violin'}, {'frequency': 'r', 'id': 1168, 'synset': 'vodka.n.01', 'synonyms': ['vodka'], 'def': 'unaged colorless liquor originating in Russia', 'name': 'vodka'}, {'frequency': 'r', 'id': 1169, 'synset': 'volleyball.n.02', 'synonyms': ['volleyball'], 'def': 'an inflated ball used in playing volleyball', 'name': 'volleyball'}, {'frequency': 'r', 'id': 1170, 'synset': 'vulture.n.01', 'synonyms': ['vulture'], 'def': 'any of various large birds of prey having naked heads and weak claws and feeding chiefly on carrion', 'name': 'vulture'}, {'frequency': 'c', 'id': 1171, 'synset': 'waffle.n.01', 'synonyms': ['waffle'], 'def': 'pancake batter baked in a waffle iron', 'name': 'waffle'}, {'frequency': 'r', 'id': 1172, 'synset': 'waffle_iron.n.01', 'synonyms': ['waffle_iron'], 'def': 'a kitchen appliance for baking waffles', 'name': 'waffle_iron'}, {'frequency': 'c', 'id': 1173, 'synset': 'wagon.n.01', 'synonyms': ['wagon'], 'def': 'any of various kinds of wheeled vehicles drawn by an animal or a tractor', 'name': 'wagon'}, {'frequency': 'c', 'id': 1174, 'synset': 'wagon_wheel.n.01', 'synonyms': ['wagon_wheel'], 'def': 'a wheel of a wagon', 'name': 'wagon_wheel'}, {'frequency': 'c', 'id': 1175, 'synset': 'walking_stick.n.01', 'synonyms': ['walking_stick'], 'def': 'a stick carried in the hand for support in walking', 'name': 'walking_stick'}, {'frequency': 'c', 'id': 1176, 'synset': 'wall_clock.n.01', 'synonyms': ['wall_clock'], 'def': 'a clock mounted on a wall', 'name': 'wall_clock'}, {'frequency': 'f', 'id': 1177, 'synset': 'wall_socket.n.01', 'synonyms': ['wall_socket', 'wall_plug', 'electric_outlet', 'electrical_outlet', 'outlet', 'electric_receptacle'], 'def': 'receptacle providing a place in a wiring system where current can be taken to run electrical devices', 'name': 'wall_socket'}, {'frequency': 'c', 'id': 1178, 'synset': 'wallet.n.01', 'synonyms': ['wallet', 'billfold'], 'def': 'a pocket-size case for holding papers and paper money', 'name': 'wallet'}, {'frequency': 'r', 'id': 1179, 'synset': 'walrus.n.01', 'synonyms': ['walrus'], 'def': 'either of two large northern marine mammals having ivory tusks and tough hide over thick blubber', 'name': 'walrus'}, {'frequency': 'r', 'id': 1180, 'synset': 'wardrobe.n.01', 'synonyms': ['wardrobe'], 'def': 'a tall piece of furniture that provides storage space for clothes; has a door and rails or hooks for hanging clothes', 'name': 'wardrobe'}, {'frequency': 'r', 'id': 1181, 'synset': 'wasabi.n.02', 'synonyms': ['wasabi'], 'def': 'the thick green root of the wasabi plant that the Japanese use in cooking and that tastes like strong horseradish', 'name': 'wasabi'}, {'frequency': 'c', 'id': 1182, 'synset': 'washer.n.03', 'synonyms': ['automatic_washer', 'washing_machine'], 'def': 'a home appliance for washing clothes and linens automatically', 'name': 'automatic_washer'}, {'frequency': 'f', 'id': 1183, 'synset': 'watch.n.01', 'synonyms': ['watch', 'wristwatch'], 'def': 'a small, portable timepiece', 'name': 'watch'}, {'frequency': 'f', 'id': 1184, 'synset': 'water_bottle.n.01', 'synonyms': ['water_bottle'], 'def': 'a bottle for holding water', 'name': 'water_bottle'}, {'frequency': 'c', 'id': 1185, 'synset': 'water_cooler.n.01', 'synonyms': ['water_cooler'], 'def': 'a device for cooling and dispensing drinking water', 'name': 'water_cooler'}, {'frequency': 'c', 'id': 1186, 'synset': 'water_faucet.n.01', 'synonyms': ['water_faucet', 'water_tap', 'tap_(water_faucet)'], 'def': 'a faucet for drawing water from a pipe or cask', 'name': 'water_faucet'}, {'frequency': 'r', 'id': 1187, 'synset': 'water_filter.n.01', 'synonyms': ['water_filter'], 'def': 'a filter to remove impurities from the water supply', 'name': 'water_filter'}, {'frequency': 'r', 'id': 1188, 'synset': 'water_heater.n.01', 'synonyms': ['water_heater', 'hot-water_heater'], 'def': 'a heater and storage tank to supply heated water', 'name': 'water_heater'}, {'frequency': 'r', 'id': 1189, 'synset': 'water_jug.n.01', 'synonyms': ['water_jug'], 'def': 'a jug that holds water', 'name': 'water_jug'}, {'frequency': 'r', 'id': 1190, 'synset': 'water_pistol.n.01', 'synonyms': ['water_gun', 'squirt_gun'], 'def': 'plaything consisting of a toy pistol that squirts water', 'name': 'water_gun'}, {'frequency': 'c', 'id': 1191, 'synset': 'water_scooter.n.01', 'synonyms': ['water_scooter', 'sea_scooter', 'jet_ski'], 'def': 'a motorboat resembling a motor scooter (NOT A SURFBOARD OR WATER SKI)', 'name': 'water_scooter'}, {'frequency': 'c', 'id': 1192, 'synset': 'water_ski.n.01', 'synonyms': ['water_ski'], 'def': 'broad ski for skimming over water towed by a speedboat (DO NOT MARK WATER)', 'name': 'water_ski'}, {'frequency': 'c', 'id': 1193, 'synset': 'water_tower.n.01', 'synonyms': ['water_tower'], 'def': 'a large reservoir for water', 'name': 'water_tower'}, {'frequency': 'c', 'id': 1194, 'synset': 'watering_can.n.01', 'synonyms': ['watering_can'], 'def': 'a container with a handle and a spout with a perforated nozzle; used to sprinkle water over plants', 'name': 'watering_can'}, {'frequency': 'c', 'id': 1195, 'synset': 'watermelon.n.02', 'synonyms': ['watermelon'], 'def': 'large oblong or roundish melon with a hard green rind and sweet watery red or occasionally yellowish pulp', 'name': 'watermelon'}, {'frequency': 'f', 'id': 1196, 'synset': 'weathervane.n.01', 'synonyms': ['weathervane', 'vane_(weathervane)', 'wind_vane'], 'def': 'mechanical device attached to an elevated structure; rotates freely to show the direction of the wind', 'name': 'weathervane'}, {'frequency': 'c', 'id': 1197, 'synset': 'webcam.n.01', 'synonyms': ['webcam'], 'def': 'a digital camera designed to take digital photographs and transmit them over the internet', 'name': 'webcam'}, {'frequency': 'c', 'id': 1198, 'synset': 'wedding_cake.n.01', 'synonyms': ['wedding_cake', 'bridecake'], 'def': 'a rich cake with two or more tiers and covered with frosting and decorations; served at a wedding reception', 'name': 'wedding_cake'}, {'frequency': 'c', 'id': 1199, 'synset': 'wedding_ring.n.01', 'synonyms': ['wedding_ring', 'wedding_band'], 'def': 'a ring given to the bride and/or groom at the wedding', 'name': 'wedding_ring'}, {'frequency': 'f', 'id': 1200, 'synset': 'wet_suit.n.01', 'synonyms': ['wet_suit'], 'def': 'a close-fitting garment made of a permeable material; worn in cold water to retain body heat', 'name': 'wet_suit'}, {'frequency': 'f', 'id': 1201, 'synset': 'wheel.n.01', 'synonyms': ['wheel'], 'def': 'a circular frame with spokes (or a solid disc) that can rotate on a shaft or axle', 'name': 'wheel'}, {'frequency': 'c', 'id': 1202, 'synset': 'wheelchair.n.01', 'synonyms': ['wheelchair'], 'def': 'a movable chair mounted on large wheels', 'name': 'wheelchair'}, {'frequency': 'c', 'id': 1203, 'synset': 'whipped_cream.n.01', 'synonyms': ['whipped_cream'], 'def': 'cream that has been beaten until light and fluffy', 'name': 'whipped_cream'}, {'frequency': 'r', 'id': 1204, 'synset': 'whiskey.n.01', 'synonyms': ['whiskey'], 'def': 'a liquor made from fermented mash of grain', 'name': 'whiskey'}, {'frequency': 'r', 'id': 1205, 'synset': 'whistle.n.03', 'synonyms': ['whistle'], 'def': 'a small wind instrument that produces a whistling sound by blowing into it', 'name': 'whistle'}, {'frequency': 'r', 'id': 1206, 'synset': 'wick.n.02', 'synonyms': ['wick'], 'def': 'a loosely woven cord in a candle or oil lamp that is lit on fire', 'name': 'wick'}, {'frequency': 'c', 'id': 1207, 'synset': 'wig.n.01', 'synonyms': ['wig'], 'def': 'hairpiece covering the head and made of real or synthetic hair', 'name': 'wig'}, {'frequency': 'c', 'id': 1208, 'synset': 'wind_chime.n.01', 'synonyms': ['wind_chime'], 'def': 'a decorative arrangement of pieces of metal or glass or pottery that hang together loosely so the wind can cause them to tinkle', 'name': 'wind_chime'}, {'frequency': 'c', 'id': 1209, 'synset': 'windmill.n.01', 'synonyms': ['windmill'], 'def': 'a mill that is powered by the wind', 'name': 'windmill'}, {'frequency': 'c', 'id': 1210, 'synset': 'window_box.n.01', 'synonyms': ['window_box_(for_plants)'], 'def': 'a container for growing plants on a windowsill', 'name': 'window_box_(for_plants)'}, {'frequency': 'f', 'id': 1211, 'synset': 'windshield_wiper.n.01', 'synonyms': ['windshield_wiper', 'windscreen_wiper', 'wiper_(for_windshield/screen)'], 'def': 'a mechanical device that cleans the windshield', 'name': 'windshield_wiper'}, {'frequency': 'c', 'id': 1212, 'synset': 'windsock.n.01', 'synonyms': ['windsock', 'air_sock', 'air-sleeve', 'wind_sleeve', 'wind_cone'], 'def': 'a truncated cloth cone mounted on a mast/pole; shows wind direction', 'name': 'windsock'}, {'frequency': 'f', 'id': 1213, 'synset': 'wine_bottle.n.01', 'synonyms': ['wine_bottle'], 'def': 'a bottle for holding wine', 'name': 'wine_bottle'}, {'frequency': 'r', 'id': 1214, 'synset': 'wine_bucket.n.01', 'synonyms': ['wine_bucket', 'wine_cooler'], 'def': 'a bucket of ice used to chill a bottle of wine', 'name': 'wine_bucket'}, {'frequency': 'f', 'id': 1215, 'synset': 'wineglass.n.01', 'synonyms': ['wineglass'], 'def': 'a glass that has a stem and in which wine is served', 'name': 'wineglass'}, {'frequency': 'r', 'id': 1216, 'synset': 'wing_chair.n.01', 'synonyms': ['wing_chair'], 'def': 'easy chair having wings on each side of a high back', 'name': 'wing_chair'}, {'frequency': 'c', 'id': 1217, 'synset': 'winker.n.02', 'synonyms': ['blinder_(for_horses)'], 'def': 'blinds that prevent a horse from seeing something on either side', 'name': 'blinder_(for_horses)'}, {'frequency': 'c', 'id': 1218, 'synset': 'wok.n.01', 'synonyms': ['wok'], 'def': 'pan with a convex bottom; used for frying in Chinese cooking', 'name': 'wok'}, {'frequency': 'r', 'id': 1219, 'synset': 'wolf.n.01', 'synonyms': ['wolf'], 'def': 'a wild carnivorous mammal of the dog family, living and hunting in packs', 'name': 'wolf'}, {'frequency': 'c', 'id': 1220, 'synset': 'wooden_spoon.n.02', 'synonyms': ['wooden_spoon'], 'def': 'a spoon made of wood', 'name': 'wooden_spoon'}, {'frequency': 'c', 'id': 1221, 'synset': 'wreath.n.01', 'synonyms': ['wreath'], 'def': 'an arrangement of flowers, leaves, or stems fastened in a ring', 'name': 'wreath'}, {'frequency': 'c', 'id': 1222, 'synset': 'wrench.n.03', 'synonyms': ['wrench', 'spanner'], 'def': 'a hand tool that is used to hold or twist a nut or bolt', 'name': 'wrench'}, {'frequency': 'c', 'id': 1223, 'synset': 'wristband.n.01', 'synonyms': ['wristband'], 'def': 'band consisting of a part of a sleeve that covers the wrist', 'name': 'wristband'}, {'frequency': 'f', 'id': 1224, 'synset': 'wristlet.n.01', 'synonyms': ['wristlet', 'wrist_band'], 'def': 'a band or bracelet worn around the wrist', 'name': 'wristlet'}, {'frequency': 'r', 'id': 1225, 'synset': 'yacht.n.01', 'synonyms': ['yacht'], 'def': 'an expensive vessel propelled by sail or power and used for cruising or racing', 'name': 'yacht'}, {'frequency': 'r', 'id': 1226, 'synset': 'yak.n.02', 'synonyms': ['yak'], 'def': 'large long-haired wild ox of Tibet often domesticated', 'name': 'yak'}, {'frequency': 'c', 'id': 1227, 'synset': 'yogurt.n.01', 'synonyms': ['yogurt', 'yoghurt', 'yoghourt'], 'def': 'a custard-like food made from curdled milk', 'name': 'yogurt'}, {'frequency': 'r', 'id': 1228, 'synset': 'yoke.n.07', 'synonyms': ['yoke_(animal_equipment)'], 'def': 'gear joining two animals at the neck; NOT egg yolk', 'name': 'yoke_(animal_equipment)'}, {'frequency': 'f', 'id': 1229, 'synset': 'zebra.n.01', 'synonyms': ['zebra'], 'def': 'any of several fleet black-and-white striped African equines', 'name': 'zebra'}, {'frequency': 'c', 'id': 1230, 'synset': 'zucchini.n.02', 'synonyms': ['zucchini', 'courgette'], 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}]  # noqa
+# fmt: on
diff --git a/detectron2/data/datasets/lvis_v1_categories.py b/detectron2/data/datasets/lvis_v1_categories.py
new file mode 100644
index 0000000000000000000000000000000000000000..7374e6968bb006f5d8c49e75d9d3b31ea3d77d05
--- /dev/null
+++ b/detectron2/data/datasets/lvis_v1_categories.py
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Autogen with
+# with open("lvis_v1_val.json", "r") as f:
+#     a = json.load(f)
+# c = a["categories"]
+# for x in c:
+#     del x["image_count"]
+#     del x["instance_count"]
+# LVIS_CATEGORIES = repr(c) + "  # noqa"
+# with open("/tmp/lvis_categories.py", "wt") as f:
+#     f.write(f"LVIS_CATEGORIES = {LVIS_CATEGORIES}")
+# Then paste the contents of that file below
+
+# fmt: off
+LVIS_CATEGORIES = [{'frequency': 'c', 'synset': 'aerosol.n.02', 'synonyms': ['aerosol_can', 'spray_can'], 'id': 1, 'def': 'a dispenser that holds a substance under pressure', 'name': 'aerosol_can'}, {'frequency': 'f', 'synset': 'air_conditioner.n.01', 'synonyms': ['air_conditioner'], 'id': 2, 'def': 'a machine that keeps air cool and dry', 'name': 'air_conditioner'}, {'frequency': 'f', 'synset': 'airplane.n.01', 'synonyms': ['airplane', 'aeroplane'], 'id': 3, 'def': 'an aircraft that has a fixed wing and is powered by propellers or jets', 'name': 'airplane'}, {'frequency': 'f', 'synset': 'alarm_clock.n.01', 'synonyms': ['alarm_clock'], 'id': 4, 'def': 'a clock that wakes a sleeper at some preset time', 'name': 'alarm_clock'}, {'frequency': 'c', 'synset': 'alcohol.n.01', 'synonyms': ['alcohol', 'alcoholic_beverage'], 'id': 5, 'def': 'a liquor or brew containing alcohol as the active agent', 'name': 'alcohol'}, {'frequency': 'c', 'synset': 'alligator.n.02', 'synonyms': ['alligator', 'gator'], 'id': 6, 'def': 'amphibious reptiles related to crocodiles but with shorter broader snouts', 'name': 'alligator'}, {'frequency': 'c', 'synset': 'almond.n.02', 'synonyms': ['almond'], 'id': 7, 'def': 'oval-shaped edible seed of the almond tree', 'name': 'almond'}, {'frequency': 'c', 'synset': 'ambulance.n.01', 'synonyms': ['ambulance'], 'id': 8, 'def': 'a vehicle that takes people to and from hospitals', 'name': 'ambulance'}, {'frequency': 'c', 'synset': 'amplifier.n.01', 'synonyms': ['amplifier'], 'id': 9, 'def': 'electronic equipment that increases strength of signals', 'name': 'amplifier'}, {'frequency': 'c', 'synset': 'anklet.n.03', 'synonyms': ['anklet', 'ankle_bracelet'], 'id': 10, 'def': 'an ornament worn around the ankle', 'name': 'anklet'}, {'frequency': 'f', 'synset': 'antenna.n.01', 'synonyms': ['antenna', 'aerial', 'transmitting_aerial'], 'id': 11, 'def': 'an electrical device that sends or receives radio or television signals', 'name': 'antenna'}, {'frequency': 'f', 'synset': 'apple.n.01', 'synonyms': ['apple'], 'id': 12, 'def': 'fruit with red or yellow or green skin and sweet to tart crisp whitish flesh', 'name': 'apple'}, {'frequency': 'r', 'synset': 'applesauce.n.01', 'synonyms': ['applesauce'], 'id': 13, 'def': 'puree of stewed apples usually sweetened and spiced', 'name': 'applesauce'}, {'frequency': 'r', 'synset': 'apricot.n.02', 'synonyms': ['apricot'], 'id': 14, 'def': 'downy yellow to rosy-colored fruit resembling a small peach', 'name': 'apricot'}, {'frequency': 'f', 'synset': 'apron.n.01', 'synonyms': ['apron'], 'id': 15, 'def': 'a garment of cloth that is tied about the waist and worn to protect clothing', 'name': 'apron'}, {'frequency': 'c', 'synset': 'aquarium.n.01', 'synonyms': ['aquarium', 'fish_tank'], 'id': 16, 'def': 'a tank/pool/bowl filled with water for keeping live fish and underwater animals', 'name': 'aquarium'}, {'frequency': 'r', 'synset': 'arctic.n.02', 'synonyms': ['arctic_(type_of_shoe)', 'galosh', 'golosh', 'rubber_(type_of_shoe)', 'gumshoe'], 'id': 17, 'def': 'a waterproof overshoe that protects shoes from water or snow', 'name': 'arctic_(type_of_shoe)'}, {'frequency': 'c', 'synset': 'armband.n.02', 'synonyms': ['armband'], 'id': 18, 'def': 'a band worn around the upper arm', 'name': 'armband'}, {'frequency': 'f', 'synset': 'armchair.n.01', 'synonyms': ['armchair'], 'id': 19, 'def': 'chair with a support on each side for arms', 'name': 'armchair'}, {'frequency': 'r', 'synset': 'armoire.n.01', 'synonyms': ['armoire'], 'id': 20, 'def': 'a large wardrobe or cabinet', 'name': 'armoire'}, {'frequency': 'r', 'synset': 'armor.n.01', 'synonyms': ['armor', 'armour'], 'id': 21, 'def': 'protective covering made of metal and used in combat', 'name': 'armor'}, {'frequency': 'c', 'synset': 'artichoke.n.02', 'synonyms': ['artichoke'], 'id': 22, 'def': 'a thistlelike flower head with edible fleshy leaves and heart', 'name': 'artichoke'}, {'frequency': 'f', 'synset': 'ashcan.n.01', 'synonyms': ['trash_can', 'garbage_can', 'wastebin', 'dustbin', 'trash_barrel', 'trash_bin'], 'id': 23, 'def': 'a bin that holds rubbish until it is collected', 'name': 'trash_can'}, {'frequency': 'c', 'synset': 'ashtray.n.01', 'synonyms': ['ashtray'], 'id': 24, 'def': "a receptacle for the ash from smokers' cigars or cigarettes", 'name': 'ashtray'}, {'frequency': 'c', 'synset': 'asparagus.n.02', 'synonyms': ['asparagus'], 'id': 25, 'def': 'edible young shoots of the asparagus plant', 'name': 'asparagus'}, {'frequency': 'c', 'synset': 'atomizer.n.01', 'synonyms': ['atomizer', 'atomiser', 'spray', 'sprayer', 'nebulizer', 'nebuliser'], 'id': 26, 'def': 'a dispenser that turns a liquid (such as perfume) into a fine mist', 'name': 'atomizer'}, {'frequency': 'f', 'synset': 'avocado.n.01', 'synonyms': ['avocado'], 'id': 27, 'def': 'a pear-shaped fruit with green or blackish skin and rich yellowish pulp enclosing a single large seed', 'name': 'avocado'}, {'frequency': 'c', 'synset': 'award.n.02', 'synonyms': ['award', 'accolade'], 'id': 28, 'def': 'a tangible symbol signifying approval or distinction', 'name': 'award'}, {'frequency': 'f', 'synset': 'awning.n.01', 'synonyms': ['awning'], 'id': 29, 'def': 'a canopy made of canvas to shelter people or things from rain or sun', 'name': 'awning'}, {'frequency': 'r', 'synset': 'ax.n.01', 'synonyms': ['ax', 'axe'], 'id': 30, 'def': 'an edge tool with a heavy bladed head mounted across a handle', 'name': 'ax'}, {'frequency': 'r', 'synset': 'baboon.n.01', 'synonyms': ['baboon'], 'id': 31, 'def': 'large terrestrial monkeys having doglike muzzles', 'name': 'baboon'}, {'frequency': 'f', 'synset': 'baby_buggy.n.01', 'synonyms': ['baby_buggy', 'baby_carriage', 'perambulator', 'pram', 'stroller'], 'id': 32, 'def': 'a small vehicle with four wheels in which a baby or child is pushed around', 'name': 'baby_buggy'}, {'frequency': 'c', 'synset': 'backboard.n.01', 'synonyms': ['basketball_backboard'], 'id': 33, 'def': 'a raised vertical board with basket attached; used to play basketball', 'name': 'basketball_backboard'}, {'frequency': 'f', 'synset': 'backpack.n.01', 'synonyms': ['backpack', 'knapsack', 'packsack', 'rucksack', 'haversack'], 'id': 34, 'def': 'a bag carried by a strap on your back or shoulder', 'name': 'backpack'}, {'frequency': 'f', 'synset': 'bag.n.04', 'synonyms': ['handbag', 'purse', 'pocketbook'], 'id': 35, 'def': 'a container used for carrying money and small personal items or accessories', 'name': 'handbag'}, {'frequency': 'f', 'synset': 'bag.n.06', 'synonyms': ['suitcase', 'baggage', 'luggage'], 'id': 36, 'def': 'cases used to carry belongings when traveling', 'name': 'suitcase'}, {'frequency': 'c', 'synset': 'bagel.n.01', 'synonyms': ['bagel', 'beigel'], 'id': 37, 'def': 'glazed yeast-raised doughnut-shaped roll with hard crust', 'name': 'bagel'}, {'frequency': 'r', 'synset': 'bagpipe.n.01', 'synonyms': ['bagpipe'], 'id': 38, 'def': 'a tubular wind instrument; the player blows air into a bag and squeezes it out', 'name': 'bagpipe'}, {'frequency': 'r', 'synset': 'baguet.n.01', 'synonyms': ['baguet', 'baguette'], 'id': 39, 'def': 'narrow French stick loaf', 'name': 'baguet'}, {'frequency': 'r', 'synset': 'bait.n.02', 'synonyms': ['bait', 'lure'], 'id': 40, 'def': 'something used to lure fish or other animals into danger so they can be trapped or killed', 'name': 'bait'}, {'frequency': 'f', 'synset': 'ball.n.06', 'synonyms': ['ball'], 'id': 41, 'def': 'a spherical object used as a plaything', 'name': 'ball'}, {'frequency': 'r', 'synset': 'ballet_skirt.n.01', 'synonyms': ['ballet_skirt', 'tutu'], 'id': 42, 'def': 'very short skirt worn by ballerinas', 'name': 'ballet_skirt'}, {'frequency': 'f', 'synset': 'balloon.n.01', 'synonyms': ['balloon'], 'id': 43, 'def': 'large tough nonrigid bag filled with gas or heated air', 'name': 'balloon'}, {'frequency': 'c', 'synset': 'bamboo.n.02', 'synonyms': ['bamboo'], 'id': 44, 'def': 'woody tropical grass having hollow woody stems', 'name': 'bamboo'}, {'frequency': 'f', 'synset': 'banana.n.02', 'synonyms': ['banana'], 'id': 45, 'def': 'elongated crescent-shaped yellow fruit with soft sweet flesh', 'name': 'banana'}, {'frequency': 'c', 'synset': 'band_aid.n.01', 'synonyms': ['Band_Aid'], 'id': 46, 'def': 'trade name for an adhesive bandage to cover small cuts or blisters', 'name': 'Band_Aid'}, {'frequency': 'c', 'synset': 'bandage.n.01', 'synonyms': ['bandage'], 'id': 47, 'def': 'a piece of soft material that covers and protects an injured part of the body', 'name': 'bandage'}, {'frequency': 'f', 'synset': 'bandanna.n.01', 'synonyms': ['bandanna', 'bandana'], 'id': 48, 'def': 'large and brightly colored handkerchief; often used as a neckerchief', 'name': 'bandanna'}, {'frequency': 'r', 'synset': 'banjo.n.01', 'synonyms': ['banjo'], 'id': 49, 'def': 'a stringed instrument of the guitar family with a long neck and circular body', 'name': 'banjo'}, {'frequency': 'f', 'synset': 'banner.n.01', 'synonyms': ['banner', 'streamer'], 'id': 50, 'def': 'long strip of cloth or paper used for decoration or advertising', 'name': 'banner'}, {'frequency': 'r', 'synset': 'barbell.n.01', 'synonyms': ['barbell'], 'id': 51, 'def': 'a bar to which heavy discs are attached at each end; used in weightlifting', 'name': 'barbell'}, {'frequency': 'r', 'synset': 'barge.n.01', 'synonyms': ['barge'], 'id': 52, 'def': 'a flatbottom boat for carrying heavy loads (especially on canals)', 'name': 'barge'}, {'frequency': 'f', 'synset': 'barrel.n.02', 'synonyms': ['barrel', 'cask'], 'id': 53, 'def': 'a cylindrical container that holds liquids', 'name': 'barrel'}, {'frequency': 'c', 'synset': 'barrette.n.01', 'synonyms': ['barrette'], 'id': 54, 'def': "a pin for holding women's hair in place", 'name': 'barrette'}, {'frequency': 'c', 'synset': 'barrow.n.03', 'synonyms': ['barrow', 'garden_cart', 'lawn_cart', 'wheelbarrow'], 'id': 55, 'def': 'a cart for carrying small loads; has handles and one or more wheels', 'name': 'barrow'}, {'frequency': 'f', 'synset': 'base.n.03', 'synonyms': ['baseball_base'], 'id': 56, 'def': 'a place that the runner must touch before scoring', 'name': 'baseball_base'}, {'frequency': 'f', 'synset': 'baseball.n.02', 'synonyms': ['baseball'], 'id': 57, 'def': 'a ball used in playing baseball', 'name': 'baseball'}, {'frequency': 'f', 'synset': 'baseball_bat.n.01', 'synonyms': ['baseball_bat'], 'id': 58, 'def': 'an implement used in baseball by the batter', 'name': 'baseball_bat'}, {'frequency': 'f', 'synset': 'baseball_cap.n.01', 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'id': 59, 'def': 'a cap with a bill', 'name': 'baseball_cap'}, {'frequency': 'f', 'synset': 'baseball_glove.n.01', 'synonyms': ['baseball_glove', 'baseball_mitt'], 'id': 60, 'def': 'the handwear used by fielders in playing baseball', 'name': 'baseball_glove'}, {'frequency': 'f', 'synset': 'basket.n.01', 'synonyms': ['basket', 'handbasket'], 'id': 61, 'def': 'a container that is usually woven and has handles', 'name': 'basket'}, {'frequency': 'c', 'synset': 'basketball.n.02', 'synonyms': ['basketball'], 'id': 62, 'def': 'an inflated ball used in playing basketball', 'name': 'basketball'}, {'frequency': 'r', 'synset': 'bass_horn.n.01', 'synonyms': ['bass_horn', 'sousaphone', 'tuba'], 'id': 63, 'def': 'the lowest brass wind instrument', 'name': 'bass_horn'}, {'frequency': 'c', 'synset': 'bat.n.01', 'synonyms': ['bat_(animal)'], 'id': 64, 'def': 'nocturnal mouselike mammal with forelimbs modified to form membranous wings', 'name': 'bat_(animal)'}, {'frequency': 'f', 'synset': 'bath_mat.n.01', 'synonyms': ['bath_mat'], 'id': 65, 'def': 'a heavy towel or mat to stand on while drying yourself after a bath', 'name': 'bath_mat'}, {'frequency': 'f', 'synset': 'bath_towel.n.01', 'synonyms': ['bath_towel'], 'id': 66, 'def': 'a large towel; to dry yourself after a bath', 'name': 'bath_towel'}, {'frequency': 'c', 'synset': 'bathrobe.n.01', 'synonyms': ['bathrobe'], 'id': 67, 'def': 'a loose-fitting robe of towelling; worn after a bath or swim', 'name': 'bathrobe'}, {'frequency': 'f', 'synset': 'bathtub.n.01', 'synonyms': ['bathtub', 'bathing_tub'], 'id': 68, 'def': 'a large open container that you fill with water and use to wash the body', 'name': 'bathtub'}, {'frequency': 'r', 'synset': 'batter.n.02', 'synonyms': ['batter_(food)'], 'id': 69, 'def': 'a liquid or semiliquid mixture, as of flour, eggs, and milk, used in cooking', 'name': 'batter_(food)'}, {'frequency': 'c', 'synset': 'battery.n.02', 'synonyms': ['battery'], 'id': 70, 'def': 'a portable device that produces electricity', 'name': 'battery'}, {'frequency': 'r', 'synset': 'beach_ball.n.01', 'synonyms': ['beachball'], 'id': 71, 'def': 'large and light ball; for play at the seaside', 'name': 'beachball'}, {'frequency': 'c', 'synset': 'bead.n.01', 'synonyms': ['bead'], 'id': 72, 'def': 'a small ball with a hole through the middle used for ornamentation, jewellery, etc.', 'name': 'bead'}, {'frequency': 'c', 'synset': 'bean_curd.n.01', 'synonyms': ['bean_curd', 'tofu'], 'id': 73, 'def': 'cheeselike food made of curdled soybean milk', 'name': 'bean_curd'}, {'frequency': 'c', 'synset': 'beanbag.n.01', 'synonyms': ['beanbag'], 'id': 74, 'def': 'a bag filled with dried beans or similar items; used in games or to sit on', 'name': 'beanbag'}, {'frequency': 'f', 'synset': 'beanie.n.01', 'synonyms': ['beanie', 'beany'], 'id': 75, 'def': 'a small skullcap; formerly worn by schoolboys and college freshmen', 'name': 'beanie'}, {'frequency': 'f', 'synset': 'bear.n.01', 'synonyms': ['bear'], 'id': 76, 'def': 'large carnivorous or omnivorous mammals with shaggy coats and claws', 'name': 'bear'}, {'frequency': 'f', 'synset': 'bed.n.01', 'synonyms': ['bed'], 'id': 77, 'def': 'a piece of furniture that provides a place to sleep', 'name': 'bed'}, {'frequency': 'r', 'synset': 'bedpan.n.01', 'synonyms': ['bedpan'], 'id': 78, 'def': 'a shallow vessel used by a bedridden patient for defecation and urination', 'name': 'bedpan'}, {'frequency': 'f', 'synset': 'bedspread.n.01', 'synonyms': ['bedspread', 'bedcover', 'bed_covering', 'counterpane', 'spread'], 'id': 79, 'def': 'decorative cover for a bed', 'name': 'bedspread'}, {'frequency': 'f', 'synset': 'beef.n.01', 'synonyms': ['cow'], 'id': 80, 'def': 'cattle/cow', 'name': 'cow'}, {'frequency': 'f', 'synset': 'beef.n.02', 'synonyms': ['beef_(food)', 'boeuf_(food)'], 'id': 81, 'def': 'meat from an adult domestic bovine', 'name': 'beef_(food)'}, {'frequency': 'r', 'synset': 'beeper.n.01', 'synonyms': ['beeper', 'pager'], 'id': 82, 'def': 'an device that beeps when the person carrying it is being paged', 'name': 'beeper'}, {'frequency': 'f', 'synset': 'beer_bottle.n.01', 'synonyms': ['beer_bottle'], 'id': 83, 'def': 'a bottle that holds beer', 'name': 'beer_bottle'}, {'frequency': 'c', 'synset': 'beer_can.n.01', 'synonyms': ['beer_can'], 'id': 84, 'def': 'a can that holds beer', 'name': 'beer_can'}, {'frequency': 'r', 'synset': 'beetle.n.01', 'synonyms': ['beetle'], 'id': 85, 'def': 'insect with hard wing covers', 'name': 'beetle'}, {'frequency': 'f', 'synset': 'bell.n.01', 'synonyms': ['bell'], 'id': 86, 'def': 'a hollow device made of metal that makes a ringing sound when struck', 'name': 'bell'}, {'frequency': 'f', 'synset': 'bell_pepper.n.02', 'synonyms': ['bell_pepper', 'capsicum'], 'id': 87, 'def': 'large bell-shaped sweet pepper in green or red or yellow or orange or black varieties', 'name': 'bell_pepper'}, {'frequency': 'f', 'synset': 'belt.n.02', 'synonyms': ['belt'], 'id': 88, 'def': 'a band to tie or buckle around the body (usually at the waist)', 'name': 'belt'}, {'frequency': 'f', 'synset': 'belt_buckle.n.01', 'synonyms': ['belt_buckle'], 'id': 89, 'def': 'the buckle used to fasten a belt', 'name': 'belt_buckle'}, {'frequency': 'f', 'synset': 'bench.n.01', 'synonyms': ['bench'], 'id': 90, 'def': 'a long seat for more than one person', 'name': 'bench'}, {'frequency': 'c', 'synset': 'beret.n.01', 'synonyms': ['beret'], 'id': 91, 'def': 'a cap with no brim or bill; made of soft cloth', 'name': 'beret'}, {'frequency': 'c', 'synset': 'bib.n.02', 'synonyms': ['bib'], 'id': 92, 'def': 'a napkin tied under the chin of a child while eating', 'name': 'bib'}, {'frequency': 'r', 'synset': 'bible.n.01', 'synonyms': ['Bible'], 'id': 93, 'def': 'the sacred writings of the Christian religions', 'name': 'Bible'}, {'frequency': 'f', 'synset': 'bicycle.n.01', 'synonyms': ['bicycle', 'bike_(bicycle)'], 'id': 94, 'def': 'a wheeled vehicle that has two wheels and is moved by foot pedals', 'name': 'bicycle'}, {'frequency': 'f', 'synset': 'bill.n.09', 'synonyms': ['visor', 'vizor'], 'id': 95, 'def': 'a brim that projects to the front to shade the eyes', 'name': 'visor'}, {'frequency': 'f', 'synset': 'billboard.n.01', 'synonyms': ['billboard'], 'id': 96, 'def': 'large outdoor signboard', 'name': 'billboard'}, {'frequency': 'c', 'synset': 'binder.n.03', 'synonyms': ['binder', 'ring-binder'], 'id': 97, 'def': 'holds loose papers or magazines', 'name': 'binder'}, {'frequency': 'c', 'synset': 'binoculars.n.01', 'synonyms': ['binoculars', 'field_glasses', 'opera_glasses'], 'id': 98, 'def': 'an optical instrument designed for simultaneous use by both eyes', 'name': 'binoculars'}, {'frequency': 'f', 'synset': 'bird.n.01', 'synonyms': ['bird'], 'id': 99, 'def': 'animal characterized by feathers and wings', 'name': 'bird'}, {'frequency': 'c', 'synset': 'bird_feeder.n.01', 'synonyms': ['birdfeeder'], 'id': 100, 'def': 'an outdoor device that supplies food for wild birds', 'name': 'birdfeeder'}, {'frequency': 'c', 'synset': 'birdbath.n.01', 'synonyms': ['birdbath'], 'id': 101, 'def': 'an ornamental basin (usually in a garden) for birds to bathe in', 'name': 'birdbath'}, {'frequency': 'c', 'synset': 'birdcage.n.01', 'synonyms': ['birdcage'], 'id': 102, 'def': 'a cage in which a bird can be kept', 'name': 'birdcage'}, {'frequency': 'c', 'synset': 'birdhouse.n.01', 'synonyms': ['birdhouse'], 'id': 103, 'def': 'a shelter for birds', 'name': 'birdhouse'}, {'frequency': 'f', 'synset': 'birthday_cake.n.01', 'synonyms': ['birthday_cake'], 'id': 104, 'def': 'decorated cake served at a birthday party', 'name': 'birthday_cake'}, {'frequency': 'r', 'synset': 'birthday_card.n.01', 'synonyms': ['birthday_card'], 'id': 105, 'def': 'a card expressing a birthday greeting', 'name': 'birthday_card'}, {'frequency': 'r', 'synset': 'black_flag.n.01', 'synonyms': ['pirate_flag'], 'id': 106, 'def': 'a flag usually bearing a white skull and crossbones on a black background', 'name': 'pirate_flag'}, {'frequency': 'c', 'synset': 'black_sheep.n.02', 'synonyms': ['black_sheep'], 'id': 107, 'def': 'sheep with a black coat', 'name': 'black_sheep'}, {'frequency': 'c', 'synset': 'blackberry.n.01', 'synonyms': ['blackberry'], 'id': 108, 'def': 'large sweet black or very dark purple edible aggregate fruit', 'name': 'blackberry'}, {'frequency': 'f', 'synset': 'blackboard.n.01', 'synonyms': ['blackboard', 'chalkboard'], 'id': 109, 'def': 'sheet of slate; for writing with chalk', 'name': 'blackboard'}, {'frequency': 'f', 'synset': 'blanket.n.01', 'synonyms': ['blanket'], 'id': 110, 'def': 'bedding that keeps a person warm in bed', 'name': 'blanket'}, {'frequency': 'c', 'synset': 'blazer.n.01', 'synonyms': ['blazer', 'sport_jacket', 'sport_coat', 'sports_jacket', 'sports_coat'], 'id': 111, 'def': 'lightweight jacket; often striped in the colors of a club or school', 'name': 'blazer'}, {'frequency': 'f', 'synset': 'blender.n.01', 'synonyms': ['blender', 'liquidizer', 'liquidiser'], 'id': 112, 'def': 'an electrically powered mixer that mix or chop or liquefy foods', 'name': 'blender'}, {'frequency': 'r', 'synset': 'blimp.n.02', 'synonyms': ['blimp'], 'id': 113, 'def': 'a small nonrigid airship used for observation or as a barrage balloon', 'name': 'blimp'}, {'frequency': 'f', 'synset': 'blinker.n.01', 'synonyms': ['blinker', 'flasher'], 'id': 114, 'def': 'a light that flashes on and off; used as a signal or to send messages', 'name': 'blinker'}, {'frequency': 'f', 'synset': 'blouse.n.01', 'synonyms': ['blouse'], 'id': 115, 'def': 'a top worn by women', 'name': 'blouse'}, {'frequency': 'f', 'synset': 'blueberry.n.02', 'synonyms': ['blueberry'], 'id': 116, 'def': 'sweet edible dark-blue berries of blueberry plants', 'name': 'blueberry'}, {'frequency': 'r', 'synset': 'board.n.09', 'synonyms': ['gameboard'], 'id': 117, 'def': 'a flat portable surface (usually rectangular) designed for board games', 'name': 'gameboard'}, {'frequency': 'f', 'synset': 'boat.n.01', 'synonyms': ['boat', 'ship_(boat)'], 'id': 118, 'def': 'a vessel for travel on water', 'name': 'boat'}, {'frequency': 'r', 'synset': 'bob.n.05', 'synonyms': ['bob', 'bobber', 'bobfloat'], 'id': 119, 'def': 'a small float usually made of cork; attached to a fishing line', 'name': 'bob'}, {'frequency': 'c', 'synset': 'bobbin.n.01', 'synonyms': ['bobbin', 'spool', 'reel'], 'id': 120, 'def': 'a thing around which thread/tape/film or other flexible materials can be wound', 'name': 'bobbin'}, {'frequency': 'c', 'synset': 'bobby_pin.n.01', 'synonyms': ['bobby_pin', 'hairgrip'], 'id': 121, 'def': 'a flat wire hairpin used to hold bobbed hair in place', 'name': 'bobby_pin'}, {'frequency': 'c', 'synset': 'boiled_egg.n.01', 'synonyms': ['boiled_egg', 'coddled_egg'], 'id': 122, 'def': 'egg cooked briefly in the shell in gently boiling water', 'name': 'boiled_egg'}, {'frequency': 'r', 'synset': 'bolo_tie.n.01', 'synonyms': ['bolo_tie', 'bolo', 'bola_tie', 'bola'], 'id': 123, 'def': 'a cord fastened around the neck with an ornamental clasp and worn as a necktie', 'name': 'bolo_tie'}, {'frequency': 'c', 'synset': 'bolt.n.03', 'synonyms': ['deadbolt'], 'id': 124, 'def': 'the part of a lock that is engaged or withdrawn with a key', 'name': 'deadbolt'}, {'frequency': 'f', 'synset': 'bolt.n.06', 'synonyms': ['bolt'], 'id': 125, 'def': 'a screw that screws into a nut to form a fastener', 'name': 'bolt'}, {'frequency': 'r', 'synset': 'bonnet.n.01', 'synonyms': ['bonnet'], 'id': 126, 'def': 'a hat tied under the chin', 'name': 'bonnet'}, {'frequency': 'f', 'synset': 'book.n.01', 'synonyms': ['book'], 'id': 127, 'def': 'a written work or composition that has been published', 'name': 'book'}, {'frequency': 'c', 'synset': 'bookcase.n.01', 'synonyms': ['bookcase'], 'id': 128, 'def': 'a piece of furniture with shelves for storing books', 'name': 'bookcase'}, {'frequency': 'c', 'synset': 'booklet.n.01', 'synonyms': ['booklet', 'brochure', 'leaflet', 'pamphlet'], 'id': 129, 'def': 'a small book usually having a paper cover', 'name': 'booklet'}, {'frequency': 'r', 'synset': 'bookmark.n.01', 'synonyms': ['bookmark', 'bookmarker'], 'id': 130, 'def': 'a marker (a piece of paper or ribbon) placed between the pages of a book', 'name': 'bookmark'}, {'frequency': 'r', 'synset': 'boom.n.04', 'synonyms': ['boom_microphone', 'microphone_boom'], 'id': 131, 'def': 'a pole carrying an overhead microphone projected over a film or tv set', 'name': 'boom_microphone'}, {'frequency': 'f', 'synset': 'boot.n.01', 'synonyms': ['boot'], 'id': 132, 'def': 'footwear that covers the whole foot and lower leg', 'name': 'boot'}, {'frequency': 'f', 'synset': 'bottle.n.01', 'synonyms': ['bottle'], 'id': 133, 'def': 'a glass or plastic vessel used for storing drinks or other liquids', 'name': 'bottle'}, {'frequency': 'c', 'synset': 'bottle_opener.n.01', 'synonyms': ['bottle_opener'], 'id': 134, 'def': 'an opener for removing caps or corks from bottles', 'name': 'bottle_opener'}, {'frequency': 'c', 'synset': 'bouquet.n.01', 'synonyms': ['bouquet'], 'id': 135, 'def': 'an arrangement of flowers that is usually given as a present', 'name': 'bouquet'}, {'frequency': 'r', 'synset': 'bow.n.04', 'synonyms': ['bow_(weapon)'], 'id': 136, 'def': 'a weapon for shooting arrows', 'name': 'bow_(weapon)'}, {'frequency': 'f', 'synset': 'bow.n.08', 'synonyms': ['bow_(decorative_ribbons)'], 'id': 137, 'def': 'a decorative interlacing of ribbons', 'name': 'bow_(decorative_ribbons)'}, {'frequency': 'f', 'synset': 'bow_tie.n.01', 'synonyms': ['bow-tie', 'bowtie'], 'id': 138, 'def': "a man's tie that ties in a bow", 'name': 'bow-tie'}, {'frequency': 'f', 'synset': 'bowl.n.03', 'synonyms': ['bowl'], 'id': 139, 'def': 'a dish that is round and open at the top for serving foods', 'name': 'bowl'}, {'frequency': 'r', 'synset': 'bowl.n.08', 'synonyms': ['pipe_bowl'], 'id': 140, 'def': 'a small round container that is open at the top for holding tobacco', 'name': 'pipe_bowl'}, {'frequency': 'c', 'synset': 'bowler_hat.n.01', 'synonyms': ['bowler_hat', 'bowler', 'derby_hat', 'derby', 'plug_hat'], 'id': 141, 'def': 'a felt hat that is round and hard with a narrow brim', 'name': 'bowler_hat'}, {'frequency': 'r', 'synset': 'bowling_ball.n.01', 'synonyms': ['bowling_ball'], 'id': 142, 'def': 'a large ball with finger holes used in the sport of bowling', 'name': 'bowling_ball'}, {'frequency': 'f', 'synset': 'box.n.01', 'synonyms': ['box'], 'id': 143, 'def': 'a (usually rectangular) container; may have a lid', 'name': 'box'}, {'frequency': 'r', 'synset': 'boxing_glove.n.01', 'synonyms': ['boxing_glove'], 'id': 144, 'def': 'large glove coverings the fists of a fighter worn for the sport of boxing', 'name': 'boxing_glove'}, {'frequency': 'c', 'synset': 'brace.n.06', 'synonyms': ['suspenders'], 'id': 145, 'def': 'elastic straps that hold trousers up (usually used in the plural)', 'name': 'suspenders'}, {'frequency': 'f', 'synset': 'bracelet.n.02', 'synonyms': ['bracelet', 'bangle'], 'id': 146, 'def': 'jewelry worn around the wrist for decoration', 'name': 'bracelet'}, {'frequency': 'r', 'synset': 'brass.n.07', 'synonyms': ['brass_plaque'], 'id': 147, 'def': 'a memorial made of brass', 'name': 'brass_plaque'}, {'frequency': 'c', 'synset': 'brassiere.n.01', 'synonyms': ['brassiere', 'bra', 'bandeau'], 'id': 148, 'def': 'an undergarment worn by women to support their breasts', 'name': 'brassiere'}, {'frequency': 'c', 'synset': 'bread-bin.n.01', 'synonyms': ['bread-bin', 'breadbox'], 'id': 149, 'def': 'a container used to keep bread or cake in', 'name': 'bread-bin'}, {'frequency': 'f', 'synset': 'bread.n.01', 'synonyms': ['bread'], 'id': 150, 'def': 'food made from dough of flour or meal and usually raised with yeast or baking powder and then baked', 'name': 'bread'}, {'frequency': 'r', 'synset': 'breechcloth.n.01', 'synonyms': ['breechcloth', 'breechclout', 'loincloth'], 'id': 151, 'def': 'a garment that provides covering for the loins', 'name': 'breechcloth'}, {'frequency': 'f', 'synset': 'bridal_gown.n.01', 'synonyms': ['bridal_gown', 'wedding_gown', 'wedding_dress'], 'id': 152, 'def': 'a gown worn by the bride at a wedding', 'name': 'bridal_gown'}, {'frequency': 'c', 'synset': 'briefcase.n.01', 'synonyms': ['briefcase'], 'id': 153, 'def': 'a case with a handle; for carrying papers or files or books', 'name': 'briefcase'}, {'frequency': 'f', 'synset': 'broccoli.n.01', 'synonyms': ['broccoli'], 'id': 154, 'def': 'plant with dense clusters of tight green flower buds', 'name': 'broccoli'}, {'frequency': 'r', 'synset': 'brooch.n.01', 'synonyms': ['broach'], 'id': 155, 'def': 'a decorative pin worn by women', 'name': 'broach'}, {'frequency': 'c', 'synset': 'broom.n.01', 'synonyms': ['broom'], 'id': 156, 'def': 'bundle of straws or twigs attached to a long handle; used for cleaning', 'name': 'broom'}, {'frequency': 'c', 'synset': 'brownie.n.03', 'synonyms': ['brownie'], 'id': 157, 'def': 'square or bar of very rich chocolate cake usually with nuts', 'name': 'brownie'}, {'frequency': 'c', 'synset': 'brussels_sprouts.n.01', 'synonyms': ['brussels_sprouts'], 'id': 158, 'def': 'the small edible cabbage-like buds growing along a stalk', 'name': 'brussels_sprouts'}, {'frequency': 'r', 'synset': 'bubble_gum.n.01', 'synonyms': ['bubble_gum'], 'id': 159, 'def': 'a kind of chewing gum that can be blown into bubbles', 'name': 'bubble_gum'}, {'frequency': 'f', 'synset': 'bucket.n.01', 'synonyms': ['bucket', 'pail'], 'id': 160, 'def': 'a roughly cylindrical vessel that is open at the top', 'name': 'bucket'}, {'frequency': 'r', 'synset': 'buggy.n.01', 'synonyms': ['horse_buggy'], 'id': 161, 'def': 'a small lightweight carriage; drawn by a single horse', 'name': 'horse_buggy'}, {'frequency': 'c', 'synset': 'bull.n.11', 'synonyms': ['horned_cow'], 'id': 162, 'def': 'a cow with horns', 'name': 'bull'}, {'frequency': 'c', 'synset': 'bulldog.n.01', 'synonyms': ['bulldog'], 'id': 163, 'def': 'a thickset short-haired dog with a large head and strong undershot lower jaw', 'name': 'bulldog'}, {'frequency': 'r', 'synset': 'bulldozer.n.01', 'synonyms': ['bulldozer', 'dozer'], 'id': 164, 'def': 'large powerful tractor; a large blade in front flattens areas of ground', 'name': 'bulldozer'}, {'frequency': 'c', 'synset': 'bullet_train.n.01', 'synonyms': ['bullet_train'], 'id': 165, 'def': 'a high-speed passenger train', 'name': 'bullet_train'}, {'frequency': 'c', 'synset': 'bulletin_board.n.02', 'synonyms': ['bulletin_board', 'notice_board'], 'id': 166, 'def': 'a board that hangs on a wall; displays announcements', 'name': 'bulletin_board'}, {'frequency': 'r', 'synset': 'bulletproof_vest.n.01', 'synonyms': ['bulletproof_vest'], 'id': 167, 'def': 'a vest capable of resisting the impact of a bullet', 'name': 'bulletproof_vest'}, {'frequency': 'c', 'synset': 'bullhorn.n.01', 'synonyms': ['bullhorn', 'megaphone'], 'id': 168, 'def': 'a portable loudspeaker with built-in microphone and amplifier', 'name': 'bullhorn'}, {'frequency': 'f', 'synset': 'bun.n.01', 'synonyms': ['bun', 'roll'], 'id': 169, 'def': 'small rounded bread either plain or sweet', 'name': 'bun'}, {'frequency': 'c', 'synset': 'bunk_bed.n.01', 'synonyms': ['bunk_bed'], 'id': 170, 'def': 'beds built one above the other', 'name': 'bunk_bed'}, {'frequency': 'f', 'synset': 'buoy.n.01', 'synonyms': ['buoy'], 'id': 171, 'def': 'a float attached by rope to the seabed to mark channels in a harbor or underwater hazards', 'name': 'buoy'}, {'frequency': 'r', 'synset': 'burrito.n.01', 'synonyms': ['burrito'], 'id': 172, 'def': 'a flour tortilla folded around a filling', 'name': 'burrito'}, {'frequency': 'f', 'synset': 'bus.n.01', 'synonyms': ['bus_(vehicle)', 'autobus', 'charabanc', 'double-decker', 'motorbus', 'motorcoach'], 'id': 173, 'def': 'a vehicle carrying many passengers; used for public transport', 'name': 'bus_(vehicle)'}, {'frequency': 'c', 'synset': 'business_card.n.01', 'synonyms': ['business_card'], 'id': 174, 'def': "a card on which are printed the person's name and business affiliation", 'name': 'business_card'}, {'frequency': 'f', 'synset': 'butter.n.01', 'synonyms': ['butter'], 'id': 175, 'def': 'an edible emulsion of fat globules made by churning milk or cream; for cooking and table use', 'name': 'butter'}, {'frequency': 'c', 'synset': 'butterfly.n.01', 'synonyms': ['butterfly'], 'id': 176, 'def': 'insect typically having a slender body with knobbed antennae and broad colorful wings', 'name': 'butterfly'}, {'frequency': 'f', 'synset': 'button.n.01', 'synonyms': ['button'], 'id': 177, 'def': 'a round fastener sewn to shirts and coats etc to fit through buttonholes', 'name': 'button'}, {'frequency': 'f', 'synset': 'cab.n.03', 'synonyms': ['cab_(taxi)', 'taxi', 'taxicab'], 'id': 178, 'def': 'a car that takes passengers where they want to go in exchange for money', 'name': 'cab_(taxi)'}, {'frequency': 'r', 'synset': 'cabana.n.01', 'synonyms': ['cabana'], 'id': 179, 'def': 'a small tent used as a dressing room beside the sea or a swimming pool', 'name': 'cabana'}, {'frequency': 'c', 'synset': 'cabin_car.n.01', 'synonyms': ['cabin_car', 'caboose'], 'id': 180, 'def': 'a car on a freight train for use of the train crew; usually the last car on the train', 'name': 'cabin_car'}, {'frequency': 'f', 'synset': 'cabinet.n.01', 'synonyms': ['cabinet'], 'id': 181, 'def': 'a piece of furniture resembling a cupboard with doors and shelves and drawers', 'name': 'cabinet'}, {'frequency': 'r', 'synset': 'cabinet.n.03', 'synonyms': ['locker', 'storage_locker'], 'id': 182, 'def': 'a storage compartment for clothes and valuables; usually it has a lock', 'name': 'locker'}, {'frequency': 'f', 'synset': 'cake.n.03', 'synonyms': ['cake'], 'id': 183, 'def': 'baked goods made from or based on a mixture of flour, sugar, eggs, and fat', 'name': 'cake'}, {'frequency': 'c', 'synset': 'calculator.n.02', 'synonyms': ['calculator'], 'id': 184, 'def': 'a small machine that is used for mathematical calculations', 'name': 'calculator'}, {'frequency': 'f', 'synset': 'calendar.n.02', 'synonyms': ['calendar'], 'id': 185, 'def': 'a list or register of events (appointments/social events/court cases, etc)', 'name': 'calendar'}, {'frequency': 'c', 'synset': 'calf.n.01', 'synonyms': ['calf'], 'id': 186, 'def': 'young of domestic cattle', 'name': 'calf'}, {'frequency': 'c', 'synset': 'camcorder.n.01', 'synonyms': ['camcorder'], 'id': 187, 'def': 'a portable television camera and videocassette recorder', 'name': 'camcorder'}, {'frequency': 'c', 'synset': 'camel.n.01', 'synonyms': ['camel'], 'id': 188, 'def': 'cud-chewing mammal used as a draft or saddle animal in desert regions', 'name': 'camel'}, {'frequency': 'f', 'synset': 'camera.n.01', 'synonyms': ['camera'], 'id': 189, 'def': 'equipment for taking photographs', 'name': 'camera'}, {'frequency': 'c', 'synset': 'camera_lens.n.01', 'synonyms': ['camera_lens'], 'id': 190, 'def': 'a lens that focuses the image in a camera', 'name': 'camera_lens'}, {'frequency': 'c', 'synset': 'camper.n.02', 'synonyms': ['camper_(vehicle)', 'camping_bus', 'motor_home'], 'id': 191, 'def': 'a recreational vehicle equipped for camping out while traveling', 'name': 'camper_(vehicle)'}, {'frequency': 'f', 'synset': 'can.n.01', 'synonyms': ['can', 'tin_can'], 'id': 192, 'def': 'airtight sealed metal container for food or drink or paint etc.', 'name': 'can'}, {'frequency': 'c', 'synset': 'can_opener.n.01', 'synonyms': ['can_opener', 'tin_opener'], 'id': 193, 'def': 'a device for cutting cans open', 'name': 'can_opener'}, {'frequency': 'f', 'synset': 'candle.n.01', 'synonyms': ['candle', 'candlestick'], 'id': 194, 'def': 'stick of wax with a wick in the middle', 'name': 'candle'}, {'frequency': 'f', 'synset': 'candlestick.n.01', 'synonyms': ['candle_holder'], 'id': 195, 'def': 'a holder with sockets for candles', 'name': 'candle_holder'}, {'frequency': 'r', 'synset': 'candy_bar.n.01', 'synonyms': ['candy_bar'], 'id': 196, 'def': 'a candy shaped as a bar', 'name': 'candy_bar'}, {'frequency': 'c', 'synset': 'candy_cane.n.01', 'synonyms': ['candy_cane'], 'id': 197, 'def': 'a hard candy in the shape of a rod (usually with stripes)', 'name': 'candy_cane'}, {'frequency': 'c', 'synset': 'cane.n.01', 'synonyms': ['walking_cane'], 'id': 198, 'def': 'a stick that people can lean on to help them walk', 'name': 'walking_cane'}, {'frequency': 'c', 'synset': 'canister.n.02', 'synonyms': ['canister', 'cannister'], 'id': 199, 'def': 'metal container for storing dry foods such as tea or flour', 'name': 'canister'}, {'frequency': 'c', 'synset': 'canoe.n.01', 'synonyms': ['canoe'], 'id': 200, 'def': 'small and light boat; pointed at both ends; propelled with a paddle', 'name': 'canoe'}, {'frequency': 'c', 'synset': 'cantaloup.n.02', 'synonyms': ['cantaloup', 'cantaloupe'], 'id': 201, 'def': 'the fruit of a cantaloup vine; small to medium-sized melon with yellowish flesh', 'name': 'cantaloup'}, {'frequency': 'r', 'synset': 'canteen.n.01', 'synonyms': ['canteen'], 'id': 202, 'def': 'a flask for carrying water; used by soldiers or travelers', 'name': 'canteen'}, {'frequency': 'f', 'synset': 'cap.n.01', 'synonyms': ['cap_(headwear)'], 'id': 203, 'def': 'a tight-fitting headwear', 'name': 'cap_(headwear)'}, {'frequency': 'f', 'synset': 'cap.n.02', 'synonyms': ['bottle_cap', 'cap_(container_lid)'], 'id': 204, 'def': 'a top (as for a bottle)', 'name': 'bottle_cap'}, {'frequency': 'c', 'synset': 'cape.n.02', 'synonyms': ['cape'], 'id': 205, 'def': 'a sleeveless garment like a cloak but shorter', 'name': 'cape'}, {'frequency': 'c', 'synset': 'cappuccino.n.01', 'synonyms': ['cappuccino', 'coffee_cappuccino'], 'id': 206, 'def': 'equal parts of espresso and steamed milk', 'name': 'cappuccino'}, {'frequency': 'f', 'synset': 'car.n.01', 'synonyms': ['car_(automobile)', 'auto_(automobile)', 'automobile'], 'id': 207, 'def': 'a motor vehicle with four wheels', 'name': 'car_(automobile)'}, {'frequency': 'f', 'synset': 'car.n.02', 'synonyms': ['railcar_(part_of_a_train)', 'railway_car_(part_of_a_train)', 'railroad_car_(part_of_a_train)'], 'id': 208, 'def': 'a wheeled vehicle adapted to the rails of railroad (mark each individual railcar separately)', 'name': 'railcar_(part_of_a_train)'}, {'frequency': 'r', 'synset': 'car.n.04', 'synonyms': ['elevator_car'], 'id': 209, 'def': 'where passengers ride up and down', 'name': 'elevator_car'}, {'frequency': 'r', 'synset': 'car_battery.n.01', 'synonyms': ['car_battery', 'automobile_battery'], 'id': 210, 'def': 'a battery in a motor vehicle', 'name': 'car_battery'}, {'frequency': 'c', 'synset': 'card.n.02', 'synonyms': ['identity_card'], 'id': 211, 'def': 'a card certifying the identity of the bearer', 'name': 'identity_card'}, {'frequency': 'c', 'synset': 'card.n.03', 'synonyms': ['card'], 'id': 212, 'def': 'a rectangular piece of paper used to send messages (e.g. greetings or pictures)', 'name': 'card'}, {'frequency': 'c', 'synset': 'cardigan.n.01', 'synonyms': ['cardigan'], 'id': 213, 'def': 'knitted jacket that is fastened up the front with buttons or a zipper', 'name': 'cardigan'}, {'frequency': 'r', 'synset': 'cargo_ship.n.01', 'synonyms': ['cargo_ship', 'cargo_vessel'], 'id': 214, 'def': 'a ship designed to carry cargo', 'name': 'cargo_ship'}, {'frequency': 'r', 'synset': 'carnation.n.01', 'synonyms': ['carnation'], 'id': 215, 'def': 'plant with pink to purple-red spice-scented usually double flowers', 'name': 'carnation'}, {'frequency': 'c', 'synset': 'carriage.n.02', 'synonyms': ['horse_carriage'], 'id': 216, 'def': 'a vehicle with wheels drawn by one or more horses', 'name': 'horse_carriage'}, {'frequency': 'f', 'synset': 'carrot.n.01', 'synonyms': ['carrot'], 'id': 217, 'def': 'deep orange edible root of the cultivated carrot plant', 'name': 'carrot'}, {'frequency': 'f', 'synset': 'carryall.n.01', 'synonyms': ['tote_bag'], 'id': 218, 'def': 'a capacious bag or basket', 'name': 'tote_bag'}, {'frequency': 'c', 'synset': 'cart.n.01', 'synonyms': ['cart'], 'id': 219, 'def': 'a heavy open wagon usually having two wheels and drawn by an animal', 'name': 'cart'}, {'frequency': 'c', 'synset': 'carton.n.02', 'synonyms': ['carton'], 'id': 220, 'def': 'a container made of cardboard for holding food or drink', 'name': 'carton'}, {'frequency': 'c', 'synset': 'cash_register.n.01', 'synonyms': ['cash_register', 'register_(for_cash_transactions)'], 'id': 221, 'def': 'a cashbox with an adding machine to register transactions', 'name': 'cash_register'}, {'frequency': 'r', 'synset': 'casserole.n.01', 'synonyms': ['casserole'], 'id': 222, 'def': 'food cooked and served in a casserole', 'name': 'casserole'}, {'frequency': 'r', 'synset': 'cassette.n.01', 'synonyms': ['cassette'], 'id': 223, 'def': 'a container that holds a magnetic tape used for recording or playing sound or video', 'name': 'cassette'}, {'frequency': 'c', 'synset': 'cast.n.05', 'synonyms': ['cast', 'plaster_cast', 'plaster_bandage'], 'id': 224, 'def': 'bandage consisting of a firm covering that immobilizes broken bones while they heal', 'name': 'cast'}, {'frequency': 'f', 'synset': 'cat.n.01', 'synonyms': ['cat'], 'id': 225, 'def': 'a domestic house cat', 'name': 'cat'}, {'frequency': 'f', 'synset': 'cauliflower.n.02', 'synonyms': ['cauliflower'], 'id': 226, 'def': 'edible compact head of white undeveloped flowers', 'name': 'cauliflower'}, {'frequency': 'c', 'synset': 'cayenne.n.02', 'synonyms': ['cayenne_(spice)', 'cayenne_pepper_(spice)', 'red_pepper_(spice)'], 'id': 227, 'def': 'ground pods and seeds of pungent red peppers of the genus Capsicum', 'name': 'cayenne_(spice)'}, {'frequency': 'c', 'synset': 'cd_player.n.01', 'synonyms': ['CD_player'], 'id': 228, 'def': 'electronic equipment for playing compact discs (CDs)', 'name': 'CD_player'}, {'frequency': 'f', 'synset': 'celery.n.01', 'synonyms': ['celery'], 'id': 229, 'def': 'widely cultivated herb with aromatic leaf stalks that are eaten raw or cooked', 'name': 'celery'}, {'frequency': 'f', 'synset': 'cellular_telephone.n.01', 'synonyms': ['cellular_telephone', 'cellular_phone', 'cellphone', 'mobile_phone', 'smart_phone'], 'id': 230, 'def': 'a hand-held mobile telephone', 'name': 'cellular_telephone'}, {'frequency': 'r', 'synset': 'chain_mail.n.01', 'synonyms': ['chain_mail', 'ring_mail', 'chain_armor', 'chain_armour', 'ring_armor', 'ring_armour'], 'id': 231, 'def': '(Middle Ages) flexible armor made of interlinked metal rings', 'name': 'chain_mail'}, {'frequency': 'f', 'synset': 'chair.n.01', 'synonyms': ['chair'], 'id': 232, 'def': 'a seat for one person, with a support for the back', 'name': 'chair'}, {'frequency': 'r', 'synset': 'chaise_longue.n.01', 'synonyms': ['chaise_longue', 'chaise', 'daybed'], 'id': 233, 'def': 'a long chair; for reclining', 'name': 'chaise_longue'}, {'frequency': 'r', 'synset': 'chalice.n.01', 'synonyms': ['chalice'], 'id': 234, 'def': 'a bowl-shaped drinking vessel; especially the Eucharistic cup', 'name': 'chalice'}, {'frequency': 'f', 'synset': 'chandelier.n.01', 'synonyms': ['chandelier'], 'id': 235, 'def': 'branched lighting fixture; often ornate; hangs from the ceiling', 'name': 'chandelier'}, {'frequency': 'r', 'synset': 'chap.n.04', 'synonyms': ['chap'], 'id': 236, 'def': 'leather leggings without a seat; worn over trousers by cowboys to protect their legs', 'name': 'chap'}, {'frequency': 'r', 'synset': 'checkbook.n.01', 'synonyms': ['checkbook', 'chequebook'], 'id': 237, 'def': 'a book issued to holders of checking accounts', 'name': 'checkbook'}, {'frequency': 'r', 'synset': 'checkerboard.n.01', 'synonyms': ['checkerboard'], 'id': 238, 'def': 'a board having 64 squares of two alternating colors', 'name': 'checkerboard'}, {'frequency': 'c', 'synset': 'cherry.n.03', 'synonyms': ['cherry'], 'id': 239, 'def': 'a red fruit with a single hard stone', 'name': 'cherry'}, {'frequency': 'r', 'synset': 'chessboard.n.01', 'synonyms': ['chessboard'], 'id': 240, 'def': 'a checkerboard used to play chess', 'name': 'chessboard'}, {'frequency': 'c', 'synset': 'chicken.n.02', 'synonyms': ['chicken_(animal)'], 'id': 241, 'def': 'a domestic fowl bred for flesh or eggs', 'name': 'chicken_(animal)'}, {'frequency': 'c', 'synset': 'chickpea.n.01', 'synonyms': ['chickpea', 'garbanzo'], 'id': 242, 'def': 'the seed of the chickpea plant; usually dried', 'name': 'chickpea'}, {'frequency': 'c', 'synset': 'chili.n.02', 'synonyms': ['chili_(vegetable)', 'chili_pepper_(vegetable)', 'chilli_(vegetable)', 'chilly_(vegetable)', 'chile_(vegetable)'], 'id': 243, 'def': 'very hot and finely tapering pepper of special pungency', 'name': 'chili_(vegetable)'}, {'frequency': 'r', 'synset': 'chime.n.01', 'synonyms': ['chime', 'gong'], 'id': 244, 'def': 'an instrument consisting of a set of bells that are struck with a hammer', 'name': 'chime'}, {'frequency': 'r', 'synset': 'chinaware.n.01', 'synonyms': ['chinaware'], 'id': 245, 'def': 'dishware made of high quality porcelain', 'name': 'chinaware'}, {'frequency': 'c', 'synset': 'chip.n.04', 'synonyms': ['crisp_(potato_chip)', 'potato_chip'], 'id': 246, 'def': 'a thin crisp slice of potato fried in deep fat', 'name': 'crisp_(potato_chip)'}, {'frequency': 'r', 'synset': 'chip.n.06', 'synonyms': ['poker_chip'], 'id': 247, 'def': 'a small disk-shaped counter used to represent money when gambling', 'name': 'poker_chip'}, {'frequency': 'c', 'synset': 'chocolate_bar.n.01', 'synonyms': ['chocolate_bar'], 'id': 248, 'def': 'a bar of chocolate candy', 'name': 'chocolate_bar'}, {'frequency': 'c', 'synset': 'chocolate_cake.n.01', 'synonyms': ['chocolate_cake'], 'id': 249, 'def': 'cake containing chocolate', 'name': 'chocolate_cake'}, {'frequency': 'r', 'synset': 'chocolate_milk.n.01', 'synonyms': ['chocolate_milk'], 'id': 250, 'def': 'milk flavored with chocolate syrup', 'name': 'chocolate_milk'}, {'frequency': 'r', 'synset': 'chocolate_mousse.n.01', 'synonyms': ['chocolate_mousse'], 'id': 251, 'def': 'dessert mousse made with chocolate', 'name': 'chocolate_mousse'}, {'frequency': 'f', 'synset': 'choker.n.03', 'synonyms': ['choker', 'collar', 'neckband'], 'id': 252, 'def': 'shirt collar, animal collar, or tight-fitting necklace', 'name': 'choker'}, {'frequency': 'f', 'synset': 'chopping_board.n.01', 'synonyms': ['chopping_board', 'cutting_board', 'chopping_block'], 'id': 253, 'def': 'a wooden board where meats or vegetables can be cut', 'name': 'chopping_board'}, {'frequency': 'f', 'synset': 'chopstick.n.01', 'synonyms': ['chopstick'], 'id': 254, 'def': 'one of a pair of slender sticks used as oriental tableware to eat food with', 'name': 'chopstick'}, {'frequency': 'f', 'synset': 'christmas_tree.n.05', 'synonyms': ['Christmas_tree'], 'id': 255, 'def': 'an ornamented evergreen used as a Christmas decoration', 'name': 'Christmas_tree'}, {'frequency': 'c', 'synset': 'chute.n.02', 'synonyms': ['slide'], 'id': 256, 'def': 'sloping channel through which things can descend', 'name': 'slide'}, {'frequency': 'r', 'synset': 'cider.n.01', 'synonyms': ['cider', 'cyder'], 'id': 257, 'def': 'a beverage made from juice pressed from apples', 'name': 'cider'}, {'frequency': 'r', 'synset': 'cigar_box.n.01', 'synonyms': ['cigar_box'], 'id': 258, 'def': 'a box for holding cigars', 'name': 'cigar_box'}, {'frequency': 'f', 'synset': 'cigarette.n.01', 'synonyms': ['cigarette'], 'id': 259, 'def': 'finely ground tobacco wrapped in paper; for smoking', 'name': 'cigarette'}, {'frequency': 'c', 'synset': 'cigarette_case.n.01', 'synonyms': ['cigarette_case', 'cigarette_pack'], 'id': 260, 'def': 'a small flat case for holding cigarettes', 'name': 'cigarette_case'}, {'frequency': 'f', 'synset': 'cistern.n.02', 'synonyms': ['cistern', 'water_tank'], 'id': 261, 'def': 'a tank that holds the water used to flush a toilet', 'name': 'cistern'}, {'frequency': 'r', 'synset': 'clarinet.n.01', 'synonyms': ['clarinet'], 'id': 262, 'def': 'a single-reed instrument with a straight tube', 'name': 'clarinet'}, {'frequency': 'c', 'synset': 'clasp.n.01', 'synonyms': ['clasp'], 'id': 263, 'def': 'a fastener (as a buckle or hook) that is used to hold two things together', 'name': 'clasp'}, {'frequency': 'c', 'synset': 'cleansing_agent.n.01', 'synonyms': ['cleansing_agent', 'cleanser', 'cleaner'], 'id': 264, 'def': 'a preparation used in cleaning something', 'name': 'cleansing_agent'}, {'frequency': 'r', 'synset': 'cleat.n.02', 'synonyms': ['cleat_(for_securing_rope)'], 'id': 265, 'def': 'a fastener (usually with two projecting horns) around which a rope can be secured', 'name': 'cleat_(for_securing_rope)'}, {'frequency': 'r', 'synset': 'clementine.n.01', 'synonyms': ['clementine'], 'id': 266, 'def': 'a variety of mandarin orange', 'name': 'clementine'}, {'frequency': 'c', 'synset': 'clip.n.03', 'synonyms': ['clip'], 'id': 267, 'def': 'any of various small fasteners used to hold loose articles together', 'name': 'clip'}, {'frequency': 'c', 'synset': 'clipboard.n.01', 'synonyms': ['clipboard'], 'id': 268, 'def': 'a small writing board with a clip at the top for holding papers', 'name': 'clipboard'}, {'frequency': 'r', 'synset': 'clipper.n.03', 'synonyms': ['clippers_(for_plants)'], 'id': 269, 'def': 'shears for cutting grass or shrubbery (often used in the plural)', 'name': 'clippers_(for_plants)'}, {'frequency': 'r', 'synset': 'cloak.n.02', 'synonyms': ['cloak'], 'id': 270, 'def': 'a loose outer garment', 'name': 'cloak'}, {'frequency': 'f', 'synset': 'clock.n.01', 'synonyms': ['clock', 'timepiece', 'timekeeper'], 'id': 271, 'def': 'a timepiece that shows the time of day', 'name': 'clock'}, {'frequency': 'f', 'synset': 'clock_tower.n.01', 'synonyms': ['clock_tower'], 'id': 272, 'def': 'a tower with a large clock visible high up on an outside face', 'name': 'clock_tower'}, {'frequency': 'c', 'synset': 'clothes_hamper.n.01', 'synonyms': ['clothes_hamper', 'laundry_basket', 'clothes_basket'], 'id': 273, 'def': 'a hamper that holds dirty clothes to be washed or wet clothes to be dried', 'name': 'clothes_hamper'}, {'frequency': 'c', 'synset': 'clothespin.n.01', 'synonyms': ['clothespin', 'clothes_peg'], 'id': 274, 'def': 'wood or plastic fastener; for holding clothes on a clothesline', 'name': 'clothespin'}, {'frequency': 'r', 'synset': 'clutch_bag.n.01', 'synonyms': ['clutch_bag'], 'id': 275, 'def': "a woman's strapless purse that is carried in the hand", 'name': 'clutch_bag'}, {'frequency': 'f', 'synset': 'coaster.n.03', 'synonyms': ['coaster'], 'id': 276, 'def': 'a covering (plate or mat) that protects the surface of a table', 'name': 'coaster'}, {'frequency': 'f', 'synset': 'coat.n.01', 'synonyms': ['coat'], 'id': 277, 'def': 'an outer garment that has sleeves and covers the body from shoulder down', 'name': 'coat'}, {'frequency': 'c', 'synset': 'coat_hanger.n.01', 'synonyms': ['coat_hanger', 'clothes_hanger', 'dress_hanger'], 'id': 278, 'def': "a hanger that is shaped like a person's shoulders", 'name': 'coat_hanger'}, {'frequency': 'c', 'synset': 'coatrack.n.01', 'synonyms': ['coatrack', 'hatrack'], 'id': 279, 'def': 'a rack with hooks for temporarily holding coats and hats', 'name': 'coatrack'}, {'frequency': 'c', 'synset': 'cock.n.04', 'synonyms': ['cock', 'rooster'], 'id': 280, 'def': 'adult male chicken', 'name': 'cock'}, {'frequency': 'r', 'synset': 'cockroach.n.01', 'synonyms': ['cockroach'], 'id': 281, 'def': 'any of numerous chiefly nocturnal insects; some are domestic pests', 'name': 'cockroach'}, {'frequency': 'r', 'synset': 'cocoa.n.01', 'synonyms': ['cocoa_(beverage)', 'hot_chocolate_(beverage)', 'drinking_chocolate'], 'id': 282, 'def': 'a beverage made from cocoa powder and milk and sugar; usually drunk hot', 'name': 'cocoa_(beverage)'}, {'frequency': 'c', 'synset': 'coconut.n.02', 'synonyms': ['coconut', 'cocoanut'], 'id': 283, 'def': 'large hard-shelled brown oval nut with a fibrous husk', 'name': 'coconut'}, {'frequency': 'f', 'synset': 'coffee_maker.n.01', 'synonyms': ['coffee_maker', 'coffee_machine'], 'id': 284, 'def': 'a kitchen appliance for brewing coffee automatically', 'name': 'coffee_maker'}, {'frequency': 'f', 'synset': 'coffee_table.n.01', 'synonyms': ['coffee_table', 'cocktail_table'], 'id': 285, 'def': 'low table where magazines can be placed and coffee or cocktails are served', 'name': 'coffee_table'}, {'frequency': 'c', 'synset': 'coffeepot.n.01', 'synonyms': ['coffeepot'], 'id': 286, 'def': 'tall pot in which coffee is brewed', 'name': 'coffeepot'}, {'frequency': 'r', 'synset': 'coil.n.05', 'synonyms': ['coil'], 'id': 287, 'def': 'tubing that is wound in a spiral', 'name': 'coil'}, {'frequency': 'c', 'synset': 'coin.n.01', 'synonyms': ['coin'], 'id': 288, 'def': 'a flat metal piece (usually a disc) used as money', 'name': 'coin'}, {'frequency': 'c', 'synset': 'colander.n.01', 'synonyms': ['colander', 'cullender'], 'id': 289, 'def': 'bowl-shaped strainer; used to wash or drain foods', 'name': 'colander'}, {'frequency': 'c', 'synset': 'coleslaw.n.01', 'synonyms': ['coleslaw', 'slaw'], 'id': 290, 'def': 'basically shredded cabbage', 'name': 'coleslaw'}, {'frequency': 'r', 'synset': 'coloring_material.n.01', 'synonyms': ['coloring_material', 'colouring_material'], 'id': 291, 'def': 'any material used for its color', 'name': 'coloring_material'}, {'frequency': 'r', 'synset': 'combination_lock.n.01', 'synonyms': ['combination_lock'], 'id': 292, 'def': 'lock that can be opened only by turning dials in a special sequence', 'name': 'combination_lock'}, {'frequency': 'c', 'synset': 'comforter.n.04', 'synonyms': ['pacifier', 'teething_ring'], 'id': 293, 'def': 'device used for an infant to suck or bite on', 'name': 'pacifier'}, {'frequency': 'r', 'synset': 'comic_book.n.01', 'synonyms': ['comic_book'], 'id': 294, 'def': 'a magazine devoted to comic strips', 'name': 'comic_book'}, {'frequency': 'r', 'synset': 'compass.n.01', 'synonyms': ['compass'], 'id': 295, 'def': 'navigational instrument for finding directions', 'name': 'compass'}, {'frequency': 'f', 'synset': 'computer_keyboard.n.01', 'synonyms': ['computer_keyboard', 'keyboard_(computer)'], 'id': 296, 'def': 'a keyboard that is a data input device for computers', 'name': 'computer_keyboard'}, {'frequency': 'f', 'synset': 'condiment.n.01', 'synonyms': ['condiment'], 'id': 297, 'def': 'a preparation (a sauce or relish or spice) to enhance flavor or enjoyment', 'name': 'condiment'}, {'frequency': 'f', 'synset': 'cone.n.01', 'synonyms': ['cone', 'traffic_cone'], 'id': 298, 'def': 'a cone-shaped object used to direct traffic', 'name': 'cone'}, {'frequency': 'f', 'synset': 'control.n.09', 'synonyms': ['control', 'controller'], 'id': 299, 'def': 'a mechanism that controls the operation of a machine', 'name': 'control'}, {'frequency': 'r', 'synset': 'convertible.n.01', 'synonyms': ['convertible_(automobile)'], 'id': 300, 'def': 'a car that has top that can be folded or removed', 'name': 'convertible_(automobile)'}, {'frequency': 'r', 'synset': 'convertible.n.03', 'synonyms': ['sofa_bed'], 'id': 301, 'def': 'a sofa that can be converted into a bed', 'name': 'sofa_bed'}, {'frequency': 'r', 'synset': 'cooker.n.01', 'synonyms': ['cooker'], 'id': 302, 'def': 'a utensil for cooking', 'name': 'cooker'}, {'frequency': 'f', 'synset': 'cookie.n.01', 'synonyms': ['cookie', 'cooky', 'biscuit_(cookie)'], 'id': 303, 'def': "any of various small flat sweet cakes (`biscuit' is the British term)", 'name': 'cookie'}, {'frequency': 'r', 'synset': 'cooking_utensil.n.01', 'synonyms': ['cooking_utensil'], 'id': 304, 'def': 'a kitchen utensil made of material that does not melt easily; used for cooking', 'name': 'cooking_utensil'}, {'frequency': 'f', 'synset': 'cooler.n.01', 'synonyms': ['cooler_(for_food)', 'ice_chest'], 'id': 305, 'def': 'an insulated box for storing food often with ice', 'name': 'cooler_(for_food)'}, {'frequency': 'f', 'synset': 'cork.n.04', 'synonyms': ['cork_(bottle_plug)', 'bottle_cork'], 'id': 306, 'def': 'the plug in the mouth of a bottle (especially a wine bottle)', 'name': 'cork_(bottle_plug)'}, {'frequency': 'r', 'synset': 'corkboard.n.01', 'synonyms': ['corkboard'], 'id': 307, 'def': 'a sheet consisting of cork granules', 'name': 'corkboard'}, {'frequency': 'c', 'synset': 'corkscrew.n.01', 'synonyms': ['corkscrew', 'bottle_screw'], 'id': 308, 'def': 'a bottle opener that pulls corks', 'name': 'corkscrew'}, {'frequency': 'f', 'synset': 'corn.n.03', 'synonyms': ['edible_corn', 'corn', 'maize'], 'id': 309, 'def': 'ears or kernels of corn that can be prepared and served for human food (only mark individual ears or kernels)', 'name': 'edible_corn'}, {'frequency': 'r', 'synset': 'cornbread.n.01', 'synonyms': ['cornbread'], 'id': 310, 'def': 'bread made primarily of cornmeal', 'name': 'cornbread'}, {'frequency': 'c', 'synset': 'cornet.n.01', 'synonyms': ['cornet', 'horn', 'trumpet'], 'id': 311, 'def': 'a brass musical instrument with a narrow tube and a flared bell and many valves', 'name': 'cornet'}, {'frequency': 'c', 'synset': 'cornice.n.01', 'synonyms': ['cornice', 'valance', 'valance_board', 'pelmet'], 'id': 312, 'def': 'a decorative framework to conceal curtain fixtures at the top of a window casing', 'name': 'cornice'}, {'frequency': 'r', 'synset': 'cornmeal.n.01', 'synonyms': ['cornmeal'], 'id': 313, 'def': 'coarsely ground corn', 'name': 'cornmeal'}, {'frequency': 'c', 'synset': 'corset.n.01', 'synonyms': ['corset', 'girdle'], 'id': 314, 'def': "a woman's close-fitting foundation garment", 'name': 'corset'}, {'frequency': 'c', 'synset': 'costume.n.04', 'synonyms': ['costume'], 'id': 315, 'def': 'the attire characteristic of a country or a time or a social class', 'name': 'costume'}, {'frequency': 'r', 'synset': 'cougar.n.01', 'synonyms': ['cougar', 'puma', 'catamount', 'mountain_lion', 'panther'], 'id': 316, 'def': 'large American feline resembling a lion', 'name': 'cougar'}, {'frequency': 'r', 'synset': 'coverall.n.01', 'synonyms': ['coverall'], 'id': 317, 'def': 'a loose-fitting protective garment that is worn over other clothing', 'name': 'coverall'}, {'frequency': 'c', 'synset': 'cowbell.n.01', 'synonyms': ['cowbell'], 'id': 318, 'def': 'a bell hung around the neck of cow so that the cow can be easily located', 'name': 'cowbell'}, {'frequency': 'f', 'synset': 'cowboy_hat.n.01', 'synonyms': ['cowboy_hat', 'ten-gallon_hat'], 'id': 319, 'def': 'a hat with a wide brim and a soft crown; worn by American ranch hands', 'name': 'cowboy_hat'}, {'frequency': 'c', 'synset': 'crab.n.01', 'synonyms': ['crab_(animal)'], 'id': 320, 'def': 'decapod having eyes on short stalks and a broad flattened shell and pincers', 'name': 'crab_(animal)'}, {'frequency': 'r', 'synset': 'crab.n.05', 'synonyms': ['crabmeat'], 'id': 321, 'def': 'the edible flesh of any of various crabs', 'name': 'crabmeat'}, {'frequency': 'c', 'synset': 'cracker.n.01', 'synonyms': ['cracker'], 'id': 322, 'def': 'a thin crisp wafer', 'name': 'cracker'}, {'frequency': 'r', 'synset': 'crape.n.01', 'synonyms': ['crape', 'crepe', 'French_pancake'], 'id': 323, 'def': 'small very thin pancake', 'name': 'crape'}, {'frequency': 'f', 'synset': 'crate.n.01', 'synonyms': ['crate'], 'id': 324, 'def': 'a rugged box (usually made of wood); used for shipping', 'name': 'crate'}, {'frequency': 'c', 'synset': 'crayon.n.01', 'synonyms': ['crayon', 'wax_crayon'], 'id': 325, 'def': 'writing or drawing implement made of a colored stick of composition wax', 'name': 'crayon'}, {'frequency': 'r', 'synset': 'cream_pitcher.n.01', 'synonyms': ['cream_pitcher'], 'id': 326, 'def': 'a small pitcher for serving cream', 'name': 'cream_pitcher'}, {'frequency': 'c', 'synset': 'crescent_roll.n.01', 'synonyms': ['crescent_roll', 'croissant'], 'id': 327, 'def': 'very rich flaky crescent-shaped roll', 'name': 'crescent_roll'}, {'frequency': 'c', 'synset': 'crib.n.01', 'synonyms': ['crib', 'cot'], 'id': 328, 'def': 'baby bed with high sides made of slats', 'name': 'crib'}, {'frequency': 'c', 'synset': 'crock.n.03', 'synonyms': ['crock_pot', 'earthenware_jar'], 'id': 329, 'def': 'an earthen jar (made of baked clay) or a modern electric crockpot', 'name': 'crock_pot'}, {'frequency': 'f', 'synset': 'crossbar.n.01', 'synonyms': ['crossbar'], 'id': 330, 'def': 'a horizontal bar that goes across something', 'name': 'crossbar'}, {'frequency': 'r', 'synset': 'crouton.n.01', 'synonyms': ['crouton'], 'id': 331, 'def': 'a small piece of toasted or fried bread; served in soup or salads', 'name': 'crouton'}, {'frequency': 'c', 'synset': 'crow.n.01', 'synonyms': ['crow'], 'id': 332, 'def': 'black birds having a raucous call', 'name': 'crow'}, {'frequency': 'r', 'synset': 'crowbar.n.01', 'synonyms': ['crowbar', 'wrecking_bar', 'pry_bar'], 'id': 333, 'def': 'a heavy iron lever with one end forged into a wedge', 'name': 'crowbar'}, {'frequency': 'c', 'synset': 'crown.n.04', 'synonyms': ['crown'], 'id': 334, 'def': 'an ornamental jeweled headdress signifying sovereignty', 'name': 'crown'}, {'frequency': 'c', 'synset': 'crucifix.n.01', 'synonyms': ['crucifix'], 'id': 335, 'def': 'representation of the cross on which Jesus died', 'name': 'crucifix'}, {'frequency': 'c', 'synset': 'cruise_ship.n.01', 'synonyms': ['cruise_ship', 'cruise_liner'], 'id': 336, 'def': 'a passenger ship used commercially for pleasure cruises', 'name': 'cruise_ship'}, {'frequency': 'c', 'synset': 'cruiser.n.01', 'synonyms': ['police_cruiser', 'patrol_car', 'police_car', 'squad_car'], 'id': 337, 'def': 'a car in which policemen cruise the streets', 'name': 'police_cruiser'}, {'frequency': 'f', 'synset': 'crumb.n.03', 'synonyms': ['crumb'], 'id': 338, 'def': 'small piece of e.g. bread or cake', 'name': 'crumb'}, {'frequency': 'c', 'synset': 'crutch.n.01', 'synonyms': ['crutch'], 'id': 339, 'def': 'a wooden or metal staff that fits under the armpit and reaches to the ground', 'name': 'crutch'}, {'frequency': 'c', 'synset': 'cub.n.03', 'synonyms': ['cub_(animal)'], 'id': 340, 'def': 'the young of certain carnivorous mammals such as the bear or wolf or lion', 'name': 'cub_(animal)'}, {'frequency': 'c', 'synset': 'cube.n.05', 'synonyms': ['cube', 'square_block'], 'id': 341, 'def': 'a block in the (approximate) shape of a cube', 'name': 'cube'}, {'frequency': 'f', 'synset': 'cucumber.n.02', 'synonyms': ['cucumber', 'cuke'], 'id': 342, 'def': 'cylindrical green fruit with thin green rind and white flesh eaten as a vegetable', 'name': 'cucumber'}, {'frequency': 'c', 'synset': 'cufflink.n.01', 'synonyms': ['cufflink'], 'id': 343, 'def': 'jewelry consisting of linked buttons used to fasten the cuffs of a shirt', 'name': 'cufflink'}, {'frequency': 'f', 'synset': 'cup.n.01', 'synonyms': ['cup'], 'id': 344, 'def': 'a small open container usually used for drinking; usually has a handle', 'name': 'cup'}, {'frequency': 'c', 'synset': 'cup.n.08', 'synonyms': ['trophy_cup'], 'id': 345, 'def': 'a metal award or cup-shaped vessel with handles that is awarded as a trophy to a competition winner', 'name': 'trophy_cup'}, {'frequency': 'f', 'synset': 'cupboard.n.01', 'synonyms': ['cupboard', 'closet'], 'id': 346, 'def': 'a small room (or recess) or cabinet used for storage space', 'name': 'cupboard'}, {'frequency': 'f', 'synset': 'cupcake.n.01', 'synonyms': ['cupcake'], 'id': 347, 'def': 'small cake baked in a muffin tin', 'name': 'cupcake'}, {'frequency': 'r', 'synset': 'curler.n.01', 'synonyms': ['hair_curler', 'hair_roller', 'hair_crimper'], 'id': 348, 'def': 'a cylindrical tube around which the hair is wound to curl it', 'name': 'hair_curler'}, {'frequency': 'r', 'synset': 'curling_iron.n.01', 'synonyms': ['curling_iron'], 'id': 349, 'def': 'a cylindrical home appliance that heats hair that has been curled around it', 'name': 'curling_iron'}, {'frequency': 'f', 'synset': 'curtain.n.01', 'synonyms': ['curtain', 'drapery'], 'id': 350, 'def': 'hanging cloth used as a blind (especially for a window)', 'name': 'curtain'}, {'frequency': 'f', 'synset': 'cushion.n.03', 'synonyms': ['cushion'], 'id': 351, 'def': 'a soft bag filled with air or padding such as feathers or foam rubber', 'name': 'cushion'}, {'frequency': 'r', 'synset': 'cylinder.n.04', 'synonyms': ['cylinder'], 'id': 352, 'def': 'a cylindrical container', 'name': 'cylinder'}, {'frequency': 'r', 'synset': 'cymbal.n.01', 'synonyms': ['cymbal'], 'id': 353, 'def': 'a percussion instrument consisting of a concave brass disk', 'name': 'cymbal'}, {'frequency': 'r', 'synset': 'dagger.n.01', 'synonyms': ['dagger'], 'id': 354, 'def': 'a short knife with a pointed blade used for piercing or stabbing', 'name': 'dagger'}, {'frequency': 'r', 'synset': 'dalmatian.n.02', 'synonyms': ['dalmatian'], 'id': 355, 'def': 'a large breed having a smooth white coat with black or brown spots', 'name': 'dalmatian'}, {'frequency': 'c', 'synset': 'dartboard.n.01', 'synonyms': ['dartboard'], 'id': 356, 'def': 'a circular board of wood or cork used as the target in the game of darts', 'name': 'dartboard'}, {'frequency': 'r', 'synset': 'date.n.08', 'synonyms': ['date_(fruit)'], 'id': 357, 'def': 'sweet edible fruit of the date palm with a single long woody seed', 'name': 'date_(fruit)'}, {'frequency': 'f', 'synset': 'deck_chair.n.01', 'synonyms': ['deck_chair', 'beach_chair'], 'id': 358, 'def': 'a folding chair for use outdoors; a wooden frame supports a length of canvas', 'name': 'deck_chair'}, {'frequency': 'c', 'synset': 'deer.n.01', 'synonyms': ['deer', 'cervid'], 'id': 359, 'def': "distinguished from Bovidae by the male's having solid deciduous antlers", 'name': 'deer'}, {'frequency': 'c', 'synset': 'dental_floss.n.01', 'synonyms': ['dental_floss', 'floss'], 'id': 360, 'def': 'a soft thread for cleaning the spaces between the teeth', 'name': 'dental_floss'}, {'frequency': 'f', 'synset': 'desk.n.01', 'synonyms': ['desk'], 'id': 361, 'def': 'a piece of furniture with a writing surface and usually drawers or other compartments', 'name': 'desk'}, {'frequency': 'r', 'synset': 'detergent.n.01', 'synonyms': ['detergent'], 'id': 362, 'def': 'a surface-active chemical widely used in industry and laundering', 'name': 'detergent'}, {'frequency': 'c', 'synset': 'diaper.n.01', 'synonyms': ['diaper'], 'id': 363, 'def': 'garment consisting of a folded cloth drawn up between the legs and fastened at the waist', 'name': 'diaper'}, {'frequency': 'r', 'synset': 'diary.n.01', 'synonyms': ['diary', 'journal'], 'id': 364, 'def': 'yearly planner book', 'name': 'diary'}, {'frequency': 'r', 'synset': 'die.n.01', 'synonyms': ['die', 'dice'], 'id': 365, 'def': 'a small cube with 1 to 6 spots on the six faces; used in gambling', 'name': 'die'}, {'frequency': 'r', 'synset': 'dinghy.n.01', 'synonyms': ['dinghy', 'dory', 'rowboat'], 'id': 366, 'def': 'a small boat of shallow draft with seats and oars with which it is propelled', 'name': 'dinghy'}, {'frequency': 'f', 'synset': 'dining_table.n.01', 'synonyms': ['dining_table'], 'id': 367, 'def': 'a table at which meals are served', 'name': 'dining_table'}, {'frequency': 'r', 'synset': 'dinner_jacket.n.01', 'synonyms': ['tux', 'tuxedo'], 'id': 368, 'def': 'semiformal evening dress for men', 'name': 'tux'}, {'frequency': 'f', 'synset': 'dish.n.01', 'synonyms': ['dish'], 'id': 369, 'def': 'a piece of dishware normally used as a container for holding or serving food', 'name': 'dish'}, {'frequency': 'c', 'synset': 'dish.n.05', 'synonyms': ['dish_antenna'], 'id': 370, 'def': 'directional antenna consisting of a parabolic reflector', 'name': 'dish_antenna'}, {'frequency': 'c', 'synset': 'dishrag.n.01', 'synonyms': ['dishrag', 'dishcloth'], 'id': 371, 'def': 'a cloth for washing dishes or cleaning in general', 'name': 'dishrag'}, {'frequency': 'f', 'synset': 'dishtowel.n.01', 'synonyms': ['dishtowel', 'tea_towel'], 'id': 372, 'def': 'a towel for drying dishes', 'name': 'dishtowel'}, {'frequency': 'f', 'synset': 'dishwasher.n.01', 'synonyms': ['dishwasher', 'dishwashing_machine'], 'id': 373, 'def': 'a machine for washing dishes', 'name': 'dishwasher'}, {'frequency': 'r', 'synset': 'dishwasher_detergent.n.01', 'synonyms': ['dishwasher_detergent', 'dishwashing_detergent', 'dishwashing_liquid', 'dishsoap'], 'id': 374, 'def': 'dishsoap or dish detergent designed for use in dishwashers', 'name': 'dishwasher_detergent'}, {'frequency': 'f', 'synset': 'dispenser.n.01', 'synonyms': ['dispenser'], 'id': 375, 'def': 'a container so designed that the contents can be used in prescribed amounts', 'name': 'dispenser'}, {'frequency': 'r', 'synset': 'diving_board.n.01', 'synonyms': ['diving_board'], 'id': 376, 'def': 'a springboard from which swimmers can dive', 'name': 'diving_board'}, {'frequency': 'f', 'synset': 'dixie_cup.n.01', 'synonyms': ['Dixie_cup', 'paper_cup'], 'id': 377, 'def': 'a disposable cup made of paper; for holding drinks', 'name': 'Dixie_cup'}, {'frequency': 'f', 'synset': 'dog.n.01', 'synonyms': ['dog'], 'id': 378, 'def': 'a common domesticated dog', 'name': 'dog'}, {'frequency': 'f', 'synset': 'dog_collar.n.01', 'synonyms': ['dog_collar'], 'id': 379, 'def': 'a collar for a dog', 'name': 'dog_collar'}, {'frequency': 'f', 'synset': 'doll.n.01', 'synonyms': ['doll'], 'id': 380, 'def': 'a toy replica of a HUMAN (NOT AN ANIMAL)', 'name': 'doll'}, {'frequency': 'r', 'synset': 'dollar.n.02', 'synonyms': ['dollar', 'dollar_bill', 'one_dollar_bill'], 'id': 381, 'def': 'a piece of paper money worth one dollar', 'name': 'dollar'}, {'frequency': 'r', 'synset': 'dollhouse.n.01', 'synonyms': ['dollhouse', "doll's_house"], 'id': 382, 'def': "a house so small that it is likened to a child's plaything", 'name': 'dollhouse'}, {'frequency': 'c', 'synset': 'dolphin.n.02', 'synonyms': ['dolphin'], 'id': 383, 'def': 'any of various small toothed whales with a beaklike snout; larger than porpoises', 'name': 'dolphin'}, {'frequency': 'c', 'synset': 'domestic_ass.n.01', 'synonyms': ['domestic_ass', 'donkey'], 'id': 384, 'def': 'domestic beast of burden descended from the African wild ass; patient but stubborn', 'name': 'domestic_ass'}, {'frequency': 'f', 'synset': 'doorknob.n.01', 'synonyms': ['doorknob', 'doorhandle'], 'id': 385, 'def': "a knob used to open a door (often called `doorhandle' in Great Britain)", 'name': 'doorknob'}, {'frequency': 'c', 'synset': 'doormat.n.02', 'synonyms': ['doormat', 'welcome_mat'], 'id': 386, 'def': 'a mat placed outside an exterior door for wiping the shoes before entering', 'name': 'doormat'}, {'frequency': 'f', 'synset': 'doughnut.n.02', 'synonyms': ['doughnut', 'donut'], 'id': 387, 'def': 'a small ring-shaped friedcake', 'name': 'doughnut'}, {'frequency': 'r', 'synset': 'dove.n.01', 'synonyms': ['dove'], 'id': 388, 'def': 'any of numerous small pigeons', 'name': 'dove'}, {'frequency': 'r', 'synset': 'dragonfly.n.01', 'synonyms': ['dragonfly'], 'id': 389, 'def': 'slender-bodied non-stinging insect having iridescent wings that are outspread at rest', 'name': 'dragonfly'}, {'frequency': 'f', 'synset': 'drawer.n.01', 'synonyms': ['drawer'], 'id': 390, 'def': 'a boxlike container in a piece of furniture; made so as to slide in and out', 'name': 'drawer'}, {'frequency': 'c', 'synset': 'drawers.n.01', 'synonyms': ['underdrawers', 'boxers', 'boxershorts'], 'id': 391, 'def': 'underpants worn by men', 'name': 'underdrawers'}, {'frequency': 'f', 'synset': 'dress.n.01', 'synonyms': ['dress', 'frock'], 'id': 392, 'def': 'a one-piece garment for a woman; has skirt and bodice', 'name': 'dress'}, {'frequency': 'c', 'synset': 'dress_hat.n.01', 'synonyms': ['dress_hat', 'high_hat', 'opera_hat', 'silk_hat', 'top_hat'], 'id': 393, 'def': "a man's hat with a tall crown; usually covered with silk or with beaver fur", 'name': 'dress_hat'}, {'frequency': 'f', 'synset': 'dress_suit.n.01', 'synonyms': ['dress_suit'], 'id': 394, 'def': 'formalwear consisting of full evening dress for men', 'name': 'dress_suit'}, {'frequency': 'f', 'synset': 'dresser.n.05', 'synonyms': ['dresser'], 'id': 395, 'def': 'a cabinet with shelves', 'name': 'dresser'}, {'frequency': 'c', 'synset': 'drill.n.01', 'synonyms': ['drill'], 'id': 396, 'def': 'a tool with a sharp rotating point for making holes in hard materials', 'name': 'drill'}, {'frequency': 'r', 'synset': 'drone.n.04', 'synonyms': ['drone'], 'id': 397, 'def': 'an aircraft without a pilot that is operated by remote control', 'name': 'drone'}, {'frequency': 'r', 'synset': 'dropper.n.01', 'synonyms': ['dropper', 'eye_dropper'], 'id': 398, 'def': 'pipet consisting of a small tube with a vacuum bulb at one end for drawing liquid in and releasing it a drop at a time', 'name': 'dropper'}, {'frequency': 'c', 'synset': 'drum.n.01', 'synonyms': ['drum_(musical_instrument)'], 'id': 399, 'def': 'a musical percussion instrument; usually consists of a hollow cylinder with a membrane stretched across each end', 'name': 'drum_(musical_instrument)'}, {'frequency': 'r', 'synset': 'drumstick.n.02', 'synonyms': ['drumstick'], 'id': 400, 'def': 'a stick used for playing a drum', 'name': 'drumstick'}, {'frequency': 'f', 'synset': 'duck.n.01', 'synonyms': ['duck'], 'id': 401, 'def': 'small web-footed broad-billed swimming bird', 'name': 'duck'}, {'frequency': 'c', 'synset': 'duckling.n.02', 'synonyms': ['duckling'], 'id': 402, 'def': 'young duck', 'name': 'duckling'}, {'frequency': 'c', 'synset': 'duct_tape.n.01', 'synonyms': ['duct_tape'], 'id': 403, 'def': 'a wide silvery adhesive tape', 'name': 'duct_tape'}, {'frequency': 'f', 'synset': 'duffel_bag.n.01', 'synonyms': ['duffel_bag', 'duffle_bag', 'duffel', 'duffle'], 'id': 404, 'def': 'a large cylindrical bag of heavy cloth (does not include suitcases)', 'name': 'duffel_bag'}, {'frequency': 'r', 'synset': 'dumbbell.n.01', 'synonyms': ['dumbbell'], 'id': 405, 'def': 'an exercising weight with two ball-like ends connected by a short handle', 'name': 'dumbbell'}, {'frequency': 'c', 'synset': 'dumpster.n.01', 'synonyms': ['dumpster'], 'id': 406, 'def': 'a container designed to receive and transport and dump waste', 'name': 'dumpster'}, {'frequency': 'r', 'synset': 'dustpan.n.02', 'synonyms': ['dustpan'], 'id': 407, 'def': 'a short-handled receptacle into which dust can be swept', 'name': 'dustpan'}, {'frequency': 'c', 'synset': 'eagle.n.01', 'synonyms': ['eagle'], 'id': 408, 'def': 'large birds of prey noted for their broad wings and strong soaring flight', 'name': 'eagle'}, {'frequency': 'f', 'synset': 'earphone.n.01', 'synonyms': ['earphone', 'earpiece', 'headphone'], 'id': 409, 'def': 'device for listening to audio that is held over or inserted into the ear', 'name': 'earphone'}, {'frequency': 'r', 'synset': 'earplug.n.01', 'synonyms': ['earplug'], 'id': 410, 'def': 'a soft plug that is inserted into the ear canal to block sound', 'name': 'earplug'}, {'frequency': 'f', 'synset': 'earring.n.01', 'synonyms': ['earring'], 'id': 411, 'def': 'jewelry to ornament the ear', 'name': 'earring'}, {'frequency': 'c', 'synset': 'easel.n.01', 'synonyms': ['easel'], 'id': 412, 'def': "an upright tripod for displaying something (usually an artist's canvas)", 'name': 'easel'}, {'frequency': 'r', 'synset': 'eclair.n.01', 'synonyms': ['eclair'], 'id': 413, 'def': 'oblong cream puff', 'name': 'eclair'}, {'frequency': 'r', 'synset': 'eel.n.01', 'synonyms': ['eel'], 'id': 414, 'def': 'an elongate fish with fatty flesh', 'name': 'eel'}, {'frequency': 'f', 'synset': 'egg.n.02', 'synonyms': ['egg', 'eggs'], 'id': 415, 'def': 'oval reproductive body of a fowl (especially a hen) used as food', 'name': 'egg'}, {'frequency': 'r', 'synset': 'egg_roll.n.01', 'synonyms': ['egg_roll', 'spring_roll'], 'id': 416, 'def': 'minced vegetables and meat wrapped in a pancake and fried', 'name': 'egg_roll'}, {'frequency': 'c', 'synset': 'egg_yolk.n.01', 'synonyms': ['egg_yolk', 'yolk_(egg)'], 'id': 417, 'def': 'the yellow spherical part of an egg', 'name': 'egg_yolk'}, {'frequency': 'c', 'synset': 'eggbeater.n.02', 'synonyms': ['eggbeater', 'eggwhisk'], 'id': 418, 'def': 'a mixer for beating eggs or whipping cream', 'name': 'eggbeater'}, {'frequency': 'c', 'synset': 'eggplant.n.01', 'synonyms': ['eggplant', 'aubergine'], 'id': 419, 'def': 'egg-shaped vegetable having a shiny skin typically dark purple', 'name': 'eggplant'}, {'frequency': 'r', 'synset': 'electric_chair.n.01', 'synonyms': ['electric_chair'], 'id': 420, 'def': 'a chair-shaped instrument of execution by electrocution', 'name': 'electric_chair'}, {'frequency': 'f', 'synset': 'electric_refrigerator.n.01', 'synonyms': ['refrigerator'], 'id': 421, 'def': 'a refrigerator in which the coolant is pumped around by an electric motor', 'name': 'refrigerator'}, {'frequency': 'f', 'synset': 'elephant.n.01', 'synonyms': ['elephant'], 'id': 422, 'def': 'a common elephant', 'name': 'elephant'}, {'frequency': 'c', 'synset': 'elk.n.01', 'synonyms': ['elk', 'moose'], 'id': 423, 'def': 'large northern deer with enormous flattened antlers in the male', 'name': 'elk'}, {'frequency': 'c', 'synset': 'envelope.n.01', 'synonyms': ['envelope'], 'id': 424, 'def': 'a flat (usually rectangular) container for a letter, thin package, etc.', 'name': 'envelope'}, {'frequency': 'c', 'synset': 'eraser.n.01', 'synonyms': ['eraser'], 'id': 425, 'def': 'an implement used to erase something', 'name': 'eraser'}, {'frequency': 'r', 'synset': 'escargot.n.01', 'synonyms': ['escargot'], 'id': 426, 'def': 'edible snail usually served in the shell with a sauce of melted butter and garlic', 'name': 'escargot'}, {'frequency': 'r', 'synset': 'eyepatch.n.01', 'synonyms': ['eyepatch'], 'id': 427, 'def': 'a protective cloth covering for an injured eye', 'name': 'eyepatch'}, {'frequency': 'r', 'synset': 'falcon.n.01', 'synonyms': ['falcon'], 'id': 428, 'def': 'birds of prey having long pointed powerful wings adapted for swift flight', 'name': 'falcon'}, {'frequency': 'f', 'synset': 'fan.n.01', 'synonyms': ['fan'], 'id': 429, 'def': 'a device for creating a current of air by movement of a surface or surfaces', 'name': 'fan'}, {'frequency': 'f', 'synset': 'faucet.n.01', 'synonyms': ['faucet', 'spigot', 'tap'], 'id': 430, 'def': 'a regulator for controlling the flow of a liquid from a reservoir', 'name': 'faucet'}, {'frequency': 'r', 'synset': 'fedora.n.01', 'synonyms': ['fedora'], 'id': 431, 'def': 'a hat made of felt with a creased crown', 'name': 'fedora'}, {'frequency': 'r', 'synset': 'ferret.n.02', 'synonyms': ['ferret'], 'id': 432, 'def': 'domesticated albino variety of the European polecat bred for hunting rats and rabbits', 'name': 'ferret'}, {'frequency': 'c', 'synset': 'ferris_wheel.n.01', 'synonyms': ['Ferris_wheel'], 'id': 433, 'def': 'a large wheel with suspended seats that remain upright as the wheel rotates', 'name': 'Ferris_wheel'}, {'frequency': 'c', 'synset': 'ferry.n.01', 'synonyms': ['ferry', 'ferryboat'], 'id': 434, 'def': 'a boat that transports people or vehicles across a body of water and operates on a regular schedule', 'name': 'ferry'}, {'frequency': 'r', 'synset': 'fig.n.04', 'synonyms': ['fig_(fruit)'], 'id': 435, 'def': 'fleshy sweet pear-shaped yellowish or purple fruit eaten fresh or preserved or dried', 'name': 'fig_(fruit)'}, {'frequency': 'c', 'synset': 'fighter.n.02', 'synonyms': ['fighter_jet', 'fighter_aircraft', 'attack_aircraft'], 'id': 436, 'def': 'a high-speed military or naval airplane designed to destroy enemy targets', 'name': 'fighter_jet'}, {'frequency': 'f', 'synset': 'figurine.n.01', 'synonyms': ['figurine'], 'id': 437, 'def': 'a small carved or molded figure', 'name': 'figurine'}, {'frequency': 'c', 'synset': 'file.n.03', 'synonyms': ['file_cabinet', 'filing_cabinet'], 'id': 438, 'def': 'office furniture consisting of a container for keeping papers in order', 'name': 'file_cabinet'}, {'frequency': 'r', 'synset': 'file.n.04', 'synonyms': ['file_(tool)'], 'id': 439, 'def': 'a steel hand tool with small sharp teeth on some or all of its surfaces; used for smoothing wood or metal', 'name': 'file_(tool)'}, {'frequency': 'f', 'synset': 'fire_alarm.n.02', 'synonyms': ['fire_alarm', 'smoke_alarm'], 'id': 440, 'def': 'an alarm that is tripped off by fire or smoke', 'name': 'fire_alarm'}, {'frequency': 'f', 'synset': 'fire_engine.n.01', 'synonyms': ['fire_engine', 'fire_truck'], 'id': 441, 'def': 'large trucks that carry firefighters and equipment to the site of a fire', 'name': 'fire_engine'}, {'frequency': 'f', 'synset': 'fire_extinguisher.n.01', 'synonyms': ['fire_extinguisher', 'extinguisher'], 'id': 442, 'def': 'a manually operated device for extinguishing small fires', 'name': 'fire_extinguisher'}, {'frequency': 'c', 'synset': 'fire_hose.n.01', 'synonyms': ['fire_hose'], 'id': 443, 'def': 'a large hose that carries water from a fire hydrant to the site of the fire', 'name': 'fire_hose'}, {'frequency': 'f', 'synset': 'fireplace.n.01', 'synonyms': ['fireplace'], 'id': 444, 'def': 'an open recess in a wall at the base of a chimney where a fire can be built', 'name': 'fireplace'}, {'frequency': 'f', 'synset': 'fireplug.n.01', 'synonyms': ['fireplug', 'fire_hydrant', 'hydrant'], 'id': 445, 'def': 'an upright hydrant for drawing water to use in fighting a fire', 'name': 'fireplug'}, {'frequency': 'r', 'synset': 'first-aid_kit.n.01', 'synonyms': ['first-aid_kit'], 'id': 446, 'def': 'kit consisting of a set of bandages and medicines for giving first aid', 'name': 'first-aid_kit'}, {'frequency': 'f', 'synset': 'fish.n.01', 'synonyms': ['fish'], 'id': 447, 'def': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills', 'name': 'fish'}, {'frequency': 'c', 'synset': 'fish.n.02', 'synonyms': ['fish_(food)'], 'id': 448, 'def': 'the flesh of fish used as food', 'name': 'fish_(food)'}, {'frequency': 'r', 'synset': 'fishbowl.n.02', 'synonyms': ['fishbowl', 'goldfish_bowl'], 'id': 449, 'def': 'a transparent bowl in which small fish are kept', 'name': 'fishbowl'}, {'frequency': 'c', 'synset': 'fishing_rod.n.01', 'synonyms': ['fishing_rod', 'fishing_pole'], 'id': 450, 'def': 'a rod that is used in fishing to extend the fishing line', 'name': 'fishing_rod'}, {'frequency': 'f', 'synset': 'flag.n.01', 'synonyms': ['flag'], 'id': 451, 'def': 'emblem usually consisting of a rectangular piece of cloth of distinctive design (do not include pole)', 'name': 'flag'}, {'frequency': 'f', 'synset': 'flagpole.n.02', 'synonyms': ['flagpole', 'flagstaff'], 'id': 452, 'def': 'a tall staff or pole on which a flag is raised', 'name': 'flagpole'}, {'frequency': 'c', 'synset': 'flamingo.n.01', 'synonyms': ['flamingo'], 'id': 453, 'def': 'large pink web-footed bird with down-bent bill', 'name': 'flamingo'}, {'frequency': 'c', 'synset': 'flannel.n.01', 'synonyms': ['flannel'], 'id': 454, 'def': 'a soft light woolen fabric; used for clothing', 'name': 'flannel'}, {'frequency': 'c', 'synset': 'flap.n.01', 'synonyms': ['flap'], 'id': 455, 'def': 'any broad thin covering attached at one edge, such as a mud flap next to a wheel or a flap on an airplane wing', 'name': 'flap'}, {'frequency': 'r', 'synset': 'flash.n.10', 'synonyms': ['flash', 'flashbulb'], 'id': 456, 'def': 'a lamp for providing momentary light to take a photograph', 'name': 'flash'}, {'frequency': 'c', 'synset': 'flashlight.n.01', 'synonyms': ['flashlight', 'torch'], 'id': 457, 'def': 'a small portable battery-powered electric lamp', 'name': 'flashlight'}, {'frequency': 'r', 'synset': 'fleece.n.03', 'synonyms': ['fleece'], 'id': 458, 'def': 'a soft bulky fabric with deep pile; used chiefly for clothing', 'name': 'fleece'}, {'frequency': 'f', 'synset': 'flip-flop.n.02', 'synonyms': ['flip-flop_(sandal)'], 'id': 459, 'def': 'a backless sandal held to the foot by a thong between two toes', 'name': 'flip-flop_(sandal)'}, {'frequency': 'c', 'synset': 'flipper.n.01', 'synonyms': ['flipper_(footwear)', 'fin_(footwear)'], 'id': 460, 'def': 'a shoe to aid a person in swimming', 'name': 'flipper_(footwear)'}, {'frequency': 'f', 'synset': 'flower_arrangement.n.01', 'synonyms': ['flower_arrangement', 'floral_arrangement'], 'id': 461, 'def': 'a decorative arrangement of flowers', 'name': 'flower_arrangement'}, {'frequency': 'c', 'synset': 'flute.n.02', 'synonyms': ['flute_glass', 'champagne_flute'], 'id': 462, 'def': 'a tall narrow wineglass', 'name': 'flute_glass'}, {'frequency': 'c', 'synset': 'foal.n.01', 'synonyms': ['foal'], 'id': 463, 'def': 'a young horse', 'name': 'foal'}, {'frequency': 'c', 'synset': 'folding_chair.n.01', 'synonyms': ['folding_chair'], 'id': 464, 'def': 'a chair that can be folded flat for storage', 'name': 'folding_chair'}, {'frequency': 'c', 'synset': 'food_processor.n.01', 'synonyms': ['food_processor'], 'id': 465, 'def': 'a kitchen appliance for shredding, blending, chopping, or slicing food', 'name': 'food_processor'}, {'frequency': 'c', 'synset': 'football.n.02', 'synonyms': ['football_(American)'], 'id': 466, 'def': 'the inflated oblong ball used in playing American football', 'name': 'football_(American)'}, {'frequency': 'r', 'synset': 'football_helmet.n.01', 'synonyms': ['football_helmet'], 'id': 467, 'def': 'a padded helmet with a face mask to protect the head of football players', 'name': 'football_helmet'}, {'frequency': 'c', 'synset': 'footstool.n.01', 'synonyms': ['footstool', 'footrest'], 'id': 468, 'def': 'a low seat or a stool to rest the feet of a seated person', 'name': 'footstool'}, {'frequency': 'f', 'synset': 'fork.n.01', 'synonyms': ['fork'], 'id': 469, 'def': 'cutlery used for serving and eating food', 'name': 'fork'}, {'frequency': 'c', 'synset': 'forklift.n.01', 'synonyms': ['forklift'], 'id': 470, 'def': 'an industrial vehicle with a power operated fork in front that can be inserted under loads to lift and move them', 'name': 'forklift'}, {'frequency': 'c', 'synset': 'freight_car.n.01', 'synonyms': ['freight_car'], 'id': 471, 'def': 'a railway car that carries freight', 'name': 'freight_car'}, {'frequency': 'c', 'synset': 'french_toast.n.01', 'synonyms': ['French_toast'], 'id': 472, 'def': 'bread slice dipped in egg and milk and fried', 'name': 'French_toast'}, {'frequency': 'c', 'synset': 'freshener.n.01', 'synonyms': ['freshener', 'air_freshener'], 'id': 473, 'def': 'anything that freshens air by removing or covering odor', 'name': 'freshener'}, {'frequency': 'f', 'synset': 'frisbee.n.01', 'synonyms': ['frisbee'], 'id': 474, 'def': 'a light, plastic disk propelled with a flip of the wrist for recreation or competition', 'name': 'frisbee'}, {'frequency': 'c', 'synset': 'frog.n.01', 'synonyms': ['frog', 'toad', 'toad_frog'], 'id': 475, 'def': 'a tailless stout-bodied amphibians with long hind limbs for leaping', 'name': 'frog'}, {'frequency': 'c', 'synset': 'fruit_juice.n.01', 'synonyms': ['fruit_juice'], 'id': 476, 'def': 'drink produced by squeezing or crushing fruit', 'name': 'fruit_juice'}, {'frequency': 'f', 'synset': 'frying_pan.n.01', 'synonyms': ['frying_pan', 'frypan', 'skillet'], 'id': 477, 'def': 'a pan used for frying foods', 'name': 'frying_pan'}, {'frequency': 'r', 'synset': 'fudge.n.01', 'synonyms': ['fudge'], 'id': 478, 'def': 'soft creamy candy', 'name': 'fudge'}, {'frequency': 'r', 'synset': 'funnel.n.02', 'synonyms': ['funnel'], 'id': 479, 'def': 'a cone-shaped utensil used to channel a substance into a container with a small mouth', 'name': 'funnel'}, {'frequency': 'r', 'synset': 'futon.n.01', 'synonyms': ['futon'], 'id': 480, 'def': 'a pad that is used for sleeping on the floor or on a raised frame', 'name': 'futon'}, {'frequency': 'r', 'synset': 'gag.n.02', 'synonyms': ['gag', 'muzzle'], 'id': 481, 'def': "restraint put into a person's mouth to prevent speaking or shouting", 'name': 'gag'}, {'frequency': 'r', 'synset': 'garbage.n.03', 'synonyms': ['garbage'], 'id': 482, 'def': 'a receptacle where waste can be discarded', 'name': 'garbage'}, {'frequency': 'c', 'synset': 'garbage_truck.n.01', 'synonyms': ['garbage_truck'], 'id': 483, 'def': 'a truck for collecting domestic refuse', 'name': 'garbage_truck'}, {'frequency': 'c', 'synset': 'garden_hose.n.01', 'synonyms': ['garden_hose'], 'id': 484, 'def': 'a hose used for watering a lawn or garden', 'name': 'garden_hose'}, {'frequency': 'c', 'synset': 'gargle.n.01', 'synonyms': ['gargle', 'mouthwash'], 'id': 485, 'def': 'a medicated solution used for gargling and rinsing the mouth', 'name': 'gargle'}, {'frequency': 'r', 'synset': 'gargoyle.n.02', 'synonyms': ['gargoyle'], 'id': 486, 'def': 'an ornament consisting of a grotesquely carved figure of a person or animal', 'name': 'gargoyle'}, {'frequency': 'c', 'synset': 'garlic.n.02', 'synonyms': ['garlic', 'ail'], 'id': 487, 'def': 'aromatic bulb used as seasoning', 'name': 'garlic'}, {'frequency': 'r', 'synset': 'gasmask.n.01', 'synonyms': ['gasmask', 'respirator', 'gas_helmet'], 'id': 488, 'def': 'a protective face mask with a filter', 'name': 'gasmask'}, {'frequency': 'c', 'synset': 'gazelle.n.01', 'synonyms': ['gazelle'], 'id': 489, 'def': 'small swift graceful antelope of Africa and Asia having lustrous eyes', 'name': 'gazelle'}, {'frequency': 'c', 'synset': 'gelatin.n.02', 'synonyms': ['gelatin', 'jelly'], 'id': 490, 'def': 'an edible jelly made with gelatin and used as a dessert or salad base or a coating for foods', 'name': 'gelatin'}, {'frequency': 'r', 'synset': 'gem.n.02', 'synonyms': ['gemstone'], 'id': 491, 'def': 'a crystalline rock that can be cut and polished for jewelry', 'name': 'gemstone'}, {'frequency': 'r', 'synset': 'generator.n.02', 'synonyms': ['generator'], 'id': 492, 'def': 'engine that converts mechanical energy into electrical energy by electromagnetic induction', 'name': 'generator'}, {'frequency': 'c', 'synset': 'giant_panda.n.01', 'synonyms': ['giant_panda', 'panda', 'panda_bear'], 'id': 493, 'def': 'large black-and-white herbivorous mammal of bamboo forests of China and Tibet', 'name': 'giant_panda'}, {'frequency': 'c', 'synset': 'gift_wrap.n.01', 'synonyms': ['gift_wrap'], 'id': 494, 'def': 'attractive wrapping paper suitable for wrapping gifts', 'name': 'gift_wrap'}, {'frequency': 'c', 'synset': 'ginger.n.03', 'synonyms': ['ginger', 'gingerroot'], 'id': 495, 'def': 'the root of the common ginger plant; used fresh as a seasoning', 'name': 'ginger'}, {'frequency': 'f', 'synset': 'giraffe.n.01', 'synonyms': ['giraffe'], 'id': 496, 'def': 'tall animal having a spotted coat and small horns and very long neck and legs', 'name': 'giraffe'}, {'frequency': 'c', 'synset': 'girdle.n.02', 'synonyms': ['cincture', 'sash', 'waistband', 'waistcloth'], 'id': 497, 'def': 'a band of material around the waist that strengthens a skirt or trousers', 'name': 'cincture'}, {'frequency': 'f', 'synset': 'glass.n.02', 'synonyms': ['glass_(drink_container)', 'drinking_glass'], 'id': 498, 'def': 'a container for holding liquids while drinking', 'name': 'glass_(drink_container)'}, {'frequency': 'c', 'synset': 'globe.n.03', 'synonyms': ['globe'], 'id': 499, 'def': 'a sphere on which a map (especially of the earth) is represented', 'name': 'globe'}, {'frequency': 'f', 'synset': 'glove.n.02', 'synonyms': ['glove'], 'id': 500, 'def': 'handwear covering the hand', 'name': 'glove'}, {'frequency': 'c', 'synset': 'goat.n.01', 'synonyms': ['goat'], 'id': 501, 'def': 'a common goat', 'name': 'goat'}, {'frequency': 'f', 'synset': 'goggles.n.01', 'synonyms': ['goggles'], 'id': 502, 'def': 'tight-fitting spectacles worn to protect the eyes', 'name': 'goggles'}, {'frequency': 'r', 'synset': 'goldfish.n.01', 'synonyms': ['goldfish'], 'id': 503, 'def': 'small golden or orange-red freshwater fishes used as pond or aquarium pets', 'name': 'goldfish'}, {'frequency': 'c', 'synset': 'golf_club.n.02', 'synonyms': ['golf_club', 'golf-club'], 'id': 504, 'def': 'golf equipment used by a golfer to hit a golf ball', 'name': 'golf_club'}, {'frequency': 'c', 'synset': 'golfcart.n.01', 'synonyms': ['golfcart'], 'id': 505, 'def': 'a small motor vehicle in which golfers can ride between shots', 'name': 'golfcart'}, {'frequency': 'r', 'synset': 'gondola.n.02', 'synonyms': ['gondola_(boat)'], 'id': 506, 'def': 'long narrow flat-bottomed boat propelled by sculling; traditionally used on canals of Venice', 'name': 'gondola_(boat)'}, {'frequency': 'c', 'synset': 'goose.n.01', 'synonyms': ['goose'], 'id': 507, 'def': 'loud, web-footed long-necked aquatic birds usually larger than ducks', 'name': 'goose'}, {'frequency': 'r', 'synset': 'gorilla.n.01', 'synonyms': ['gorilla'], 'id': 508, 'def': 'largest ape', 'name': 'gorilla'}, {'frequency': 'r', 'synset': 'gourd.n.02', 'synonyms': ['gourd'], 'id': 509, 'def': 'any of numerous inedible fruits with hard rinds', 'name': 'gourd'}, {'frequency': 'f', 'synset': 'grape.n.01', 'synonyms': ['grape'], 'id': 510, 'def': 'any of various juicy fruit with green or purple skins; grow in clusters', 'name': 'grape'}, {'frequency': 'c', 'synset': 'grater.n.01', 'synonyms': ['grater'], 'id': 511, 'def': 'utensil with sharp perforations for shredding foods (as vegetables or cheese)', 'name': 'grater'}, {'frequency': 'c', 'synset': 'gravestone.n.01', 'synonyms': ['gravestone', 'headstone', 'tombstone'], 'id': 512, 'def': 'a stone that is used to mark a grave', 'name': 'gravestone'}, {'frequency': 'r', 'synset': 'gravy_boat.n.01', 'synonyms': ['gravy_boat', 'gravy_holder'], 'id': 513, 'def': 'a dish (often boat-shaped) for serving gravy or sauce', 'name': 'gravy_boat'}, {'frequency': 'f', 'synset': 'green_bean.n.02', 'synonyms': ['green_bean'], 'id': 514, 'def': 'a common bean plant cultivated for its slender green edible pods', 'name': 'green_bean'}, {'frequency': 'f', 'synset': 'green_onion.n.01', 'synonyms': ['green_onion', 'spring_onion', 'scallion'], 'id': 515, 'def': 'a young onion before the bulb has enlarged', 'name': 'green_onion'}, {'frequency': 'r', 'synset': 'griddle.n.01', 'synonyms': ['griddle'], 'id': 516, 'def': 'cooking utensil consisting of a flat heated surface on which food is cooked', 'name': 'griddle'}, {'frequency': 'f', 'synset': 'grill.n.02', 'synonyms': ['grill', 'grille', 'grillwork', 'radiator_grille'], 'id': 517, 'def': 'a framework of metal bars used as a partition or a grate', 'name': 'grill'}, {'frequency': 'r', 'synset': 'grits.n.01', 'synonyms': ['grits', 'hominy_grits'], 'id': 518, 'def': 'coarsely ground corn boiled as a breakfast dish', 'name': 'grits'}, {'frequency': 'c', 'synset': 'grizzly.n.01', 'synonyms': ['grizzly', 'grizzly_bear'], 'id': 519, 'def': 'powerful brownish-yellow bear of the uplands of western North America', 'name': 'grizzly'}, {'frequency': 'c', 'synset': 'grocery_bag.n.01', 'synonyms': ['grocery_bag'], 'id': 520, 'def': "a sack for holding customer's groceries", 'name': 'grocery_bag'}, {'frequency': 'f', 'synset': 'guitar.n.01', 'synonyms': ['guitar'], 'id': 521, 'def': 'a stringed instrument usually having six strings; played by strumming or plucking', 'name': 'guitar'}, {'frequency': 'c', 'synset': 'gull.n.02', 'synonyms': ['gull', 'seagull'], 'id': 522, 'def': 'mostly white aquatic bird having long pointed wings and short legs', 'name': 'gull'}, {'frequency': 'c', 'synset': 'gun.n.01', 'synonyms': ['gun'], 'id': 523, 'def': 'a weapon that discharges a bullet at high velocity from a metal tube', 'name': 'gun'}, {'frequency': 'f', 'synset': 'hairbrush.n.01', 'synonyms': ['hairbrush'], 'id': 524, 'def': "a brush used to groom a person's hair", 'name': 'hairbrush'}, {'frequency': 'c', 'synset': 'hairnet.n.01', 'synonyms': ['hairnet'], 'id': 525, 'def': 'a small net that someone wears over their hair to keep it in place', 'name': 'hairnet'}, {'frequency': 'c', 'synset': 'hairpin.n.01', 'synonyms': ['hairpin'], 'id': 526, 'def': "a double pronged pin used to hold women's hair in place", 'name': 'hairpin'}, {'frequency': 'r', 'synset': 'halter.n.03', 'synonyms': ['halter_top'], 'id': 527, 'def': "a woman's top that fastens behind the back and neck leaving the back and arms uncovered", 'name': 'halter_top'}, {'frequency': 'f', 'synset': 'ham.n.01', 'synonyms': ['ham', 'jambon', 'gammon'], 'id': 528, 'def': 'meat cut from the thigh of a hog (usually smoked)', 'name': 'ham'}, {'frequency': 'c', 'synset': 'hamburger.n.01', 'synonyms': ['hamburger', 'beefburger', 'burger'], 'id': 529, 'def': 'a sandwich consisting of a patty of minced beef served on a bun', 'name': 'hamburger'}, {'frequency': 'c', 'synset': 'hammer.n.02', 'synonyms': ['hammer'], 'id': 530, 'def': 'a hand tool with a heavy head and a handle; used to deliver an impulsive force by striking', 'name': 'hammer'}, {'frequency': 'c', 'synset': 'hammock.n.02', 'synonyms': ['hammock'], 'id': 531, 'def': 'a hanging bed of canvas or rope netting (usually suspended between two trees)', 'name': 'hammock'}, {'frequency': 'r', 'synset': 'hamper.n.02', 'synonyms': ['hamper'], 'id': 532, 'def': 'a basket usually with a cover', 'name': 'hamper'}, {'frequency': 'c', 'synset': 'hamster.n.01', 'synonyms': ['hamster'], 'id': 533, 'def': 'short-tailed burrowing rodent with large cheek pouches', 'name': 'hamster'}, {'frequency': 'f', 'synset': 'hand_blower.n.01', 'synonyms': ['hair_dryer'], 'id': 534, 'def': 'a hand-held electric blower that can blow warm air onto the hair', 'name': 'hair_dryer'}, {'frequency': 'r', 'synset': 'hand_glass.n.01', 'synonyms': ['hand_glass', 'hand_mirror'], 'id': 535, 'def': 'a mirror intended to be held in the hand', 'name': 'hand_glass'}, {'frequency': 'f', 'synset': 'hand_towel.n.01', 'synonyms': ['hand_towel', 'face_towel'], 'id': 536, 'def': 'a small towel used to dry the hands or face', 'name': 'hand_towel'}, {'frequency': 'c', 'synset': 'handcart.n.01', 'synonyms': ['handcart', 'pushcart', 'hand_truck'], 'id': 537, 'def': 'wheeled vehicle that can be pushed by a person', 'name': 'handcart'}, {'frequency': 'r', 'synset': 'handcuff.n.01', 'synonyms': ['handcuff'], 'id': 538, 'def': 'shackle that consists of a metal loop that can be locked around the wrist', 'name': 'handcuff'}, {'frequency': 'c', 'synset': 'handkerchief.n.01', 'synonyms': ['handkerchief'], 'id': 539, 'def': 'a square piece of cloth used for wiping the eyes or nose or as a costume accessory', 'name': 'handkerchief'}, {'frequency': 'f', 'synset': 'handle.n.01', 'synonyms': ['handle', 'grip', 'handgrip'], 'id': 540, 'def': 'the appendage to an object that is designed to be held in order to use or move it', 'name': 'handle'}, {'frequency': 'r', 'synset': 'handsaw.n.01', 'synonyms': ['handsaw', "carpenter's_saw"], 'id': 541, 'def': 'a saw used with one hand for cutting wood', 'name': 'handsaw'}, {'frequency': 'r', 'synset': 'hardback.n.01', 'synonyms': ['hardback_book', 'hardcover_book'], 'id': 542, 'def': 'a book with cardboard or cloth or leather covers', 'name': 'hardback_book'}, {'frequency': 'r', 'synset': 'harmonium.n.01', 'synonyms': ['harmonium', 'organ_(musical_instrument)', 'reed_organ_(musical_instrument)'], 'id': 543, 'def': 'a free-reed instrument in which air is forced through the reeds by bellows', 'name': 'harmonium'}, {'frequency': 'f', 'synset': 'hat.n.01', 'synonyms': ['hat'], 'id': 544, 'def': 'headwear that protects the head from bad weather, sun, or worn for fashion', 'name': 'hat'}, {'frequency': 'r', 'synset': 'hatbox.n.01', 'synonyms': ['hatbox'], 'id': 545, 'def': 'a round piece of luggage for carrying hats', 'name': 'hatbox'}, {'frequency': 'c', 'synset': 'head_covering.n.01', 'synonyms': ['veil'], 'id': 546, 'def': 'a garment that covers the head OR face', 'name': 'veil'}, {'frequency': 'f', 'synset': 'headband.n.01', 'synonyms': ['headband'], 'id': 547, 'def': 'a band worn around or over the head', 'name': 'headband'}, {'frequency': 'f', 'synset': 'headboard.n.01', 'synonyms': ['headboard'], 'id': 548, 'def': 'a vertical board or panel forming the head of a bedstead', 'name': 'headboard'}, {'frequency': 'f', 'synset': 'headlight.n.01', 'synonyms': ['headlight', 'headlamp'], 'id': 549, 'def': 'a powerful light with reflector; attached to the front of an automobile or locomotive', 'name': 'headlight'}, {'frequency': 'c', 'synset': 'headscarf.n.01', 'synonyms': ['headscarf'], 'id': 550, 'def': 'a kerchief worn over the head and tied under the chin', 'name': 'headscarf'}, {'frequency': 'r', 'synset': 'headset.n.01', 'synonyms': ['headset'], 'id': 551, 'def': 'receiver consisting of a pair of headphones', 'name': 'headset'}, {'frequency': 'c', 'synset': 'headstall.n.01', 'synonyms': ['headstall_(for_horses)', 'headpiece_(for_horses)'], 'id': 552, 'def': "the band that is the part of a bridle that fits around a horse's head", 'name': 'headstall_(for_horses)'}, {'frequency': 'c', 'synset': 'heart.n.02', 'synonyms': ['heart'], 'id': 553, 'def': 'a muscular organ; its contractions move the blood through the body', 'name': 'heart'}, {'frequency': 'c', 'synset': 'heater.n.01', 'synonyms': ['heater', 'warmer'], 'id': 554, 'def': 'device that heats water or supplies warmth to a room', 'name': 'heater'}, {'frequency': 'c', 'synset': 'helicopter.n.01', 'synonyms': ['helicopter'], 'id': 555, 'def': 'an aircraft without wings that obtains its lift from the rotation of overhead blades', 'name': 'helicopter'}, {'frequency': 'f', 'synset': 'helmet.n.02', 'synonyms': ['helmet'], 'id': 556, 'def': 'a protective headgear made of hard material to resist blows', 'name': 'helmet'}, {'frequency': 'r', 'synset': 'heron.n.02', 'synonyms': ['heron'], 'id': 557, 'def': 'grey or white wading bird with long neck and long legs and (usually) long bill', 'name': 'heron'}, {'frequency': 'c', 'synset': 'highchair.n.01', 'synonyms': ['highchair', 'feeding_chair'], 'id': 558, 'def': 'a chair for feeding a very young child', 'name': 'highchair'}, {'frequency': 'f', 'synset': 'hinge.n.01', 'synonyms': ['hinge'], 'id': 559, 'def': 'a joint that holds two parts together so that one can swing relative to the other', 'name': 'hinge'}, {'frequency': 'r', 'synset': 'hippopotamus.n.01', 'synonyms': ['hippopotamus'], 'id': 560, 'def': 'massive thick-skinned animal living in or around rivers of tropical Africa', 'name': 'hippopotamus'}, {'frequency': 'r', 'synset': 'hockey_stick.n.01', 'synonyms': ['hockey_stick'], 'id': 561, 'def': 'sports implement consisting of a stick used by hockey players to move the puck', 'name': 'hockey_stick'}, {'frequency': 'c', 'synset': 'hog.n.03', 'synonyms': ['hog', 'pig'], 'id': 562, 'def': 'domestic swine', 'name': 'hog'}, {'frequency': 'f', 'synset': 'home_plate.n.01', 'synonyms': ['home_plate_(baseball)', 'home_base_(baseball)'], 'id': 563, 'def': '(baseball) a rubber slab where the batter stands; it must be touched by a base runner in order to score', 'name': 'home_plate_(baseball)'}, {'frequency': 'c', 'synset': 'honey.n.01', 'synonyms': ['honey'], 'id': 564, 'def': 'a sweet yellow liquid produced by bees', 'name': 'honey'}, {'frequency': 'f', 'synset': 'hood.n.06', 'synonyms': ['fume_hood', 'exhaust_hood'], 'id': 565, 'def': 'metal covering leading to a vent that exhausts smoke or fumes', 'name': 'fume_hood'}, {'frequency': 'f', 'synset': 'hook.n.05', 'synonyms': ['hook'], 'id': 566, 'def': 'a curved or bent implement for suspending or pulling something', 'name': 'hook'}, {'frequency': 'r', 'synset': 'hookah.n.01', 'synonyms': ['hookah', 'narghile', 'nargileh', 'sheesha', 'shisha', 'water_pipe'], 'id': 567, 'def': 'a tobacco pipe with a long flexible tube connected to a container where the smoke is cooled by passing through water', 'name': 'hookah'}, {'frequency': 'r', 'synset': 'hornet.n.01', 'synonyms': ['hornet'], 'id': 568, 'def': 'large stinging wasp', 'name': 'hornet'}, {'frequency': 'f', 'synset': 'horse.n.01', 'synonyms': ['horse'], 'id': 569, 'def': 'a common horse', 'name': 'horse'}, {'frequency': 'f', 'synset': 'hose.n.03', 'synonyms': ['hose', 'hosepipe'], 'id': 570, 'def': 'a flexible pipe for conveying a liquid or gas', 'name': 'hose'}, {'frequency': 'r', 'synset': 'hot-air_balloon.n.01', 'synonyms': ['hot-air_balloon'], 'id': 571, 'def': 'balloon for travel through the air in a basket suspended below a large bag of heated air', 'name': 'hot-air_balloon'}, {'frequency': 'r', 'synset': 'hot_plate.n.01', 'synonyms': ['hotplate'], 'id': 572, 'def': 'a portable electric appliance for heating or cooking or keeping food warm', 'name': 'hotplate'}, {'frequency': 'c', 'synset': 'hot_sauce.n.01', 'synonyms': ['hot_sauce'], 'id': 573, 'def': 'a pungent peppery sauce', 'name': 'hot_sauce'}, {'frequency': 'r', 'synset': 'hourglass.n.01', 'synonyms': ['hourglass'], 'id': 574, 'def': 'a sandglass timer that runs for sixty minutes', 'name': 'hourglass'}, {'frequency': 'r', 'synset': 'houseboat.n.01', 'synonyms': ['houseboat'], 'id': 575, 'def': 'a barge that is designed and equipped for use as a dwelling', 'name': 'houseboat'}, {'frequency': 'c', 'synset': 'hummingbird.n.01', 'synonyms': ['hummingbird'], 'id': 576, 'def': 'tiny American bird having brilliant iridescent plumage and long slender bills', 'name': 'hummingbird'}, {'frequency': 'r', 'synset': 'hummus.n.01', 'synonyms': ['hummus', 'humus', 'hommos', 'hoummos', 'humous'], 'id': 577, 'def': 'a thick spread made from mashed chickpeas', 'name': 'hummus'}, {'frequency': 'f', 'synset': 'ice_bear.n.01', 'synonyms': ['polar_bear'], 'id': 578, 'def': 'white bear of Arctic regions', 'name': 'polar_bear'}, {'frequency': 'c', 'synset': 'ice_cream.n.01', 'synonyms': ['icecream'], 'id': 579, 'def': 'frozen dessert containing cream and sugar and flavoring', 'name': 'icecream'}, {'frequency': 'r', 'synset': 'ice_lolly.n.01', 'synonyms': ['popsicle'], 'id': 580, 'def': 'ice cream or water ice on a small wooden stick', 'name': 'popsicle'}, {'frequency': 'c', 'synset': 'ice_maker.n.01', 'synonyms': ['ice_maker'], 'id': 581, 'def': 'an appliance included in some electric refrigerators for making ice cubes', 'name': 'ice_maker'}, {'frequency': 'r', 'synset': 'ice_pack.n.01', 'synonyms': ['ice_pack', 'ice_bag'], 'id': 582, 'def': 'a waterproof bag filled with ice: applied to the body (especially the head) to cool or reduce swelling', 'name': 'ice_pack'}, {'frequency': 'r', 'synset': 'ice_skate.n.01', 'synonyms': ['ice_skate'], 'id': 583, 'def': 'skate consisting of a boot with a steel blade fitted to the sole', 'name': 'ice_skate'}, {'frequency': 'c', 'synset': 'igniter.n.01', 'synonyms': ['igniter', 'ignitor', 'lighter'], 'id': 584, 'def': 'a substance or device used to start a fire', 'name': 'igniter'}, {'frequency': 'r', 'synset': 'inhaler.n.01', 'synonyms': ['inhaler', 'inhalator'], 'id': 585, 'def': 'a dispenser that produces a chemical vapor to be inhaled through mouth or nose', 'name': 'inhaler'}, {'frequency': 'f', 'synset': 'ipod.n.01', 'synonyms': ['iPod'], 'id': 586, 'def': 'a pocket-sized device used to play music files', 'name': 'iPod'}, {'frequency': 'c', 'synset': 'iron.n.04', 'synonyms': ['iron_(for_clothing)', 'smoothing_iron_(for_clothing)'], 'id': 587, 'def': 'home appliance consisting of a flat metal base that is heated and used to smooth cloth', 'name': 'iron_(for_clothing)'}, {'frequency': 'c', 'synset': 'ironing_board.n.01', 'synonyms': ['ironing_board'], 'id': 588, 'def': 'narrow padded board on collapsible supports; used for ironing clothes', 'name': 'ironing_board'}, {'frequency': 'f', 'synset': 'jacket.n.01', 'synonyms': ['jacket'], 'id': 589, 'def': 'a waist-length coat', 'name': 'jacket'}, {'frequency': 'c', 'synset': 'jam.n.01', 'synonyms': ['jam'], 'id': 590, 'def': 'preserve of crushed fruit', 'name': 'jam'}, {'frequency': 'f', 'synset': 'jar.n.01', 'synonyms': ['jar'], 'id': 591, 'def': 'a vessel (usually cylindrical) with a wide mouth and without handles', 'name': 'jar'}, {'frequency': 'f', 'synset': 'jean.n.01', 'synonyms': ['jean', 'blue_jean', 'denim'], 'id': 592, 'def': '(usually plural) close-fitting trousers of heavy denim for manual work or casual wear', 'name': 'jean'}, {'frequency': 'c', 'synset': 'jeep.n.01', 'synonyms': ['jeep', 'landrover'], 'id': 593, 'def': 'a car suitable for traveling over rough terrain', 'name': 'jeep'}, {'frequency': 'r', 'synset': 'jelly_bean.n.01', 'synonyms': ['jelly_bean', 'jelly_egg'], 'id': 594, 'def': 'sugar-glazed jellied candy', 'name': 'jelly_bean'}, {'frequency': 'f', 'synset': 'jersey.n.03', 'synonyms': ['jersey', 'T-shirt', 'tee_shirt'], 'id': 595, 'def': 'a close-fitting pullover shirt', 'name': 'jersey'}, {'frequency': 'c', 'synset': 'jet.n.01', 'synonyms': ['jet_plane', 'jet-propelled_plane'], 'id': 596, 'def': 'an airplane powered by one or more jet engines', 'name': 'jet_plane'}, {'frequency': 'r', 'synset': 'jewel.n.01', 'synonyms': ['jewel', 'gem', 'precious_stone'], 'id': 597, 'def': 'a precious or semiprecious stone incorporated into a piece of jewelry', 'name': 'jewel'}, {'frequency': 'c', 'synset': 'jewelry.n.01', 'synonyms': ['jewelry', 'jewellery'], 'id': 598, 'def': 'an adornment (as a bracelet or ring or necklace) made of precious metals and set with gems (or imitation gems)', 'name': 'jewelry'}, {'frequency': 'r', 'synset': 'joystick.n.02', 'synonyms': ['joystick'], 'id': 599, 'def': 'a control device for computers consisting of a vertical handle that can move freely in two directions', 'name': 'joystick'}, {'frequency': 'c', 'synset': 'jump_suit.n.01', 'synonyms': ['jumpsuit'], 'id': 600, 'def': "one-piece garment fashioned after a parachutist's uniform", 'name': 'jumpsuit'}, {'frequency': 'c', 'synset': 'kayak.n.01', 'synonyms': ['kayak'], 'id': 601, 'def': 'a small canoe consisting of a light frame made watertight with animal skins', 'name': 'kayak'}, {'frequency': 'r', 'synset': 'keg.n.02', 'synonyms': ['keg'], 'id': 602, 'def': 'small cask or barrel', 'name': 'keg'}, {'frequency': 'r', 'synset': 'kennel.n.01', 'synonyms': ['kennel', 'doghouse'], 'id': 603, 'def': 'outbuilding that serves as a shelter for a dog', 'name': 'kennel'}, {'frequency': 'c', 'synset': 'kettle.n.01', 'synonyms': ['kettle', 'boiler'], 'id': 604, 'def': 'a metal pot for stewing or boiling; usually has a lid', 'name': 'kettle'}, {'frequency': 'f', 'synset': 'key.n.01', 'synonyms': ['key'], 'id': 605, 'def': 'metal instrument used to unlock a lock', 'name': 'key'}, {'frequency': 'r', 'synset': 'keycard.n.01', 'synonyms': ['keycard'], 'id': 606, 'def': 'a plastic card used to gain access typically to a door', 'name': 'keycard'}, {'frequency': 'c', 'synset': 'kilt.n.01', 'synonyms': ['kilt'], 'id': 607, 'def': 'a knee-length pleated tartan skirt worn by men as part of the traditional dress in the Highlands of northern Scotland', 'name': 'kilt'}, {'frequency': 'c', 'synset': 'kimono.n.01', 'synonyms': ['kimono'], 'id': 608, 'def': 'a loose robe; imitated from robes originally worn by Japanese', 'name': 'kimono'}, {'frequency': 'f', 'synset': 'kitchen_sink.n.01', 'synonyms': ['kitchen_sink'], 'id': 609, 'def': 'a sink in a kitchen', 'name': 'kitchen_sink'}, {'frequency': 'r', 'synset': 'kitchen_table.n.01', 'synonyms': ['kitchen_table'], 'id': 610, 'def': 'a table in the kitchen', 'name': 'kitchen_table'}, {'frequency': 'f', 'synset': 'kite.n.03', 'synonyms': ['kite'], 'id': 611, 'def': 'plaything consisting of a light frame covered with tissue paper; flown in wind at end of a string', 'name': 'kite'}, {'frequency': 'c', 'synset': 'kitten.n.01', 'synonyms': ['kitten', 'kitty'], 'id': 612, 'def': 'young domestic cat', 'name': 'kitten'}, {'frequency': 'c', 'synset': 'kiwi.n.03', 'synonyms': ['kiwi_fruit'], 'id': 613, 'def': 'fuzzy brown egg-shaped fruit with slightly tart green flesh', 'name': 'kiwi_fruit'}, {'frequency': 'f', 'synset': 'knee_pad.n.01', 'synonyms': ['knee_pad'], 'id': 614, 'def': 'protective garment consisting of a pad worn by football or baseball or hockey players', 'name': 'knee_pad'}, {'frequency': 'f', 'synset': 'knife.n.01', 'synonyms': ['knife'], 'id': 615, 'def': 'tool with a blade and point used as a cutting instrument', 'name': 'knife'}, {'frequency': 'r', 'synset': 'knitting_needle.n.01', 'synonyms': ['knitting_needle'], 'id': 616, 'def': 'needle consisting of a slender rod with pointed ends; usually used in pairs', 'name': 'knitting_needle'}, {'frequency': 'f', 'synset': 'knob.n.02', 'synonyms': ['knob'], 'id': 617, 'def': 'a round handle often found on a door', 'name': 'knob'}, {'frequency': 'r', 'synset': 'knocker.n.05', 'synonyms': ['knocker_(on_a_door)', 'doorknocker'], 'id': 618, 'def': 'a device (usually metal and ornamental) attached by a hinge to a door', 'name': 'knocker_(on_a_door)'}, {'frequency': 'r', 'synset': 'koala.n.01', 'synonyms': ['koala', 'koala_bear'], 'id': 619, 'def': 'sluggish tailless Australian marsupial with grey furry ears and coat', 'name': 'koala'}, {'frequency': 'r', 'synset': 'lab_coat.n.01', 'synonyms': ['lab_coat', 'laboratory_coat'], 'id': 620, 'def': 'a light coat worn to protect clothing from substances used while working in a laboratory', 'name': 'lab_coat'}, {'frequency': 'f', 'synset': 'ladder.n.01', 'synonyms': ['ladder'], 'id': 621, 'def': 'steps consisting of two parallel members connected by rungs', 'name': 'ladder'}, {'frequency': 'c', 'synset': 'ladle.n.01', 'synonyms': ['ladle'], 'id': 622, 'def': 'a spoon-shaped vessel with a long handle frequently used to transfer liquids', 'name': 'ladle'}, {'frequency': 'c', 'synset': 'ladybug.n.01', 'synonyms': ['ladybug', 'ladybeetle', 'ladybird_beetle'], 'id': 623, 'def': 'small round bright-colored and spotted beetle, typically red and black', 'name': 'ladybug'}, {'frequency': 'f', 'synset': 'lamb.n.01', 'synonyms': ['lamb_(animal)'], 'id': 624, 'def': 'young sheep', 'name': 'lamb_(animal)'}, {'frequency': 'r', 'synset': 'lamb_chop.n.01', 'synonyms': ['lamb-chop', 'lambchop'], 'id': 625, 'def': 'chop cut from a lamb', 'name': 'lamb-chop'}, {'frequency': 'f', 'synset': 'lamp.n.02', 'synonyms': ['lamp'], 'id': 626, 'def': 'a piece of furniture holding one or more electric light bulbs', 'name': 'lamp'}, {'frequency': 'f', 'synset': 'lamppost.n.01', 'synonyms': ['lamppost'], 'id': 627, 'def': 'a metal post supporting an outdoor lamp (such as a streetlight)', 'name': 'lamppost'}, {'frequency': 'f', 'synset': 'lampshade.n.01', 'synonyms': ['lampshade'], 'id': 628, 'def': 'a protective ornamental shade used to screen a light bulb from direct view', 'name': 'lampshade'}, {'frequency': 'c', 'synset': 'lantern.n.01', 'synonyms': ['lantern'], 'id': 629, 'def': 'light in a transparent protective case', 'name': 'lantern'}, {'frequency': 'f', 'synset': 'lanyard.n.02', 'synonyms': ['lanyard', 'laniard'], 'id': 630, 'def': 'a cord worn around the neck to hold a knife or whistle, etc.', 'name': 'lanyard'}, {'frequency': 'f', 'synset': 'laptop.n.01', 'synonyms': ['laptop_computer', 'notebook_computer'], 'id': 631, 'def': 'a portable computer small enough to use in your lap', 'name': 'laptop_computer'}, {'frequency': 'r', 'synset': 'lasagna.n.01', 'synonyms': ['lasagna', 'lasagne'], 'id': 632, 'def': 'baked dish of layers of lasagna pasta with sauce and cheese and meat or vegetables', 'name': 'lasagna'}, {'frequency': 'f', 'synset': 'latch.n.02', 'synonyms': ['latch'], 'id': 633, 'def': 'a bar that can be lowered or slid into a groove to fasten a door or gate', 'name': 'latch'}, {'frequency': 'r', 'synset': 'lawn_mower.n.01', 'synonyms': ['lawn_mower'], 'id': 634, 'def': 'garden tool for mowing grass on lawns', 'name': 'lawn_mower'}, {'frequency': 'r', 'synset': 'leather.n.01', 'synonyms': ['leather'], 'id': 635, 'def': 'an animal skin made smooth and flexible by removing the hair and then tanning', 'name': 'leather'}, {'frequency': 'c', 'synset': 'legging.n.01', 'synonyms': ['legging_(clothing)', 'leging_(clothing)', 'leg_covering'], 'id': 636, 'def': 'a garment covering the leg (usually extending from the knee to the ankle)', 'name': 'legging_(clothing)'}, {'frequency': 'c', 'synset': 'lego.n.01', 'synonyms': ['Lego', 'Lego_set'], 'id': 637, 'def': "a child's plastic construction set for making models from blocks", 'name': 'Lego'}, {'frequency': 'r', 'synset': 'legume.n.02', 'synonyms': ['legume'], 'id': 638, 'def': 'the fruit or seed of bean or pea plants', 'name': 'legume'}, {'frequency': 'f', 'synset': 'lemon.n.01', 'synonyms': ['lemon'], 'id': 639, 'def': 'yellow oval fruit with juicy acidic flesh', 'name': 'lemon'}, {'frequency': 'r', 'synset': 'lemonade.n.01', 'synonyms': ['lemonade'], 'id': 640, 'def': 'sweetened beverage of diluted lemon juice', 'name': 'lemonade'}, {'frequency': 'f', 'synset': 'lettuce.n.02', 'synonyms': ['lettuce'], 'id': 641, 'def': 'leafy plant commonly eaten in salad or on sandwiches', 'name': 'lettuce'}, {'frequency': 'f', 'synset': 'license_plate.n.01', 'synonyms': ['license_plate', 'numberplate'], 'id': 642, 'def': "a plate mounted on the front and back of car and bearing the car's registration number", 'name': 'license_plate'}, {'frequency': 'f', 'synset': 'life_buoy.n.01', 'synonyms': ['life_buoy', 'lifesaver', 'life_belt', 'life_ring'], 'id': 643, 'def': 'a ring-shaped life preserver used to prevent drowning (NOT a life-jacket or vest)', 'name': 'life_buoy'}, {'frequency': 'f', 'synset': 'life_jacket.n.01', 'synonyms': ['life_jacket', 'life_vest'], 'id': 644, 'def': 'life preserver consisting of a sleeveless jacket of buoyant or inflatable design', 'name': 'life_jacket'}, {'frequency': 'f', 'synset': 'light_bulb.n.01', 'synonyms': ['lightbulb'], 'id': 645, 'def': 'lightblub/source of light', 'name': 'lightbulb'}, {'frequency': 'r', 'synset': 'lightning_rod.n.02', 'synonyms': ['lightning_rod', 'lightning_conductor'], 'id': 646, 'def': 'a metallic conductor that is attached to a high point and leads to the ground', 'name': 'lightning_rod'}, {'frequency': 'f', 'synset': 'lime.n.06', 'synonyms': ['lime'], 'id': 647, 'def': 'the green acidic fruit of any of various lime trees', 'name': 'lime'}, {'frequency': 'r', 'synset': 'limousine.n.01', 'synonyms': ['limousine'], 'id': 648, 'def': 'long luxurious car; usually driven by a chauffeur', 'name': 'limousine'}, {'frequency': 'c', 'synset': 'lion.n.01', 'synonyms': ['lion'], 'id': 649, 'def': 'large gregarious predatory cat of Africa and India', 'name': 'lion'}, {'frequency': 'c', 'synset': 'lip_balm.n.01', 'synonyms': ['lip_balm'], 'id': 650, 'def': 'a balm applied to the lips', 'name': 'lip_balm'}, {'frequency': 'r', 'synset': 'liquor.n.01', 'synonyms': ['liquor', 'spirits', 'hard_liquor', 'liqueur', 'cordial'], 'id': 651, 'def': 'liquor or beer', 'name': 'liquor'}, {'frequency': 'c', 'synset': 'lizard.n.01', 'synonyms': ['lizard'], 'id': 652, 'def': 'a reptile with usually two pairs of legs and a tapering tail', 'name': 'lizard'}, {'frequency': 'f', 'synset': 'log.n.01', 'synonyms': ['log'], 'id': 653, 'def': 'a segment of the trunk of a tree when stripped of branches', 'name': 'log'}, {'frequency': 'c', 'synset': 'lollipop.n.02', 'synonyms': ['lollipop'], 'id': 654, 'def': 'hard candy on a stick', 'name': 'lollipop'}, {'frequency': 'f', 'synset': 'loudspeaker.n.01', 'synonyms': ['speaker_(stero_equipment)'], 'id': 655, 'def': 'electronic device that produces sound often as part of a stereo system', 'name': 'speaker_(stero_equipment)'}, {'frequency': 'c', 'synset': 'love_seat.n.01', 'synonyms': ['loveseat'], 'id': 656, 'def': 'small sofa that seats two people', 'name': 'loveseat'}, {'frequency': 'r', 'synset': 'machine_gun.n.01', 'synonyms': ['machine_gun'], 'id': 657, 'def': 'a rapidly firing automatic gun', 'name': 'machine_gun'}, {'frequency': 'f', 'synset': 'magazine.n.02', 'synonyms': ['magazine'], 'id': 658, 'def': 'a paperback periodic publication', 'name': 'magazine'}, {'frequency': 'f', 'synset': 'magnet.n.01', 'synonyms': ['magnet'], 'id': 659, 'def': 'a device that attracts iron and produces a magnetic field', 'name': 'magnet'}, {'frequency': 'c', 'synset': 'mail_slot.n.01', 'synonyms': ['mail_slot'], 'id': 660, 'def': 'a slot (usually in a door) through which mail can be delivered', 'name': 'mail_slot'}, {'frequency': 'f', 'synset': 'mailbox.n.01', 'synonyms': ['mailbox_(at_home)', 'letter_box_(at_home)'], 'id': 661, 'def': 'a private box for delivery of mail', 'name': 'mailbox_(at_home)'}, {'frequency': 'r', 'synset': 'mallard.n.01', 'synonyms': ['mallard'], 'id': 662, 'def': 'wild dabbling duck from which domestic ducks are descended', 'name': 'mallard'}, {'frequency': 'r', 'synset': 'mallet.n.01', 'synonyms': ['mallet'], 'id': 663, 'def': 'a sports implement with a long handle and a hammer-like head used to hit a ball', 'name': 'mallet'}, {'frequency': 'r', 'synset': 'mammoth.n.01', 'synonyms': ['mammoth'], 'id': 664, 'def': 'any of numerous extinct elephants widely distributed in the Pleistocene', 'name': 'mammoth'}, {'frequency': 'r', 'synset': 'manatee.n.01', 'synonyms': ['manatee'], 'id': 665, 'def': 'sirenian mammal of tropical coastal waters of America', 'name': 'manatee'}, {'frequency': 'c', 'synset': 'mandarin.n.05', 'synonyms': ['mandarin_orange'], 'id': 666, 'def': 'a somewhat flat reddish-orange loose skinned citrus of China', 'name': 'mandarin_orange'}, {'frequency': 'c', 'synset': 'manger.n.01', 'synonyms': ['manger', 'trough'], 'id': 667, 'def': 'a container (usually in a barn or stable) from which cattle or horses feed', 'name': 'manger'}, {'frequency': 'f', 'synset': 'manhole.n.01', 'synonyms': ['manhole'], 'id': 668, 'def': 'a hole (usually with a flush cover) through which a person can gain access to an underground structure', 'name': 'manhole'}, {'frequency': 'f', 'synset': 'map.n.01', 'synonyms': ['map'], 'id': 669, 'def': "a diagrammatic representation of the earth's surface (or part of it)", 'name': 'map'}, {'frequency': 'f', 'synset': 'marker.n.03', 'synonyms': ['marker'], 'id': 670, 'def': 'a writing implement for making a mark', 'name': 'marker'}, {'frequency': 'r', 'synset': 'martini.n.01', 'synonyms': ['martini'], 'id': 671, 'def': 'a cocktail made of gin (or vodka) with dry vermouth', 'name': 'martini'}, {'frequency': 'r', 'synset': 'mascot.n.01', 'synonyms': ['mascot'], 'id': 672, 'def': 'a person or animal that is adopted by a team or other group as a symbolic figure', 'name': 'mascot'}, {'frequency': 'c', 'synset': 'mashed_potato.n.01', 'synonyms': ['mashed_potato'], 'id': 673, 'def': 'potato that has been peeled and boiled and then mashed', 'name': 'mashed_potato'}, {'frequency': 'r', 'synset': 'masher.n.02', 'synonyms': ['masher'], 'id': 674, 'def': 'a kitchen utensil used for mashing (e.g. potatoes)', 'name': 'masher'}, {'frequency': 'f', 'synset': 'mask.n.04', 'synonyms': ['mask', 'facemask'], 'id': 675, 'def': 'a protective covering worn over the face', 'name': 'mask'}, {'frequency': 'f', 'synset': 'mast.n.01', 'synonyms': ['mast'], 'id': 676, 'def': 'a vertical spar for supporting sails', 'name': 'mast'}, {'frequency': 'c', 'synset': 'mat.n.03', 'synonyms': ['mat_(gym_equipment)', 'gym_mat'], 'id': 677, 'def': 'sports equipment consisting of a piece of thick padding on the floor for gymnastics', 'name': 'mat_(gym_equipment)'}, {'frequency': 'r', 'synset': 'matchbox.n.01', 'synonyms': ['matchbox'], 'id': 678, 'def': 'a box for holding matches', 'name': 'matchbox'}, {'frequency': 'f', 'synset': 'mattress.n.01', 'synonyms': ['mattress'], 'id': 679, 'def': 'a thick pad filled with resilient material used as a bed or part of a bed', 'name': 'mattress'}, {'frequency': 'c', 'synset': 'measuring_cup.n.01', 'synonyms': ['measuring_cup'], 'id': 680, 'def': 'graduated cup used to measure liquid or granular ingredients', 'name': 'measuring_cup'}, {'frequency': 'c', 'synset': 'measuring_stick.n.01', 'synonyms': ['measuring_stick', 'ruler_(measuring_stick)', 'measuring_rod'], 'id': 681, 'def': 'measuring instrument having a sequence of marks at regular intervals', 'name': 'measuring_stick'}, {'frequency': 'c', 'synset': 'meatball.n.01', 'synonyms': ['meatball'], 'id': 682, 'def': 'ground meat formed into a ball and fried or simmered in broth', 'name': 'meatball'}, {'frequency': 'c', 'synset': 'medicine.n.02', 'synonyms': ['medicine'], 'id': 683, 'def': 'something that treats or prevents or alleviates the symptoms of disease', 'name': 'medicine'}, {'frequency': 'c', 'synset': 'melon.n.01', 'synonyms': ['melon'], 'id': 684, 'def': 'fruit of the gourd family having a hard rind and sweet juicy flesh', 'name': 'melon'}, {'frequency': 'f', 'synset': 'microphone.n.01', 'synonyms': ['microphone'], 'id': 685, 'def': 'device for converting sound waves into electrical energy', 'name': 'microphone'}, {'frequency': 'r', 'synset': 'microscope.n.01', 'synonyms': ['microscope'], 'id': 686, 'def': 'magnifier of the image of small objects', 'name': 'microscope'}, {'frequency': 'f', 'synset': 'microwave.n.02', 'synonyms': ['microwave_oven'], 'id': 687, 'def': 'kitchen appliance that cooks food by passing an electromagnetic wave through it', 'name': 'microwave_oven'}, {'frequency': 'r', 'synset': 'milestone.n.01', 'synonyms': ['milestone', 'milepost'], 'id': 688, 'def': 'stone post at side of a road to show distances', 'name': 'milestone'}, {'frequency': 'f', 'synset': 'milk.n.01', 'synonyms': ['milk'], 'id': 689, 'def': 'a white nutritious liquid secreted by mammals and used as food by human beings', 'name': 'milk'}, {'frequency': 'r', 'synset': 'milk_can.n.01', 'synonyms': ['milk_can'], 'id': 690, 'def': 'can for transporting milk', 'name': 'milk_can'}, {'frequency': 'r', 'synset': 'milkshake.n.01', 'synonyms': ['milkshake'], 'id': 691, 'def': 'frothy drink of milk and flavoring and sometimes fruit or ice cream', 'name': 'milkshake'}, {'frequency': 'f', 'synset': 'minivan.n.01', 'synonyms': ['minivan'], 'id': 692, 'def': 'a small box-shaped passenger van', 'name': 'minivan'}, {'frequency': 'r', 'synset': 'mint.n.05', 'synonyms': ['mint_candy'], 'id': 693, 'def': 'a candy that is flavored with a mint oil', 'name': 'mint_candy'}, {'frequency': 'f', 'synset': 'mirror.n.01', 'synonyms': ['mirror'], 'id': 694, 'def': 'polished surface that forms images by reflecting light', 'name': 'mirror'}, {'frequency': 'c', 'synset': 'mitten.n.01', 'synonyms': ['mitten'], 'id': 695, 'def': 'glove that encases the thumb separately and the other four fingers together', 'name': 'mitten'}, {'frequency': 'c', 'synset': 'mixer.n.04', 'synonyms': ['mixer_(kitchen_tool)', 'stand_mixer'], 'id': 696, 'def': 'a kitchen utensil that is used for mixing foods', 'name': 'mixer_(kitchen_tool)'}, {'frequency': 'c', 'synset': 'money.n.03', 'synonyms': ['money'], 'id': 697, 'def': 'the official currency issued by a government or national bank', 'name': 'money'}, {'frequency': 'f', 'synset': 'monitor.n.04', 'synonyms': ['monitor_(computer_equipment) computer_monitor'], 'id': 698, 'def': 'a computer monitor', 'name': 'monitor_(computer_equipment) computer_monitor'}, {'frequency': 'c', 'synset': 'monkey.n.01', 'synonyms': ['monkey'], 'id': 699, 'def': 'any of various long-tailed primates', 'name': 'monkey'}, {'frequency': 'f', 'synset': 'motor.n.01', 'synonyms': ['motor'], 'id': 700, 'def': 'machine that converts other forms of energy into mechanical energy and so imparts motion', 'name': 'motor'}, {'frequency': 'f', 'synset': 'motor_scooter.n.01', 'synonyms': ['motor_scooter', 'scooter'], 'id': 701, 'def': 'a wheeled vehicle with small wheels and a low-powered engine', 'name': 'motor_scooter'}, {'frequency': 'r', 'synset': 'motor_vehicle.n.01', 'synonyms': ['motor_vehicle', 'automotive_vehicle'], 'id': 702, 'def': 'a self-propelled wheeled vehicle that does not run on rails', 'name': 'motor_vehicle'}, {'frequency': 'f', 'synset': 'motorcycle.n.01', 'synonyms': ['motorcycle'], 'id': 703, 'def': 'a motor vehicle with two wheels and a strong frame', 'name': 'motorcycle'}, {'frequency': 'f', 'synset': 'mound.n.01', 'synonyms': ['mound_(baseball)', "pitcher's_mound"], 'id': 704, 'def': '(baseball) the slight elevation on which the pitcher stands', 'name': 'mound_(baseball)'}, {'frequency': 'f', 'synset': 'mouse.n.04', 'synonyms': ['mouse_(computer_equipment)', 'computer_mouse'], 'id': 705, 'def': 'a computer input device that controls an on-screen pointer (does not include trackpads / touchpads)', 'name': 'mouse_(computer_equipment)'}, {'frequency': 'f', 'synset': 'mousepad.n.01', 'synonyms': ['mousepad'], 'id': 706, 'def': 'a small portable pad that provides an operating surface for a computer mouse', 'name': 'mousepad'}, {'frequency': 'c', 'synset': 'muffin.n.01', 'synonyms': ['muffin'], 'id': 707, 'def': 'a sweet quick bread baked in a cup-shaped pan', 'name': 'muffin'}, {'frequency': 'f', 'synset': 'mug.n.04', 'synonyms': ['mug'], 'id': 708, 'def': 'with handle and usually cylindrical', 'name': 'mug'}, {'frequency': 'f', 'synset': 'mushroom.n.02', 'synonyms': ['mushroom'], 'id': 709, 'def': 'a common mushroom', 'name': 'mushroom'}, {'frequency': 'r', 'synset': 'music_stool.n.01', 'synonyms': ['music_stool', 'piano_stool'], 'id': 710, 'def': 'a stool for piano players; usually adjustable in height', 'name': 'music_stool'}, {'frequency': 'c', 'synset': 'musical_instrument.n.01', 'synonyms': ['musical_instrument', 'instrument_(musical)'], 'id': 711, 'def': 'any of various devices or contrivances that can be used to produce musical tones or sounds', 'name': 'musical_instrument'}, {'frequency': 'r', 'synset': 'nailfile.n.01', 'synonyms': ['nailfile'], 'id': 712, 'def': 'a small flat file for shaping the nails', 'name': 'nailfile'}, {'frequency': 'f', 'synset': 'napkin.n.01', 'synonyms': ['napkin', 'table_napkin', 'serviette'], 'id': 713, 'def': 'a small piece of table linen or paper that is used to wipe the mouth and to cover the lap in order to protect clothing', 'name': 'napkin'}, {'frequency': 'r', 'synset': 'neckerchief.n.01', 'synonyms': ['neckerchief'], 'id': 714, 'def': 'a kerchief worn around the neck', 'name': 'neckerchief'}, {'frequency': 'f', 'synset': 'necklace.n.01', 'synonyms': ['necklace'], 'id': 715, 'def': 'jewelry consisting of a cord or chain (often bearing gems) worn about the neck as an ornament', 'name': 'necklace'}, {'frequency': 'f', 'synset': 'necktie.n.01', 'synonyms': ['necktie', 'tie_(necktie)'], 'id': 716, 'def': 'neckwear consisting of a long narrow piece of material worn under a collar and tied in knot at the front', 'name': 'necktie'}, {'frequency': 'c', 'synset': 'needle.n.03', 'synonyms': ['needle'], 'id': 717, 'def': 'a sharp pointed implement (usually metal)', 'name': 'needle'}, {'frequency': 'c', 'synset': 'nest.n.01', 'synonyms': ['nest'], 'id': 718, 'def': 'a structure in which animals lay eggs or give birth to their young', 'name': 'nest'}, {'frequency': 'f', 'synset': 'newspaper.n.01', 'synonyms': ['newspaper', 'paper_(newspaper)'], 'id': 719, 'def': 'a daily or weekly publication on folded sheets containing news, articles, and advertisements', 'name': 'newspaper'}, {'frequency': 'c', 'synset': 'newsstand.n.01', 'synonyms': ['newsstand'], 'id': 720, 'def': 'a stall where newspapers and other periodicals are sold', 'name': 'newsstand'}, {'frequency': 'c', 'synset': 'nightwear.n.01', 'synonyms': ['nightshirt', 'nightwear', 'sleepwear', 'nightclothes'], 'id': 721, 'def': 'garments designed to be worn in bed', 'name': 'nightshirt'}, {'frequency': 'r', 'synset': 'nosebag.n.01', 'synonyms': ['nosebag_(for_animals)', 'feedbag'], 'id': 722, 'def': 'a canvas bag that is used to feed an animal (such as a horse); covers the muzzle and fastens at the top of the head', 'name': 'nosebag_(for_animals)'}, {'frequency': 'c', 'synset': 'noseband.n.01', 'synonyms': ['noseband_(for_animals)', 'nosepiece_(for_animals)'], 'id': 723, 'def': "a strap that is the part of a bridle that goes over the animal's nose", 'name': 'noseband_(for_animals)'}, {'frequency': 'f', 'synset': 'notebook.n.01', 'synonyms': ['notebook'], 'id': 724, 'def': 'a book with blank pages for recording notes or memoranda', 'name': 'notebook'}, {'frequency': 'c', 'synset': 'notepad.n.01', 'synonyms': ['notepad'], 'id': 725, 'def': 'a pad of paper for keeping notes', 'name': 'notepad'}, {'frequency': 'f', 'synset': 'nut.n.03', 'synonyms': ['nut'], 'id': 726, 'def': 'a small metal block (usually square or hexagonal) with internal screw thread to be fitted onto a bolt', 'name': 'nut'}, {'frequency': 'r', 'synset': 'nutcracker.n.01', 'synonyms': ['nutcracker'], 'id': 727, 'def': 'a hand tool used to crack nuts open', 'name': 'nutcracker'}, {'frequency': 'f', 'synset': 'oar.n.01', 'synonyms': ['oar'], 'id': 728, 'def': 'an implement used to propel or steer a boat', 'name': 'oar'}, {'frequency': 'r', 'synset': 'octopus.n.01', 'synonyms': ['octopus_(food)'], 'id': 729, 'def': 'tentacles of octopus prepared as food', 'name': 'octopus_(food)'}, {'frequency': 'r', 'synset': 'octopus.n.02', 'synonyms': ['octopus_(animal)'], 'id': 730, 'def': 'bottom-living cephalopod having a soft oval body with eight long tentacles', 'name': 'octopus_(animal)'}, {'frequency': 'c', 'synset': 'oil_lamp.n.01', 'synonyms': ['oil_lamp', 'kerosene_lamp', 'kerosine_lamp'], 'id': 731, 'def': 'a lamp that burns oil (as kerosine) for light', 'name': 'oil_lamp'}, {'frequency': 'c', 'synset': 'olive_oil.n.01', 'synonyms': ['olive_oil'], 'id': 732, 'def': 'oil from olives', 'name': 'olive_oil'}, {'frequency': 'r', 'synset': 'omelet.n.01', 'synonyms': ['omelet', 'omelette'], 'id': 733, 'def': 'beaten eggs cooked until just set; may be folded around e.g. ham or cheese or jelly', 'name': 'omelet'}, {'frequency': 'f', 'synset': 'onion.n.01', 'synonyms': ['onion'], 'id': 734, 'def': 'the bulb of an onion plant', 'name': 'onion'}, {'frequency': 'f', 'synset': 'orange.n.01', 'synonyms': ['orange_(fruit)'], 'id': 735, 'def': 'orange (FRUIT of an orange tree)', 'name': 'orange_(fruit)'}, {'frequency': 'c', 'synset': 'orange_juice.n.01', 'synonyms': ['orange_juice'], 'id': 736, 'def': 'bottled or freshly squeezed juice of oranges', 'name': 'orange_juice'}, {'frequency': 'c', 'synset': 'ostrich.n.02', 'synonyms': ['ostrich'], 'id': 737, 'def': 'fast-running African flightless bird with two-toed feet; largest living bird', 'name': 'ostrich'}, {'frequency': 'f', 'synset': 'ottoman.n.03', 'synonyms': ['ottoman', 'pouf', 'pouffe', 'hassock'], 'id': 738, 'def': 'a thick standalone cushion used as a seat or footrest, often next to a chair', 'name': 'ottoman'}, {'frequency': 'f', 'synset': 'oven.n.01', 'synonyms': ['oven'], 'id': 739, 'def': 'kitchen appliance used for baking or roasting', 'name': 'oven'}, {'frequency': 'c', 'synset': 'overall.n.01', 'synonyms': ['overalls_(clothing)'], 'id': 740, 'def': 'work clothing consisting of denim trousers usually with a bib and shoulder straps', 'name': 'overalls_(clothing)'}, {'frequency': 'c', 'synset': 'owl.n.01', 'synonyms': ['owl'], 'id': 741, 'def': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes', 'name': 'owl'}, {'frequency': 'c', 'synset': 'packet.n.03', 'synonyms': ['packet'], 'id': 742, 'def': 'a small package or bundle', 'name': 'packet'}, {'frequency': 'r', 'synset': 'pad.n.03', 'synonyms': ['inkpad', 'inking_pad', 'stamp_pad'], 'id': 743, 'def': 'absorbent material saturated with ink used to transfer ink evenly to a rubber stamp', 'name': 'inkpad'}, {'frequency': 'c', 'synset': 'pad.n.04', 'synonyms': ['pad'], 'id': 744, 'def': 'mostly arm/knee pads labeled', 'name': 'pad'}, {'frequency': 'f', 'synset': 'paddle.n.04', 'synonyms': ['paddle', 'boat_paddle'], 'id': 745, 'def': 'a short light oar used without an oarlock to propel a canoe or small boat', 'name': 'paddle'}, {'frequency': 'c', 'synset': 'padlock.n.01', 'synonyms': ['padlock'], 'id': 746, 'def': 'a detachable, portable lock', 'name': 'padlock'}, {'frequency': 'c', 'synset': 'paintbrush.n.01', 'synonyms': ['paintbrush'], 'id': 747, 'def': 'a brush used as an applicator to apply paint', 'name': 'paintbrush'}, {'frequency': 'f', 'synset': 'painting.n.01', 'synonyms': ['painting'], 'id': 748, 'def': 'graphic art consisting of an artistic composition made by applying paints to a surface', 'name': 'painting'}, {'frequency': 'f', 'synset': 'pajama.n.02', 'synonyms': ['pajamas', 'pyjamas'], 'id': 749, 'def': 'loose-fitting nightclothes worn for sleeping or lounging', 'name': 'pajamas'}, {'frequency': 'c', 'synset': 'palette.n.02', 'synonyms': ['palette', 'pallet'], 'id': 750, 'def': 'board that provides a flat surface on which artists mix paints and the range of colors used', 'name': 'palette'}, {'frequency': 'f', 'synset': 'pan.n.01', 'synonyms': ['pan_(for_cooking)', 'cooking_pan'], 'id': 751, 'def': 'cooking utensil consisting of a wide metal vessel', 'name': 'pan_(for_cooking)'}, {'frequency': 'r', 'synset': 'pan.n.03', 'synonyms': ['pan_(metal_container)'], 'id': 752, 'def': 'shallow container made of metal', 'name': 'pan_(metal_container)'}, {'frequency': 'c', 'synset': 'pancake.n.01', 'synonyms': ['pancake'], 'id': 753, 'def': 'a flat cake of thin batter fried on both sides on a griddle', 'name': 'pancake'}, {'frequency': 'r', 'synset': 'pantyhose.n.01', 'synonyms': ['pantyhose'], 'id': 754, 'def': "a woman's tights consisting of underpants and stockings", 'name': 'pantyhose'}, {'frequency': 'r', 'synset': 'papaya.n.02', 'synonyms': ['papaya'], 'id': 755, 'def': 'large oval melon-like tropical fruit with yellowish flesh', 'name': 'papaya'}, {'frequency': 'f', 'synset': 'paper_plate.n.01', 'synonyms': ['paper_plate'], 'id': 756, 'def': 'a disposable plate made of cardboard', 'name': 'paper_plate'}, {'frequency': 'f', 'synset': 'paper_towel.n.01', 'synonyms': ['paper_towel'], 'id': 757, 'def': 'a disposable towel made of absorbent paper', 'name': 'paper_towel'}, {'frequency': 'r', 'synset': 'paperback_book.n.01', 'synonyms': ['paperback_book', 'paper-back_book', 'softback_book', 'soft-cover_book'], 'id': 758, 'def': 'a book with paper covers', 'name': 'paperback_book'}, {'frequency': 'r', 'synset': 'paperweight.n.01', 'synonyms': ['paperweight'], 'id': 759, 'def': 'a weight used to hold down a stack of papers', 'name': 'paperweight'}, {'frequency': 'c', 'synset': 'parachute.n.01', 'synonyms': ['parachute'], 'id': 760, 'def': 'rescue equipment consisting of a device that fills with air and retards your fall', 'name': 'parachute'}, {'frequency': 'c', 'synset': 'parakeet.n.01', 'synonyms': ['parakeet', 'parrakeet', 'parroket', 'paraquet', 'paroquet', 'parroquet'], 'id': 761, 'def': 'any of numerous small slender long-tailed parrots', 'name': 'parakeet'}, {'frequency': 'c', 'synset': 'parasail.n.01', 'synonyms': ['parasail_(sports)'], 'id': 762, 'def': 'parachute that will lift a person up into the air when it is towed by a motorboat or a car', 'name': 'parasail_(sports)'}, {'frequency': 'c', 'synset': 'parasol.n.01', 'synonyms': ['parasol', 'sunshade'], 'id': 763, 'def': 'a handheld collapsible source of shade', 'name': 'parasol'}, {'frequency': 'r', 'synset': 'parchment.n.01', 'synonyms': ['parchment'], 'id': 764, 'def': 'a superior paper resembling sheepskin', 'name': 'parchment'}, {'frequency': 'c', 'synset': 'parka.n.01', 'synonyms': ['parka', 'anorak'], 'id': 765, 'def': "a kind of heavy jacket (`windcheater' is a British term)", 'name': 'parka'}, {'frequency': 'f', 'synset': 'parking_meter.n.01', 'synonyms': ['parking_meter'], 'id': 766, 'def': 'a coin-operated timer located next to a parking space', 'name': 'parking_meter'}, {'frequency': 'c', 'synset': 'parrot.n.01', 'synonyms': ['parrot'], 'id': 767, 'def': 'usually brightly colored tropical birds with short hooked beaks and the ability to mimic sounds', 'name': 'parrot'}, {'frequency': 'c', 'synset': 'passenger_car.n.01', 'synonyms': ['passenger_car_(part_of_a_train)', 'coach_(part_of_a_train)'], 'id': 768, 'def': 'a railcar where passengers ride', 'name': 'passenger_car_(part_of_a_train)'}, {'frequency': 'r', 'synset': 'passenger_ship.n.01', 'synonyms': ['passenger_ship'], 'id': 769, 'def': 'a ship built to carry passengers', 'name': 'passenger_ship'}, {'frequency': 'c', 'synset': 'passport.n.02', 'synonyms': ['passport'], 'id': 770, 'def': 'a document issued by a country to a citizen allowing that person to travel abroad and re-enter the home country', 'name': 'passport'}, {'frequency': 'f', 'synset': 'pastry.n.02', 'synonyms': ['pastry'], 'id': 771, 'def': 'any of various baked foods made of dough or batter', 'name': 'pastry'}, {'frequency': 'r', 'synset': 'patty.n.01', 'synonyms': ['patty_(food)'], 'id': 772, 'def': 'small flat mass of chopped food', 'name': 'patty_(food)'}, {'frequency': 'c', 'synset': 'pea.n.01', 'synonyms': ['pea_(food)'], 'id': 773, 'def': 'seed of a pea plant used for food', 'name': 'pea_(food)'}, {'frequency': 'c', 'synset': 'peach.n.03', 'synonyms': ['peach'], 'id': 774, 'def': 'downy juicy fruit with sweet yellowish or whitish flesh', 'name': 'peach'}, {'frequency': 'c', 'synset': 'peanut_butter.n.01', 'synonyms': ['peanut_butter'], 'id': 775, 'def': 'a spread made from ground peanuts', 'name': 'peanut_butter'}, {'frequency': 'f', 'synset': 'pear.n.01', 'synonyms': ['pear'], 'id': 776, 'def': 'sweet juicy gritty-textured fruit available in many varieties', 'name': 'pear'}, {'frequency': 'c', 'synset': 'peeler.n.03', 'synonyms': ['peeler_(tool_for_fruit_and_vegetables)'], 'id': 777, 'def': 'a device for peeling vegetables or fruits', 'name': 'peeler_(tool_for_fruit_and_vegetables)'}, {'frequency': 'r', 'synset': 'peg.n.04', 'synonyms': ['wooden_leg', 'pegleg'], 'id': 778, 'def': 'a prosthesis that replaces a missing leg', 'name': 'wooden_leg'}, {'frequency': 'r', 'synset': 'pegboard.n.01', 'synonyms': ['pegboard'], 'id': 779, 'def': 'a board perforated with regularly spaced holes into which pegs can be fitted', 'name': 'pegboard'}, {'frequency': 'c', 'synset': 'pelican.n.01', 'synonyms': ['pelican'], 'id': 780, 'def': 'large long-winged warm-water seabird having a large bill with a distensible pouch for fish', 'name': 'pelican'}, {'frequency': 'f', 'synset': 'pen.n.01', 'synonyms': ['pen'], 'id': 781, 'def': 'a writing implement with a point from which ink flows', 'name': 'pen'}, {'frequency': 'f', 'synset': 'pencil.n.01', 'synonyms': ['pencil'], 'id': 782, 'def': 'a thin cylindrical pointed writing implement made of wood and graphite', 'name': 'pencil'}, {'frequency': 'r', 'synset': 'pencil_box.n.01', 'synonyms': ['pencil_box', 'pencil_case'], 'id': 783, 'def': 'a box for holding pencils', 'name': 'pencil_box'}, {'frequency': 'r', 'synset': 'pencil_sharpener.n.01', 'synonyms': ['pencil_sharpener'], 'id': 784, 'def': 'a rotary implement for sharpening the point on pencils', 'name': 'pencil_sharpener'}, {'frequency': 'r', 'synset': 'pendulum.n.01', 'synonyms': ['pendulum'], 'id': 785, 'def': 'an apparatus consisting of an object mounted so that it swings freely under the influence of gravity', 'name': 'pendulum'}, {'frequency': 'c', 'synset': 'penguin.n.01', 'synonyms': ['penguin'], 'id': 786, 'def': 'short-legged flightless birds of cold southern regions having webbed feet and wings modified as flippers', 'name': 'penguin'}, {'frequency': 'r', 'synset': 'pennant.n.02', 'synonyms': ['pennant'], 'id': 787, 'def': 'a flag longer than it is wide (and often tapering)', 'name': 'pennant'}, {'frequency': 'r', 'synset': 'penny.n.02', 'synonyms': ['penny_(coin)'], 'id': 788, 'def': 'a coin worth one-hundredth of the value of the basic unit', 'name': 'penny_(coin)'}, {'frequency': 'f', 'synset': 'pepper.n.03', 'synonyms': ['pepper', 'peppercorn'], 'id': 789, 'def': 'pungent seasoning from the berry of the common pepper plant; whole or ground', 'name': 'pepper'}, {'frequency': 'c', 'synset': 'pepper_mill.n.01', 'synonyms': ['pepper_mill', 'pepper_grinder'], 'id': 790, 'def': 'a mill for grinding pepper', 'name': 'pepper_mill'}, {'frequency': 'c', 'synset': 'perfume.n.02', 'synonyms': ['perfume'], 'id': 791, 'def': 'a toiletry that emits and diffuses a fragrant odor', 'name': 'perfume'}, {'frequency': 'r', 'synset': 'persimmon.n.02', 'synonyms': ['persimmon'], 'id': 792, 'def': 'orange fruit resembling a plum; edible when fully ripe', 'name': 'persimmon'}, {'frequency': 'f', 'synset': 'person.n.01', 'synonyms': ['person', 'baby', 'child', 'boy', 'girl', 'man', 'woman', 'human'], 'id': 793, 'def': 'a human being', 'name': 'person'}, {'frequency': 'c', 'synset': 'pet.n.01', 'synonyms': ['pet'], 'id': 794, 'def': 'a domesticated animal kept for companionship or amusement', 'name': 'pet'}, {'frequency': 'c', 'synset': 'pew.n.01', 'synonyms': ['pew_(church_bench)', 'church_bench'], 'id': 795, 'def': 'long bench with backs; used in church by the congregation', 'name': 'pew_(church_bench)'}, {'frequency': 'r', 'synset': 'phonebook.n.01', 'synonyms': ['phonebook', 'telephone_book', 'telephone_directory'], 'id': 796, 'def': 'a directory containing an alphabetical list of telephone subscribers and their telephone numbers', 'name': 'phonebook'}, {'frequency': 'c', 'synset': 'phonograph_record.n.01', 'synonyms': ['phonograph_record', 'phonograph_recording', 'record_(phonograph_recording)'], 'id': 797, 'def': 'sound recording consisting of a typically black disk with a continuous groove', 'name': 'phonograph_record'}, {'frequency': 'f', 'synset': 'piano.n.01', 'synonyms': ['piano'], 'id': 798, 'def': 'a keyboard instrument that is played by depressing keys that cause hammers to strike tuned strings and produce sounds', 'name': 'piano'}, {'frequency': 'f', 'synset': 'pickle.n.01', 'synonyms': ['pickle'], 'id': 799, 'def': 'vegetables (especially cucumbers) preserved in brine or vinegar', 'name': 'pickle'}, {'frequency': 'f', 'synset': 'pickup.n.01', 'synonyms': ['pickup_truck'], 'id': 800, 'def': 'a light truck with an open body and low sides and a tailboard', 'name': 'pickup_truck'}, {'frequency': 'c', 'synset': 'pie.n.01', 'synonyms': ['pie'], 'id': 801, 'def': 'dish baked in pastry-lined pan often with a pastry top', 'name': 'pie'}, {'frequency': 'c', 'synset': 'pigeon.n.01', 'synonyms': ['pigeon'], 'id': 802, 'def': 'wild and domesticated birds having a heavy body and short legs', 'name': 'pigeon'}, {'frequency': 'r', 'synset': 'piggy_bank.n.01', 'synonyms': ['piggy_bank', 'penny_bank'], 'id': 803, 'def': "a child's coin bank (often shaped like a pig)", 'name': 'piggy_bank'}, {'frequency': 'f', 'synset': 'pillow.n.01', 'synonyms': ['pillow'], 'id': 804, 'def': 'a cushion to support the head of a sleeping person', 'name': 'pillow'}, {'frequency': 'r', 'synset': 'pin.n.09', 'synonyms': ['pin_(non_jewelry)'], 'id': 805, 'def': 'a small slender (often pointed) piece of wood or metal used to support or fasten or attach things', 'name': 'pin_(non_jewelry)'}, {'frequency': 'f', 'synset': 'pineapple.n.02', 'synonyms': ['pineapple'], 'id': 806, 'def': 'large sweet fleshy tropical fruit with a tuft of stiff leaves', 'name': 'pineapple'}, {'frequency': 'c', 'synset': 'pinecone.n.01', 'synonyms': ['pinecone'], 'id': 807, 'def': 'the seed-producing cone of a pine tree', 'name': 'pinecone'}, {'frequency': 'r', 'synset': 'ping-pong_ball.n.01', 'synonyms': ['ping-pong_ball'], 'id': 808, 'def': 'light hollow ball used in playing table tennis', 'name': 'ping-pong_ball'}, {'frequency': 'r', 'synset': 'pinwheel.n.03', 'synonyms': ['pinwheel'], 'id': 809, 'def': 'a toy consisting of vanes of colored paper or plastic that is pinned to a stick and spins when it is pointed into the wind', 'name': 'pinwheel'}, {'frequency': 'r', 'synset': 'pipe.n.01', 'synonyms': ['tobacco_pipe'], 'id': 810, 'def': 'a tube with a small bowl at one end; used for smoking tobacco', 'name': 'tobacco_pipe'}, {'frequency': 'f', 'synset': 'pipe.n.02', 'synonyms': ['pipe', 'piping'], 'id': 811, 'def': 'a long tube made of metal or plastic that is used to carry water or oil or gas etc.', 'name': 'pipe'}, {'frequency': 'r', 'synset': 'pistol.n.01', 'synonyms': ['pistol', 'handgun'], 'id': 812, 'def': 'a firearm that is held and fired with one hand', 'name': 'pistol'}, {'frequency': 'c', 'synset': 'pita.n.01', 'synonyms': ['pita_(bread)', 'pocket_bread'], 'id': 813, 'def': 'usually small round bread that can open into a pocket for filling', 'name': 'pita_(bread)'}, {'frequency': 'f', 'synset': 'pitcher.n.02', 'synonyms': ['pitcher_(vessel_for_liquid)', 'ewer'], 'id': 814, 'def': 'an open vessel with a handle and a spout for pouring', 'name': 'pitcher_(vessel_for_liquid)'}, {'frequency': 'r', 'synset': 'pitchfork.n.01', 'synonyms': ['pitchfork'], 'id': 815, 'def': 'a long-handled hand tool with sharp widely spaced prongs for lifting and pitching hay', 'name': 'pitchfork'}, {'frequency': 'f', 'synset': 'pizza.n.01', 'synonyms': ['pizza'], 'id': 816, 'def': 'Italian open pie made of thin bread dough spread with a spiced mixture of e.g. tomato sauce and cheese', 'name': 'pizza'}, {'frequency': 'f', 'synset': 'place_mat.n.01', 'synonyms': ['place_mat'], 'id': 817, 'def': 'a mat placed on a table for an individual place setting', 'name': 'place_mat'}, {'frequency': 'f', 'synset': 'plate.n.04', 'synonyms': ['plate'], 'id': 818, 'def': 'dish on which food is served or from which food is eaten', 'name': 'plate'}, {'frequency': 'c', 'synset': 'platter.n.01', 'synonyms': ['platter'], 'id': 819, 'def': 'a large shallow dish used for serving food', 'name': 'platter'}, {'frequency': 'r', 'synset': 'playpen.n.01', 'synonyms': ['playpen'], 'id': 820, 'def': 'a portable enclosure in which babies may be left to play', 'name': 'playpen'}, {'frequency': 'c', 'synset': 'pliers.n.01', 'synonyms': ['pliers', 'plyers'], 'id': 821, 'def': 'a gripping hand tool with two hinged arms and (usually) serrated jaws', 'name': 'pliers'}, {'frequency': 'r', 'synset': 'plow.n.01', 'synonyms': ['plow_(farm_equipment)', 'plough_(farm_equipment)'], 'id': 822, 'def': 'a farm tool having one or more heavy blades to break the soil and cut a furrow prior to sowing', 'name': 'plow_(farm_equipment)'}, {'frequency': 'r', 'synset': 'plume.n.02', 'synonyms': ['plume'], 'id': 823, 'def': 'a feather or cluster of feathers worn as an ornament', 'name': 'plume'}, {'frequency': 'r', 'synset': 'pocket_watch.n.01', 'synonyms': ['pocket_watch'], 'id': 824, 'def': 'a watch that is carried in a small watch pocket', 'name': 'pocket_watch'}, {'frequency': 'c', 'synset': 'pocketknife.n.01', 'synonyms': ['pocketknife'], 'id': 825, 'def': 'a knife with a blade that folds into the handle; suitable for carrying in the pocket', 'name': 'pocketknife'}, {'frequency': 'c', 'synset': 'poker.n.01', 'synonyms': ['poker_(fire_stirring_tool)', 'stove_poker', 'fire_hook'], 'id': 826, 'def': 'fire iron consisting of a metal rod with a handle; used to stir a fire', 'name': 'poker_(fire_stirring_tool)'}, {'frequency': 'f', 'synset': 'pole.n.01', 'synonyms': ['pole', 'post'], 'id': 827, 'def': 'a long (usually round) rod of wood or metal or plastic', 'name': 'pole'}, {'frequency': 'f', 'synset': 'polo_shirt.n.01', 'synonyms': ['polo_shirt', 'sport_shirt'], 'id': 828, 'def': 'a shirt with short sleeves designed for comfort and casual wear', 'name': 'polo_shirt'}, {'frequency': 'r', 'synset': 'poncho.n.01', 'synonyms': ['poncho'], 'id': 829, 'def': 'a blanket-like cloak with a hole in the center for the head', 'name': 'poncho'}, {'frequency': 'c', 'synset': 'pony.n.05', 'synonyms': ['pony'], 'id': 830, 'def': 'any of various breeds of small gentle horses usually less than five feet high at the shoulder', 'name': 'pony'}, {'frequency': 'r', 'synset': 'pool_table.n.01', 'synonyms': ['pool_table', 'billiard_table', 'snooker_table'], 'id': 831, 'def': 'game equipment consisting of a heavy table on which pool is played', 'name': 'pool_table'}, {'frequency': 'f', 'synset': 'pop.n.02', 'synonyms': ['pop_(soda)', 'soda_(pop)', 'tonic', 'soft_drink'], 'id': 832, 'def': 'a sweet drink containing carbonated water and flavoring', 'name': 'pop_(soda)'}, {'frequency': 'c', 'synset': 'postbox.n.01', 'synonyms': ['postbox_(public)', 'mailbox_(public)'], 'id': 833, 'def': 'public box for deposit of mail', 'name': 'postbox_(public)'}, {'frequency': 'c', 'synset': 'postcard.n.01', 'synonyms': ['postcard', 'postal_card', 'mailing-card'], 'id': 834, 'def': 'a card for sending messages by post without an envelope', 'name': 'postcard'}, {'frequency': 'f', 'synset': 'poster.n.01', 'synonyms': ['poster', 'placard'], 'id': 835, 'def': 'a sign posted in a public place as an advertisement', 'name': 'poster'}, {'frequency': 'f', 'synset': 'pot.n.01', 'synonyms': ['pot'], 'id': 836, 'def': 'metal or earthenware cooking vessel that is usually round and deep; often has a handle and lid', 'name': 'pot'}, {'frequency': 'f', 'synset': 'pot.n.04', 'synonyms': ['flowerpot'], 'id': 837, 'def': 'a container in which plants are cultivated', 'name': 'flowerpot'}, {'frequency': 'f', 'synset': 'potato.n.01', 'synonyms': ['potato'], 'id': 838, 'def': 'an edible tuber native to South America', 'name': 'potato'}, {'frequency': 'c', 'synset': 'potholder.n.01', 'synonyms': ['potholder'], 'id': 839, 'def': 'an insulated pad for holding hot pots', 'name': 'potholder'}, {'frequency': 'c', 'synset': 'pottery.n.01', 'synonyms': ['pottery', 'clayware'], 'id': 840, 'def': 'ceramic ware made from clay and baked in a kiln', 'name': 'pottery'}, {'frequency': 'c', 'synset': 'pouch.n.01', 'synonyms': ['pouch'], 'id': 841, 'def': 'a small or medium size container for holding or carrying things', 'name': 'pouch'}, {'frequency': 'c', 'synset': 'power_shovel.n.01', 'synonyms': ['power_shovel', 'excavator', 'digger'], 'id': 842, 'def': 'a machine for excavating', 'name': 'power_shovel'}, {'frequency': 'c', 'synset': 'prawn.n.01', 'synonyms': ['prawn', 'shrimp'], 'id': 843, 'def': 'any of various edible decapod crustaceans', 'name': 'prawn'}, {'frequency': 'c', 'synset': 'pretzel.n.01', 'synonyms': ['pretzel'], 'id': 844, 'def': 'glazed and salted cracker typically in the shape of a loose knot', 'name': 'pretzel'}, {'frequency': 'f', 'synset': 'printer.n.03', 'synonyms': ['printer', 'printing_machine'], 'id': 845, 'def': 'a machine that prints', 'name': 'printer'}, {'frequency': 'c', 'synset': 'projectile.n.01', 'synonyms': ['projectile_(weapon)', 'missile'], 'id': 846, 'def': 'a weapon that is forcibly thrown or projected at a targets', 'name': 'projectile_(weapon)'}, {'frequency': 'c', 'synset': 'projector.n.02', 'synonyms': ['projector'], 'id': 847, 'def': 'an optical instrument that projects an enlarged image onto a screen', 'name': 'projector'}, {'frequency': 'f', 'synset': 'propeller.n.01', 'synonyms': ['propeller', 'propellor'], 'id': 848, 'def': 'a mechanical device that rotates to push against air or water', 'name': 'propeller'}, {'frequency': 'r', 'synset': 'prune.n.01', 'synonyms': ['prune'], 'id': 849, 'def': 'dried plum', 'name': 'prune'}, {'frequency': 'r', 'synset': 'pudding.n.01', 'synonyms': ['pudding'], 'id': 850, 'def': 'any of various soft thick unsweetened baked dishes', 'name': 'pudding'}, {'frequency': 'r', 'synset': 'puffer.n.02', 'synonyms': ['puffer_(fish)', 'pufferfish', 'blowfish', 'globefish'], 'id': 851, 'def': 'fishes whose elongated spiny body can inflate itself with water or air to form a globe', 'name': 'puffer_(fish)'}, {'frequency': 'r', 'synset': 'puffin.n.01', 'synonyms': ['puffin'], 'id': 852, 'def': 'seabirds having short necks and brightly colored compressed bills', 'name': 'puffin'}, {'frequency': 'r', 'synset': 'pug.n.01', 'synonyms': ['pug-dog'], 'id': 853, 'def': 'small compact smooth-coated breed of Asiatic origin having a tightly curled tail and broad flat wrinkled muzzle', 'name': 'pug-dog'}, {'frequency': 'c', 'synset': 'pumpkin.n.02', 'synonyms': ['pumpkin'], 'id': 854, 'def': 'usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn', 'name': 'pumpkin'}, {'frequency': 'r', 'synset': 'punch.n.03', 'synonyms': ['puncher'], 'id': 855, 'def': 'a tool for making holes or indentations', 'name': 'puncher'}, {'frequency': 'r', 'synset': 'puppet.n.01', 'synonyms': ['puppet', 'marionette'], 'id': 856, 'def': 'a small figure of a person operated from above with strings by a puppeteer', 'name': 'puppet'}, {'frequency': 'c', 'synset': 'puppy.n.01', 'synonyms': ['puppy'], 'id': 857, 'def': 'a young dog', 'name': 'puppy'}, {'frequency': 'r', 'synset': 'quesadilla.n.01', 'synonyms': ['quesadilla'], 'id': 858, 'def': 'a tortilla that is filled with cheese and heated', 'name': 'quesadilla'}, {'frequency': 'r', 'synset': 'quiche.n.02', 'synonyms': ['quiche'], 'id': 859, 'def': 'a tart filled with rich unsweetened custard; often contains other ingredients (as cheese or ham or seafood or vegetables)', 'name': 'quiche'}, {'frequency': 'f', 'synset': 'quilt.n.01', 'synonyms': ['quilt', 'comforter'], 'id': 860, 'def': 'bedding made of two layers of cloth filled with stuffing and stitched together', 'name': 'quilt'}, {'frequency': 'c', 'synset': 'rabbit.n.01', 'synonyms': ['rabbit'], 'id': 861, 'def': 'any of various burrowing animals of the family Leporidae having long ears and short tails', 'name': 'rabbit'}, {'frequency': 'r', 'synset': 'racer.n.02', 'synonyms': ['race_car', 'racing_car'], 'id': 862, 'def': 'a fast car that competes in races', 'name': 'race_car'}, {'frequency': 'c', 'synset': 'racket.n.04', 'synonyms': ['racket', 'racquet'], 'id': 863, 'def': 'a sports implement used to strike a ball in various games', 'name': 'racket'}, {'frequency': 'r', 'synset': 'radar.n.01', 'synonyms': ['radar'], 'id': 864, 'def': 'measuring instrument in which the echo of a pulse of microwave radiation is used to detect and locate distant objects', 'name': 'radar'}, {'frequency': 'f', 'synset': 'radiator.n.03', 'synonyms': ['radiator'], 'id': 865, 'def': 'a mechanism consisting of a metal honeycomb through which hot fluids circulate', 'name': 'radiator'}, {'frequency': 'c', 'synset': 'radio_receiver.n.01', 'synonyms': ['radio_receiver', 'radio_set', 'radio', 'tuner_(radio)'], 'id': 866, 'def': 'an electronic receiver that detects and demodulates and amplifies transmitted radio signals', 'name': 'radio_receiver'}, {'frequency': 'c', 'synset': 'radish.n.03', 'synonyms': ['radish', 'daikon'], 'id': 867, 'def': 'pungent edible root of any of various cultivated radish plants', 'name': 'radish'}, {'frequency': 'c', 'synset': 'raft.n.01', 'synonyms': ['raft'], 'id': 868, 'def': 'a flat float (usually made of logs or planks) that can be used for transport or as a platform for swimmers', 'name': 'raft'}, {'frequency': 'r', 'synset': 'rag_doll.n.01', 'synonyms': ['rag_doll'], 'id': 869, 'def': 'a cloth doll that is stuffed and (usually) painted', 'name': 'rag_doll'}, {'frequency': 'c', 'synset': 'raincoat.n.01', 'synonyms': ['raincoat', 'waterproof_jacket'], 'id': 870, 'def': 'a water-resistant coat', 'name': 'raincoat'}, {'frequency': 'c', 'synset': 'ram.n.05', 'synonyms': ['ram_(animal)'], 'id': 871, 'def': 'uncastrated adult male sheep', 'name': 'ram_(animal)'}, {'frequency': 'c', 'synset': 'raspberry.n.02', 'synonyms': ['raspberry'], 'id': 872, 'def': 'red or black edible aggregate berries usually smaller than the related blackberries', 'name': 'raspberry'}, {'frequency': 'r', 'synset': 'rat.n.01', 'synonyms': ['rat'], 'id': 873, 'def': 'any of various long-tailed rodents similar to but larger than a mouse', 'name': 'rat'}, {'frequency': 'c', 'synset': 'razorblade.n.01', 'synonyms': ['razorblade'], 'id': 874, 'def': 'a blade that has very sharp edge', 'name': 'razorblade'}, {'frequency': 'c', 'synset': 'reamer.n.01', 'synonyms': ['reamer_(juicer)', 'juicer', 'juice_reamer'], 'id': 875, 'def': 'a squeezer with a conical ridged center that is used for squeezing juice from citrus fruit', 'name': 'reamer_(juicer)'}, {'frequency': 'f', 'synset': 'rearview_mirror.n.01', 'synonyms': ['rearview_mirror'], 'id': 876, 'def': 'vehicle mirror (side or rearview)', 'name': 'rearview_mirror'}, {'frequency': 'c', 'synset': 'receipt.n.02', 'synonyms': ['receipt'], 'id': 877, 'def': 'an acknowledgment (usually tangible) that payment has been made', 'name': 'receipt'}, {'frequency': 'c', 'synset': 'recliner.n.01', 'synonyms': ['recliner', 'reclining_chair', 'lounger_(chair)'], 'id': 878, 'def': 'an armchair whose back can be lowered and foot can be raised to allow the sitter to recline in it', 'name': 'recliner'}, {'frequency': 'c', 'synset': 'record_player.n.01', 'synonyms': ['record_player', 'phonograph_(record_player)', 'turntable'], 'id': 879, 'def': 'machine in which rotating records cause a stylus to vibrate and the vibrations are amplified acoustically or electronically', 'name': 'record_player'}, {'frequency': 'f', 'synset': 'reflector.n.01', 'synonyms': ['reflector'], 'id': 880, 'def': 'device that reflects light, radiation, etc.', 'name': 'reflector'}, {'frequency': 'f', 'synset': 'remote_control.n.01', 'synonyms': ['remote_control'], 'id': 881, 'def': 'a device that can be used to control a machine or apparatus from a distance', 'name': 'remote_control'}, {'frequency': 'c', 'synset': 'rhinoceros.n.01', 'synonyms': ['rhinoceros'], 'id': 882, 'def': 'massive powerful herbivorous odd-toed ungulate of southeast Asia and Africa having very thick skin and one or two horns on the snout', 'name': 'rhinoceros'}, {'frequency': 'r', 'synset': 'rib.n.03', 'synonyms': ['rib_(food)'], 'id': 883, 'def': 'cut of meat including one or more ribs', 'name': 'rib_(food)'}, {'frequency': 'c', 'synset': 'rifle.n.01', 'synonyms': ['rifle'], 'id': 884, 'def': 'a shoulder firearm with a long barrel', 'name': 'rifle'}, {'frequency': 'f', 'synset': 'ring.n.08', 'synonyms': ['ring'], 'id': 885, 'def': 'jewelry consisting of a circlet of precious metal (often set with jewels) worn on the finger', 'name': 'ring'}, {'frequency': 'r', 'synset': 'river_boat.n.01', 'synonyms': ['river_boat'], 'id': 886, 'def': 'a boat used on rivers or to ply a river', 'name': 'river_boat'}, {'frequency': 'r', 'synset': 'road_map.n.02', 'synonyms': ['road_map'], 'id': 887, 'def': '(NOT A ROAD) a MAP showing roads (for automobile travel)', 'name': 'road_map'}, {'frequency': 'c', 'synset': 'robe.n.01', 'synonyms': ['robe'], 'id': 888, 'def': 'any loose flowing garment', 'name': 'robe'}, {'frequency': 'c', 'synset': 'rocking_chair.n.01', 'synonyms': ['rocking_chair'], 'id': 889, 'def': 'a chair mounted on rockers', 'name': 'rocking_chair'}, {'frequency': 'r', 'synset': 'rodent.n.01', 'synonyms': ['rodent'], 'id': 890, 'def': 'relatively small placental mammals having a single pair of constantly growing incisor teeth specialized for gnawing', 'name': 'rodent'}, {'frequency': 'r', 'synset': 'roller_skate.n.01', 'synonyms': ['roller_skate'], 'id': 891, 'def': 'a shoe with pairs of rollers (small hard wheels) fixed to the sole', 'name': 'roller_skate'}, {'frequency': 'r', 'synset': 'rollerblade.n.01', 'synonyms': ['Rollerblade'], 'id': 892, 'def': 'an in-line variant of a roller skate', 'name': 'Rollerblade'}, {'frequency': 'c', 'synset': 'rolling_pin.n.01', 'synonyms': ['rolling_pin'], 'id': 893, 'def': 'utensil consisting of a cylinder (usually of wood) with a handle at each end; used to roll out dough', 'name': 'rolling_pin'}, {'frequency': 'r', 'synset': 'root_beer.n.01', 'synonyms': ['root_beer'], 'id': 894, 'def': 'carbonated drink containing extracts of roots and herbs', 'name': 'root_beer'}, {'frequency': 'c', 'synset': 'router.n.02', 'synonyms': ['router_(computer_equipment)'], 'id': 895, 'def': 'a device that forwards data packets between computer networks', 'name': 'router_(computer_equipment)'}, {'frequency': 'f', 'synset': 'rubber_band.n.01', 'synonyms': ['rubber_band', 'elastic_band'], 'id': 896, 'def': 'a narrow band of elastic rubber used to hold things (such as papers) together', 'name': 'rubber_band'}, {'frequency': 'c', 'synset': 'runner.n.08', 'synonyms': ['runner_(carpet)'], 'id': 897, 'def': 'a long narrow carpet', 'name': 'runner_(carpet)'}, {'frequency': 'f', 'synset': 'sack.n.01', 'synonyms': ['plastic_bag', 'paper_bag'], 'id': 898, 'def': "a bag made of paper or plastic for holding customer's purchases", 'name': 'plastic_bag'}, {'frequency': 'f', 'synset': 'saddle.n.01', 'synonyms': ['saddle_(on_an_animal)'], 'id': 899, 'def': 'a seat for the rider of a horse or camel', 'name': 'saddle_(on_an_animal)'}, {'frequency': 'f', 'synset': 'saddle_blanket.n.01', 'synonyms': ['saddle_blanket', 'saddlecloth', 'horse_blanket'], 'id': 900, 'def': 'stable gear consisting of a blanket placed under the saddle', 'name': 'saddle_blanket'}, {'frequency': 'c', 'synset': 'saddlebag.n.01', 'synonyms': ['saddlebag'], 'id': 901, 'def': 'a large bag (or pair of bags) hung over a saddle', 'name': 'saddlebag'}, {'frequency': 'r', 'synset': 'safety_pin.n.01', 'synonyms': ['safety_pin'], 'id': 902, 'def': 'a pin in the form of a clasp; has a guard so the point of the pin will not stick the user', 'name': 'safety_pin'}, {'frequency': 'f', 'synset': 'sail.n.01', 'synonyms': ['sail'], 'id': 903, 'def': 'a large piece of fabric by means of which wind is used to propel a sailing vessel', 'name': 'sail'}, {'frequency': 'f', 'synset': 'salad.n.01', 'synonyms': ['salad'], 'id': 904, 'def': 'food mixtures either arranged on a plate or tossed and served with a moist dressing; usually consisting of or including greens', 'name': 'salad'}, {'frequency': 'r', 'synset': 'salad_plate.n.01', 'synonyms': ['salad_plate', 'salad_bowl'], 'id': 905, 'def': 'a plate or bowl for individual servings of salad', 'name': 'salad_plate'}, {'frequency': 'c', 'synset': 'salami.n.01', 'synonyms': ['salami'], 'id': 906, 'def': 'highly seasoned fatty sausage of pork and beef usually dried', 'name': 'salami'}, {'frequency': 'c', 'synset': 'salmon.n.01', 'synonyms': ['salmon_(fish)'], 'id': 907, 'def': 'any of various large food and game fishes of northern waters', 'name': 'salmon_(fish)'}, {'frequency': 'r', 'synset': 'salmon.n.03', 'synonyms': ['salmon_(food)'], 'id': 908, 'def': 'flesh of any of various marine or freshwater fish of the family Salmonidae', 'name': 'salmon_(food)'}, {'frequency': 'c', 'synset': 'salsa.n.01', 'synonyms': ['salsa'], 'id': 909, 'def': 'spicy sauce of tomatoes and onions and chili peppers to accompany Mexican foods', 'name': 'salsa'}, {'frequency': 'f', 'synset': 'saltshaker.n.01', 'synonyms': ['saltshaker'], 'id': 910, 'def': 'a shaker with a perforated top for sprinkling salt', 'name': 'saltshaker'}, {'frequency': 'f', 'synset': 'sandal.n.01', 'synonyms': ['sandal_(type_of_shoe)'], 'id': 911, 'def': 'a shoe consisting of a sole fastened by straps to the foot', 'name': 'sandal_(type_of_shoe)'}, {'frequency': 'f', 'synset': 'sandwich.n.01', 'synonyms': ['sandwich'], 'id': 912, 'def': 'two (or more) slices of bread with a filling between them', 'name': 'sandwich'}, {'frequency': 'r', 'synset': 'satchel.n.01', 'synonyms': ['satchel'], 'id': 913, 'def': 'luggage consisting of a small case with a flat bottom and (usually) a shoulder strap', 'name': 'satchel'}, {'frequency': 'r', 'synset': 'saucepan.n.01', 'synonyms': ['saucepan'], 'id': 914, 'def': 'a deep pan with a handle; used for stewing or boiling', 'name': 'saucepan'}, {'frequency': 'f', 'synset': 'saucer.n.02', 'synonyms': ['saucer'], 'id': 915, 'def': 'a small shallow dish for holding a cup at the table', 'name': 'saucer'}, {'frequency': 'f', 'synset': 'sausage.n.01', 'synonyms': ['sausage'], 'id': 916, 'def': 'highly seasoned minced meat stuffed in casings', 'name': 'sausage'}, {'frequency': 'r', 'synset': 'sawhorse.n.01', 'synonyms': ['sawhorse', 'sawbuck'], 'id': 917, 'def': 'a framework for holding wood that is being sawed', 'name': 'sawhorse'}, {'frequency': 'r', 'synset': 'sax.n.02', 'synonyms': ['saxophone'], 'id': 918, 'def': "a wind instrument with a `J'-shaped form typically made of brass", 'name': 'saxophone'}, {'frequency': 'f', 'synset': 'scale.n.07', 'synonyms': ['scale_(measuring_instrument)'], 'id': 919, 'def': 'a measuring instrument for weighing; shows amount of mass', 'name': 'scale_(measuring_instrument)'}, {'frequency': 'r', 'synset': 'scarecrow.n.01', 'synonyms': ['scarecrow', 'strawman'], 'id': 920, 'def': 'an effigy in the shape of a man to frighten birds away from seeds', 'name': 'scarecrow'}, {'frequency': 'f', 'synset': 'scarf.n.01', 'synonyms': ['scarf'], 'id': 921, 'def': 'a garment worn around the head or neck or shoulders for warmth or decoration', 'name': 'scarf'}, {'frequency': 'c', 'synset': 'school_bus.n.01', 'synonyms': ['school_bus'], 'id': 922, 'def': 'a bus used to transport children to or from school', 'name': 'school_bus'}, {'frequency': 'f', 'synset': 'scissors.n.01', 'synonyms': ['scissors'], 'id': 923, 'def': 'a tool having two crossed pivoting blades with looped handles', 'name': 'scissors'}, {'frequency': 'f', 'synset': 'scoreboard.n.01', 'synonyms': ['scoreboard'], 'id': 924, 'def': 'a large board for displaying the score of a contest (and some other information)', 'name': 'scoreboard'}, {'frequency': 'r', 'synset': 'scraper.n.01', 'synonyms': ['scraper'], 'id': 925, 'def': 'any of various hand tools for scraping', 'name': 'scraper'}, {'frequency': 'c', 'synset': 'screwdriver.n.01', 'synonyms': ['screwdriver'], 'id': 926, 'def': 'a hand tool for driving screws; has a tip that fits into the head of a screw', 'name': 'screwdriver'}, {'frequency': 'f', 'synset': 'scrub_brush.n.01', 'synonyms': ['scrubbing_brush'], 'id': 927, 'def': 'a brush with short stiff bristles for heavy cleaning', 'name': 'scrubbing_brush'}, {'frequency': 'c', 'synset': 'sculpture.n.01', 'synonyms': ['sculpture'], 'id': 928, 'def': 'a three-dimensional work of art', 'name': 'sculpture'}, {'frequency': 'c', 'synset': 'seabird.n.01', 'synonyms': ['seabird', 'seafowl'], 'id': 929, 'def': 'a bird that frequents coastal waters and the open ocean: gulls; pelicans; gannets; cormorants; albatrosses; petrels; etc.', 'name': 'seabird'}, {'frequency': 'c', 'synset': 'seahorse.n.02', 'synonyms': ['seahorse'], 'id': 930, 'def': 'small fish with horse-like heads bent sharply downward and curled tails', 'name': 'seahorse'}, {'frequency': 'r', 'synset': 'seaplane.n.01', 'synonyms': ['seaplane', 'hydroplane'], 'id': 931, 'def': 'an airplane that can land on or take off from water', 'name': 'seaplane'}, {'frequency': 'c', 'synset': 'seashell.n.01', 'synonyms': ['seashell'], 'id': 932, 'def': 'the shell of a marine organism', 'name': 'seashell'}, {'frequency': 'c', 'synset': 'sewing_machine.n.01', 'synonyms': ['sewing_machine'], 'id': 933, 'def': 'a textile machine used as a home appliance for sewing', 'name': 'sewing_machine'}, {'frequency': 'c', 'synset': 'shaker.n.03', 'synonyms': ['shaker'], 'id': 934, 'def': 'a container in which something can be shaken', 'name': 'shaker'}, {'frequency': 'c', 'synset': 'shampoo.n.01', 'synonyms': ['shampoo'], 'id': 935, 'def': 'cleansing agent consisting of soaps or detergents used for washing the hair', 'name': 'shampoo'}, {'frequency': 'c', 'synset': 'shark.n.01', 'synonyms': ['shark'], 'id': 936, 'def': 'typically large carnivorous fishes with sharpe teeth', 'name': 'shark'}, {'frequency': 'r', 'synset': 'sharpener.n.01', 'synonyms': ['sharpener'], 'id': 937, 'def': 'any implement that is used to make something (an edge or a point) sharper', 'name': 'sharpener'}, {'frequency': 'r', 'synset': 'sharpie.n.03', 'synonyms': ['Sharpie'], 'id': 938, 'def': 'a pen with indelible ink that will write on any surface', 'name': 'Sharpie'}, {'frequency': 'r', 'synset': 'shaver.n.03', 'synonyms': ['shaver_(electric)', 'electric_shaver', 'electric_razor'], 'id': 939, 'def': 'a razor powered by an electric motor', 'name': 'shaver_(electric)'}, {'frequency': 'c', 'synset': 'shaving_cream.n.01', 'synonyms': ['shaving_cream', 'shaving_soap'], 'id': 940, 'def': 'toiletry consisting that forms a rich lather for softening the beard before shaving', 'name': 'shaving_cream'}, {'frequency': 'r', 'synset': 'shawl.n.01', 'synonyms': ['shawl'], 'id': 941, 'def': 'cloak consisting of an oblong piece of cloth used to cover the head and shoulders', 'name': 'shawl'}, {'frequency': 'r', 'synset': 'shears.n.01', 'synonyms': ['shears'], 'id': 942, 'def': 'large scissors with strong blades', 'name': 'shears'}, {'frequency': 'f', 'synset': 'sheep.n.01', 'synonyms': ['sheep'], 'id': 943, 'def': 'woolly usually horned ruminant mammal related to the goat', 'name': 'sheep'}, {'frequency': 'r', 'synset': 'shepherd_dog.n.01', 'synonyms': ['shepherd_dog', 'sheepdog'], 'id': 944, 'def': 'any of various usually long-haired breeds of dog reared to herd and guard sheep', 'name': 'shepherd_dog'}, {'frequency': 'r', 'synset': 'sherbert.n.01', 'synonyms': ['sherbert', 'sherbet'], 'id': 945, 'def': 'a frozen dessert made primarily of fruit juice and sugar', 'name': 'sherbert'}, {'frequency': 'c', 'synset': 'shield.n.02', 'synonyms': ['shield'], 'id': 946, 'def': 'armor carried on the arm to intercept blows', 'name': 'shield'}, {'frequency': 'f', 'synset': 'shirt.n.01', 'synonyms': ['shirt'], 'id': 947, 'def': 'a garment worn on the upper half of the body', 'name': 'shirt'}, {'frequency': 'f', 'synset': 'shoe.n.01', 'synonyms': ['shoe', 'sneaker_(type_of_shoe)', 'tennis_shoe'], 'id': 948, 'def': 'common footwear covering the foot', 'name': 'shoe'}, {'frequency': 'f', 'synset': 'shopping_bag.n.01', 'synonyms': ['shopping_bag'], 'id': 949, 'def': 'a bag made of plastic or strong paper (often with handles); used to transport goods after shopping', 'name': 'shopping_bag'}, {'frequency': 'c', 'synset': 'shopping_cart.n.01', 'synonyms': ['shopping_cart'], 'id': 950, 'def': 'a handcart that holds groceries or other goods while shopping', 'name': 'shopping_cart'}, {'frequency': 'f', 'synset': 'short_pants.n.01', 'synonyms': ['short_pants', 'shorts_(clothing)', 'trunks_(clothing)'], 'id': 951, 'def': 'trousers that end at or above the knee', 'name': 'short_pants'}, {'frequency': 'r', 'synset': 'shot_glass.n.01', 'synonyms': ['shot_glass'], 'id': 952, 'def': 'a small glass adequate to hold a single swallow of whiskey', 'name': 'shot_glass'}, {'frequency': 'f', 'synset': 'shoulder_bag.n.01', 'synonyms': ['shoulder_bag'], 'id': 953, 'def': 'a large handbag that can be carried by a strap looped over the shoulder', 'name': 'shoulder_bag'}, {'frequency': 'c', 'synset': 'shovel.n.01', 'synonyms': ['shovel'], 'id': 954, 'def': 'a hand tool for lifting loose material such as snow, dirt, etc.', 'name': 'shovel'}, {'frequency': 'f', 'synset': 'shower.n.01', 'synonyms': ['shower_head'], 'id': 955, 'def': 'a plumbing fixture that sprays water over you', 'name': 'shower_head'}, {'frequency': 'r', 'synset': 'shower_cap.n.01', 'synonyms': ['shower_cap'], 'id': 956, 'def': 'a tight cap worn to keep hair dry while showering', 'name': 'shower_cap'}, {'frequency': 'f', 'synset': 'shower_curtain.n.01', 'synonyms': ['shower_curtain'], 'id': 957, 'def': 'a curtain that keeps water from splashing out of the shower area', 'name': 'shower_curtain'}, {'frequency': 'r', 'synset': 'shredder.n.01', 'synonyms': ['shredder_(for_paper)'], 'id': 958, 'def': 'a device that shreds documents', 'name': 'shredder_(for_paper)'}, {'frequency': 'f', 'synset': 'signboard.n.01', 'synonyms': ['signboard'], 'id': 959, 'def': 'structure displaying a board on which advertisements can be posted', 'name': 'signboard'}, {'frequency': 'c', 'synset': 'silo.n.01', 'synonyms': ['silo'], 'id': 960, 'def': 'a cylindrical tower used for storing goods', 'name': 'silo'}, {'frequency': 'f', 'synset': 'sink.n.01', 'synonyms': ['sink'], 'id': 961, 'def': 'plumbing fixture consisting of a water basin fixed to a wall or floor and having a drainpipe', 'name': 'sink'}, {'frequency': 'f', 'synset': 'skateboard.n.01', 'synonyms': ['skateboard'], 'id': 962, 'def': 'a board with wheels that is ridden in a standing or crouching position and propelled by foot', 'name': 'skateboard'}, {'frequency': 'c', 'synset': 'skewer.n.01', 'synonyms': ['skewer'], 'id': 963, 'def': 'a long pin for holding meat in position while it is being roasted', 'name': 'skewer'}, {'frequency': 'f', 'synset': 'ski.n.01', 'synonyms': ['ski'], 'id': 964, 'def': 'sports equipment for skiing on snow', 'name': 'ski'}, {'frequency': 'f', 'synset': 'ski_boot.n.01', 'synonyms': ['ski_boot'], 'id': 965, 'def': 'a stiff boot that is fastened to a ski with a ski binding', 'name': 'ski_boot'}, {'frequency': 'f', 'synset': 'ski_parka.n.01', 'synonyms': ['ski_parka', 'ski_jacket'], 'id': 966, 'def': 'a parka to be worn while skiing', 'name': 'ski_parka'}, {'frequency': 'f', 'synset': 'ski_pole.n.01', 'synonyms': ['ski_pole'], 'id': 967, 'def': 'a pole with metal points used as an aid in skiing', 'name': 'ski_pole'}, {'frequency': 'f', 'synset': 'skirt.n.02', 'synonyms': ['skirt'], 'id': 968, 'def': 'a garment hanging from the waist; worn mainly by girls and women', 'name': 'skirt'}, {'frequency': 'r', 'synset': 'skullcap.n.01', 'synonyms': ['skullcap'], 'id': 969, 'def': 'rounded brimless cap fitting the crown of the head', 'name': 'skullcap'}, {'frequency': 'c', 'synset': 'sled.n.01', 'synonyms': ['sled', 'sledge', 'sleigh'], 'id': 970, 'def': 'a vehicle or flat object for transportation over snow by sliding or pulled by dogs, etc.', 'name': 'sled'}, {'frequency': 'c', 'synset': 'sleeping_bag.n.01', 'synonyms': ['sleeping_bag'], 'id': 971, 'def': 'large padded bag designed to be slept in outdoors', 'name': 'sleeping_bag'}, {'frequency': 'r', 'synset': 'sling.n.05', 'synonyms': ['sling_(bandage)', 'triangular_bandage'], 'id': 972, 'def': 'bandage to support an injured forearm; slung over the shoulder or neck', 'name': 'sling_(bandage)'}, {'frequency': 'c', 'synset': 'slipper.n.01', 'synonyms': ['slipper_(footwear)', 'carpet_slipper_(footwear)'], 'id': 973, 'def': 'low footwear that can be slipped on and off easily; usually worn indoors', 'name': 'slipper_(footwear)'}, {'frequency': 'r', 'synset': 'smoothie.n.02', 'synonyms': ['smoothie'], 'id': 974, 'def': 'a thick smooth drink consisting of fresh fruit pureed with ice cream or yoghurt or milk', 'name': 'smoothie'}, {'frequency': 'r', 'synset': 'snake.n.01', 'synonyms': ['snake', 'serpent'], 'id': 975, 'def': 'limbless scaly elongate reptile; some are venomous', 'name': 'snake'}, {'frequency': 'f', 'synset': 'snowboard.n.01', 'synonyms': ['snowboard'], 'id': 976, 'def': 'a board that resembles a broad ski or a small surfboard; used in a standing position to slide down snow-covered slopes', 'name': 'snowboard'}, {'frequency': 'c', 'synset': 'snowman.n.01', 'synonyms': ['snowman'], 'id': 977, 'def': 'a figure of a person made of packed snow', 'name': 'snowman'}, {'frequency': 'c', 'synset': 'snowmobile.n.01', 'synonyms': ['snowmobile'], 'id': 978, 'def': 'tracked vehicle for travel on snow having skis in front', 'name': 'snowmobile'}, {'frequency': 'f', 'synset': 'soap.n.01', 'synonyms': ['soap'], 'id': 979, 'def': 'a cleansing agent made from the salts of vegetable or animal fats', 'name': 'soap'}, {'frequency': 'f', 'synset': 'soccer_ball.n.01', 'synonyms': ['soccer_ball'], 'id': 980, 'def': "an inflated ball used in playing soccer (called `football' outside of the United States)", 'name': 'soccer_ball'}, {'frequency': 'f', 'synset': 'sock.n.01', 'synonyms': ['sock'], 'id': 981, 'def': 'cloth covering for the foot; worn inside the shoe; reaches to between the ankle and the knee', 'name': 'sock'}, {'frequency': 'f', 'synset': 'sofa.n.01', 'synonyms': ['sofa', 'couch', 'lounge'], 'id': 982, 'def': 'an upholstered seat for more than one person', 'name': 'sofa'}, {'frequency': 'r', 'synset': 'softball.n.01', 'synonyms': ['softball'], 'id': 983, 'def': 'ball used in playing softball', 'name': 'softball'}, {'frequency': 'c', 'synset': 'solar_array.n.01', 'synonyms': ['solar_array', 'solar_battery', 'solar_panel'], 'id': 984, 'def': 'electrical device consisting of a large array of connected solar cells', 'name': 'solar_array'}, {'frequency': 'r', 'synset': 'sombrero.n.02', 'synonyms': ['sombrero'], 'id': 985, 'def': 'a straw hat with a tall crown and broad brim; worn in American southwest and in Mexico', 'name': 'sombrero'}, {'frequency': 'f', 'synset': 'soup.n.01', 'synonyms': ['soup'], 'id': 986, 'def': 'liquid food especially of meat or fish or vegetable stock often containing pieces of solid food', 'name': 'soup'}, {'frequency': 'r', 'synset': 'soup_bowl.n.01', 'synonyms': ['soup_bowl'], 'id': 987, 'def': 'a bowl for serving soup', 'name': 'soup_bowl'}, {'frequency': 'c', 'synset': 'soupspoon.n.01', 'synonyms': ['soupspoon'], 'id': 988, 'def': 'a spoon with a rounded bowl for eating soup', 'name': 'soupspoon'}, {'frequency': 'c', 'synset': 'sour_cream.n.01', 'synonyms': ['sour_cream', 'soured_cream'], 'id': 989, 'def': 'soured light cream', 'name': 'sour_cream'}, {'frequency': 'r', 'synset': 'soya_milk.n.01', 'synonyms': ['soya_milk', 'soybean_milk', 'soymilk'], 'id': 990, 'def': 'a milk substitute containing soybean flour and water; used in some infant formulas and in making tofu', 'name': 'soya_milk'}, {'frequency': 'r', 'synset': 'space_shuttle.n.01', 'synonyms': ['space_shuttle'], 'id': 991, 'def': "a reusable spacecraft with wings for a controlled descent through the Earth's atmosphere", 'name': 'space_shuttle'}, {'frequency': 'r', 'synset': 'sparkler.n.02', 'synonyms': ['sparkler_(fireworks)'], 'id': 992, 'def': 'a firework that burns slowly and throws out a shower of sparks', 'name': 'sparkler_(fireworks)'}, {'frequency': 'f', 'synset': 'spatula.n.02', 'synonyms': ['spatula'], 'id': 993, 'def': 'a hand tool with a thin flexible blade used to mix or spread soft substances', 'name': 'spatula'}, {'frequency': 'r', 'synset': 'spear.n.01', 'synonyms': ['spear', 'lance'], 'id': 994, 'def': 'a long pointed rod used as a tool or weapon', 'name': 'spear'}, {'frequency': 'f', 'synset': 'spectacles.n.01', 'synonyms': ['spectacles', 'specs', 'eyeglasses', 'glasses'], 'id': 995, 'def': 'optical instrument consisting of a frame that holds a pair of lenses for correcting defective vision', 'name': 'spectacles'}, {'frequency': 'c', 'synset': 'spice_rack.n.01', 'synonyms': ['spice_rack'], 'id': 996, 'def': 'a rack for displaying containers filled with spices', 'name': 'spice_rack'}, {'frequency': 'c', 'synset': 'spider.n.01', 'synonyms': ['spider'], 'id': 997, 'def': 'predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'name': 'spider'}, {'frequency': 'r', 'synset': 'spiny_lobster.n.02', 'synonyms': ['crawfish', 'crayfish'], 'id': 998, 'def': 'large edible marine crustacean having a spiny carapace but lacking the large pincers of true lobsters', 'name': 'crawfish'}, {'frequency': 'c', 'synset': 'sponge.n.01', 'synonyms': ['sponge'], 'id': 999, 'def': 'a porous mass usable to absorb water typically used for cleaning', 'name': 'sponge'}, {'frequency': 'f', 'synset': 'spoon.n.01', 'synonyms': ['spoon'], 'id': 1000, 'def': 'a piece of cutlery with a shallow bowl-shaped container and a handle', 'name': 'spoon'}, {'frequency': 'c', 'synset': 'sportswear.n.01', 'synonyms': ['sportswear', 'athletic_wear', 'activewear'], 'id': 1001, 'def': 'attire worn for sport or for casual wear', 'name': 'sportswear'}, {'frequency': 'c', 'synset': 'spotlight.n.02', 'synonyms': ['spotlight'], 'id': 1002, 'def': 'a lamp that produces a strong beam of light to illuminate a restricted area; used to focus attention of a stage performer', 'name': 'spotlight'}, {'frequency': 'r', 'synset': 'squid.n.01', 'synonyms': ['squid_(food)', 'calamari', 'calamary'], 'id': 1003, 'def': '(Italian cuisine) squid prepared as food', 'name': 'squid_(food)'}, {'frequency': 'c', 'synset': 'squirrel.n.01', 'synonyms': ['squirrel'], 'id': 1004, 'def': 'a kind of arboreal rodent having a long bushy tail', 'name': 'squirrel'}, {'frequency': 'r', 'synset': 'stagecoach.n.01', 'synonyms': ['stagecoach'], 'id': 1005, 'def': 'a large coach-and-four formerly used to carry passengers and mail on regular routes between towns', 'name': 'stagecoach'}, {'frequency': 'c', 'synset': 'stapler.n.01', 'synonyms': ['stapler_(stapling_machine)'], 'id': 1006, 'def': 'a machine that inserts staples into sheets of paper in order to fasten them together', 'name': 'stapler_(stapling_machine)'}, {'frequency': 'c', 'synset': 'starfish.n.01', 'synonyms': ['starfish', 'sea_star'], 'id': 1007, 'def': 'echinoderms characterized by five arms extending from a central disk', 'name': 'starfish'}, {'frequency': 'f', 'synset': 'statue.n.01', 'synonyms': ['statue_(sculpture)'], 'id': 1008, 'def': 'a sculpture representing a human or animal', 'name': 'statue_(sculpture)'}, {'frequency': 'c', 'synset': 'steak.n.01', 'synonyms': ['steak_(food)'], 'id': 1009, 'def': 'a slice of meat cut from the fleshy part of an animal or large fish', 'name': 'steak_(food)'}, {'frequency': 'r', 'synset': 'steak_knife.n.01', 'synonyms': ['steak_knife'], 'id': 1010, 'def': 'a sharp table knife used in eating steak', 'name': 'steak_knife'}, {'frequency': 'f', 'synset': 'steering_wheel.n.01', 'synonyms': ['steering_wheel'], 'id': 1011, 'def': 'a handwheel that is used for steering', 'name': 'steering_wheel'}, {'frequency': 'r', 'synset': 'step_ladder.n.01', 'synonyms': ['stepladder'], 'id': 1012, 'def': 'a folding portable ladder hinged at the top', 'name': 'stepladder'}, {'frequency': 'c', 'synset': 'step_stool.n.01', 'synonyms': ['step_stool'], 'id': 1013, 'def': 'a stool that has one or two steps that fold under the seat', 'name': 'step_stool'}, {'frequency': 'c', 'synset': 'stereo.n.01', 'synonyms': ['stereo_(sound_system)'], 'id': 1014, 'def': 'electronic device for playing audio', 'name': 'stereo_(sound_system)'}, {'frequency': 'r', 'synset': 'stew.n.02', 'synonyms': ['stew'], 'id': 1015, 'def': 'food prepared by stewing especially meat or fish with vegetables', 'name': 'stew'}, {'frequency': 'r', 'synset': 'stirrer.n.02', 'synonyms': ['stirrer'], 'id': 1016, 'def': 'an implement used for stirring', 'name': 'stirrer'}, {'frequency': 'f', 'synset': 'stirrup.n.01', 'synonyms': ['stirrup'], 'id': 1017, 'def': "support consisting of metal loops into which rider's feet go", 'name': 'stirrup'}, {'frequency': 'f', 'synset': 'stool.n.01', 'synonyms': ['stool'], 'id': 1018, 'def': 'a simple seat without a back or arms', 'name': 'stool'}, {'frequency': 'f', 'synset': 'stop_sign.n.01', 'synonyms': ['stop_sign'], 'id': 1019, 'def': 'a traffic sign to notify drivers that they must come to a complete stop', 'name': 'stop_sign'}, {'frequency': 'f', 'synset': 'stoplight.n.01', 'synonyms': ['brake_light'], 'id': 1020, 'def': 'a red light on the rear of a motor vehicle that signals when the brakes are applied', 'name': 'brake_light'}, {'frequency': 'f', 'synset': 'stove.n.01', 'synonyms': ['stove', 'kitchen_stove', 'range_(kitchen_appliance)', 'kitchen_range', 'cooking_stove'], 'id': 1021, 'def': 'a kitchen appliance used for cooking food', 'name': 'stove'}, {'frequency': 'c', 'synset': 'strainer.n.01', 'synonyms': ['strainer'], 'id': 1022, 'def': 'a filter to retain larger pieces while smaller pieces and liquids pass through', 'name': 'strainer'}, {'frequency': 'f', 'synset': 'strap.n.01', 'synonyms': ['strap'], 'id': 1023, 'def': 'an elongated strip of material for binding things together or holding', 'name': 'strap'}, {'frequency': 'f', 'synset': 'straw.n.04', 'synonyms': ['straw_(for_drinking)', 'drinking_straw'], 'id': 1024, 'def': 'a thin paper or plastic tube used to suck liquids into the mouth', 'name': 'straw_(for_drinking)'}, {'frequency': 'f', 'synset': 'strawberry.n.01', 'synonyms': ['strawberry'], 'id': 1025, 'def': 'sweet fleshy red fruit', 'name': 'strawberry'}, {'frequency': 'f', 'synset': 'street_sign.n.01', 'synonyms': ['street_sign'], 'id': 1026, 'def': 'a sign visible from the street', 'name': 'street_sign'}, {'frequency': 'f', 'synset': 'streetlight.n.01', 'synonyms': ['streetlight', 'street_lamp'], 'id': 1027, 'def': 'a lamp supported on a lamppost; for illuminating a street', 'name': 'streetlight'}, {'frequency': 'r', 'synset': 'string_cheese.n.01', 'synonyms': ['string_cheese'], 'id': 1028, 'def': 'cheese formed in long strings twisted together', 'name': 'string_cheese'}, {'frequency': 'r', 'synset': 'stylus.n.02', 'synonyms': ['stylus'], 'id': 1029, 'def': 'a pointed tool for writing or drawing or engraving, including pens', 'name': 'stylus'}, {'frequency': 'r', 'synset': 'subwoofer.n.01', 'synonyms': ['subwoofer'], 'id': 1030, 'def': 'a loudspeaker that is designed to reproduce very low bass frequencies', 'name': 'subwoofer'}, {'frequency': 'r', 'synset': 'sugar_bowl.n.01', 'synonyms': ['sugar_bowl'], 'id': 1031, 'def': 'a dish in which sugar is served', 'name': 'sugar_bowl'}, {'frequency': 'r', 'synset': 'sugarcane.n.01', 'synonyms': ['sugarcane_(plant)'], 'id': 1032, 'def': 'juicy canes whose sap is a source of molasses and commercial sugar; fresh canes are sometimes chewed for the juice', 'name': 'sugarcane_(plant)'}, {'frequency': 'f', 'synset': 'suit.n.01', 'synonyms': ['suit_(clothing)'], 'id': 1033, 'def': 'a set of garments (usually including a jacket and trousers or skirt) for outerwear all of the same fabric and color', 'name': 'suit_(clothing)'}, {'frequency': 'c', 'synset': 'sunflower.n.01', 'synonyms': ['sunflower'], 'id': 1034, 'def': 'any plant of the genus Helianthus having large flower heads with dark disk florets and showy yellow rays', 'name': 'sunflower'}, {'frequency': 'f', 'synset': 'sunglasses.n.01', 'synonyms': ['sunglasses'], 'id': 1035, 'def': 'spectacles that are darkened or polarized to protect the eyes from the glare of the sun', 'name': 'sunglasses'}, {'frequency': 'c', 'synset': 'sunhat.n.01', 'synonyms': ['sunhat'], 'id': 1036, 'def': 'a hat with a broad brim that protects the face from direct exposure to the sun', 'name': 'sunhat'}, {'frequency': 'f', 'synset': 'surfboard.n.01', 'synonyms': ['surfboard'], 'id': 1037, 'def': 'a narrow buoyant board for riding surf', 'name': 'surfboard'}, {'frequency': 'c', 'synset': 'sushi.n.01', 'synonyms': ['sushi'], 'id': 1038, 'def': 'rice (with raw fish) wrapped in seaweed', 'name': 'sushi'}, {'frequency': 'c', 'synset': 'swab.n.02', 'synonyms': ['mop'], 'id': 1039, 'def': 'cleaning implement consisting of absorbent material fastened to a handle; for cleaning floors', 'name': 'mop'}, {'frequency': 'c', 'synset': 'sweat_pants.n.01', 'synonyms': ['sweat_pants'], 'id': 1040, 'def': 'loose-fitting trousers with elastic cuffs; worn by athletes', 'name': 'sweat_pants'}, {'frequency': 'c', 'synset': 'sweatband.n.02', 'synonyms': ['sweatband'], 'id': 1041, 'def': 'a band of material tied around the forehead or wrist to absorb sweat', 'name': 'sweatband'}, {'frequency': 'f', 'synset': 'sweater.n.01', 'synonyms': ['sweater'], 'id': 1042, 'def': 'a crocheted or knitted garment covering the upper part of the body', 'name': 'sweater'}, {'frequency': 'f', 'synset': 'sweatshirt.n.01', 'synonyms': ['sweatshirt'], 'id': 1043, 'def': 'cotton knit pullover with long sleeves worn during athletic activity', 'name': 'sweatshirt'}, {'frequency': 'c', 'synset': 'sweet_potato.n.02', 'synonyms': ['sweet_potato'], 'id': 1044, 'def': 'the edible tuberous root of the sweet potato vine', 'name': 'sweet_potato'}, {'frequency': 'f', 'synset': 'swimsuit.n.01', 'synonyms': ['swimsuit', 'swimwear', 'bathing_suit', 'swimming_costume', 'bathing_costume', 'swimming_trunks', 'bathing_trunks'], 'id': 1045, 'def': 'garment worn for swimming', 'name': 'swimsuit'}, {'frequency': 'c', 'synset': 'sword.n.01', 'synonyms': ['sword'], 'id': 1046, 'def': 'a cutting or thrusting weapon that has a long metal blade', 'name': 'sword'}, {'frequency': 'r', 'synset': 'syringe.n.01', 'synonyms': ['syringe'], 'id': 1047, 'def': 'a medical instrument used to inject or withdraw fluids', 'name': 'syringe'}, {'frequency': 'r', 'synset': 'tabasco.n.02', 'synonyms': ['Tabasco_sauce'], 'id': 1048, 'def': 'very spicy sauce (trade name Tabasco) made from fully-aged red peppers', 'name': 'Tabasco_sauce'}, {'frequency': 'r', 'synset': 'table-tennis_table.n.01', 'synonyms': ['table-tennis_table', 'ping-pong_table'], 'id': 1049, 'def': 'a table used for playing table tennis', 'name': 'table-tennis_table'}, {'frequency': 'f', 'synset': 'table.n.02', 'synonyms': ['table'], 'id': 1050, 'def': 'a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs', 'name': 'table'}, {'frequency': 'c', 'synset': 'table_lamp.n.01', 'synonyms': ['table_lamp'], 'id': 1051, 'def': 'a lamp that sits on a table', 'name': 'table_lamp'}, {'frequency': 'f', 'synset': 'tablecloth.n.01', 'synonyms': ['tablecloth'], 'id': 1052, 'def': 'a covering spread over a dining table', 'name': 'tablecloth'}, {'frequency': 'r', 'synset': 'tachometer.n.01', 'synonyms': ['tachometer'], 'id': 1053, 'def': 'measuring instrument for indicating speed of rotation', 'name': 'tachometer'}, {'frequency': 'r', 'synset': 'taco.n.02', 'synonyms': ['taco'], 'id': 1054, 'def': 'a small tortilla cupped around a filling', 'name': 'taco'}, {'frequency': 'f', 'synset': 'tag.n.02', 'synonyms': ['tag'], 'id': 1055, 'def': 'a label associated with something for the purpose of identification or information', 'name': 'tag'}, {'frequency': 'f', 'synset': 'taillight.n.01', 'synonyms': ['taillight', 'rear_light'], 'id': 1056, 'def': 'lamp (usually red) mounted at the rear of a motor vehicle', 'name': 'taillight'}, {'frequency': 'r', 'synset': 'tambourine.n.01', 'synonyms': ['tambourine'], 'id': 1057, 'def': 'a shallow drum with a single drumhead and with metallic disks in the sides', 'name': 'tambourine'}, {'frequency': 'r', 'synset': 'tank.n.01', 'synonyms': ['army_tank', 'armored_combat_vehicle', 'armoured_combat_vehicle'], 'id': 1058, 'def': 'an enclosed armored military vehicle; has a cannon and moves on caterpillar treads', 'name': 'army_tank'}, {'frequency': 'f', 'synset': 'tank.n.02', 'synonyms': ['tank_(storage_vessel)', 'storage_tank'], 'id': 1059, 'def': 'a large (usually metallic) vessel for holding gases or liquids', 'name': 'tank_(storage_vessel)'}, {'frequency': 'f', 'synset': 'tank_top.n.01', 'synonyms': ['tank_top_(clothing)'], 'id': 1060, 'def': 'a tight-fitting sleeveless shirt with wide shoulder straps and low neck and no front opening', 'name': 'tank_top_(clothing)'}, {'frequency': 'f', 'synset': 'tape.n.01', 'synonyms': ['tape_(sticky_cloth_or_paper)'], 'id': 1061, 'def': 'a long thin piece of cloth or paper as used for binding or fastening', 'name': 'tape_(sticky_cloth_or_paper)'}, {'frequency': 'c', 'synset': 'tape.n.04', 'synonyms': ['tape_measure', 'measuring_tape'], 'id': 1062, 'def': 'measuring instrument consisting of a narrow strip (cloth or metal) marked in inches or centimeters and used for measuring lengths', 'name': 'tape_measure'}, {'frequency': 'c', 'synset': 'tapestry.n.02', 'synonyms': ['tapestry'], 'id': 1063, 'def': 'a heavy textile with a woven design; used for curtains and upholstery', 'name': 'tapestry'}, {'frequency': 'f', 'synset': 'tarpaulin.n.01', 'synonyms': ['tarp'], 'id': 1064, 'def': 'waterproofed canvas', 'name': 'tarp'}, {'frequency': 'c', 'synset': 'tartan.n.01', 'synonyms': ['tartan', 'plaid'], 'id': 1065, 'def': 'a cloth having a crisscross design', 'name': 'tartan'}, {'frequency': 'c', 'synset': 'tassel.n.01', 'synonyms': ['tassel'], 'id': 1066, 'def': 'adornment consisting of a bunch of cords fastened at one end', 'name': 'tassel'}, {'frequency': 'c', 'synset': 'tea_bag.n.01', 'synonyms': ['tea_bag'], 'id': 1067, 'def': 'a measured amount of tea in a bag for an individual serving of tea', 'name': 'tea_bag'}, {'frequency': 'c', 'synset': 'teacup.n.02', 'synonyms': ['teacup'], 'id': 1068, 'def': 'a cup from which tea is drunk', 'name': 'teacup'}, {'frequency': 'c', 'synset': 'teakettle.n.01', 'synonyms': ['teakettle'], 'id': 1069, 'def': 'kettle for boiling water to make tea', 'name': 'teakettle'}, {'frequency': 'f', 'synset': 'teapot.n.01', 'synonyms': ['teapot'], 'id': 1070, 'def': 'pot for brewing tea; usually has a spout and handle', 'name': 'teapot'}, {'frequency': 'f', 'synset': 'teddy.n.01', 'synonyms': ['teddy_bear'], 'id': 1071, 'def': "plaything consisting of a child's toy bear (usually plush and stuffed with soft materials)", 'name': 'teddy_bear'}, {'frequency': 'f', 'synset': 'telephone.n.01', 'synonyms': ['telephone', 'phone', 'telephone_set'], 'id': 1072, 'def': 'electronic device for communicating by voice over long distances (includes wired and wireless/cell phones)', 'name': 'telephone'}, {'frequency': 'c', 'synset': 'telephone_booth.n.01', 'synonyms': ['telephone_booth', 'phone_booth', 'call_box', 'telephone_box', 'telephone_kiosk'], 'id': 1073, 'def': 'booth for using a telephone', 'name': 'telephone_booth'}, {'frequency': 'f', 'synset': 'telephone_pole.n.01', 'synonyms': ['telephone_pole', 'telegraph_pole', 'telegraph_post'], 'id': 1074, 'def': 'tall pole supporting telephone wires', 'name': 'telephone_pole'}, {'frequency': 'r', 'synset': 'telephoto_lens.n.01', 'synonyms': ['telephoto_lens', 'zoom_lens'], 'id': 1075, 'def': 'a camera lens that magnifies the image', 'name': 'telephoto_lens'}, {'frequency': 'c', 'synset': 'television_camera.n.01', 'synonyms': ['television_camera', 'tv_camera'], 'id': 1076, 'def': 'television equipment for capturing and recording video', 'name': 'television_camera'}, {'frequency': 'f', 'synset': 'television_receiver.n.01', 'synonyms': ['television_set', 'tv', 'tv_set'], 'id': 1077, 'def': 'an electronic device that receives television signals and displays them on a screen', 'name': 'television_set'}, {'frequency': 'f', 'synset': 'tennis_ball.n.01', 'synonyms': ['tennis_ball'], 'id': 1078, 'def': 'ball about the size of a fist used in playing tennis', 'name': 'tennis_ball'}, {'frequency': 'f', 'synset': 'tennis_racket.n.01', 'synonyms': ['tennis_racket'], 'id': 1079, 'def': 'a racket used to play tennis', 'name': 'tennis_racket'}, {'frequency': 'r', 'synset': 'tequila.n.01', 'synonyms': ['tequila'], 'id': 1080, 'def': 'Mexican liquor made from fermented juices of an agave plant', 'name': 'tequila'}, {'frequency': 'c', 'synset': 'thermometer.n.01', 'synonyms': ['thermometer'], 'id': 1081, 'def': 'measuring instrument for measuring temperature', 'name': 'thermometer'}, {'frequency': 'c', 'synset': 'thermos.n.01', 'synonyms': ['thermos_bottle'], 'id': 1082, 'def': 'vacuum flask that preserves temperature of hot or cold drinks', 'name': 'thermos_bottle'}, {'frequency': 'f', 'synset': 'thermostat.n.01', 'synonyms': ['thermostat'], 'id': 1083, 'def': 'a regulator for automatically regulating temperature by starting or stopping the supply of heat', 'name': 'thermostat'}, {'frequency': 'r', 'synset': 'thimble.n.02', 'synonyms': ['thimble'], 'id': 1084, 'def': 'a small metal cap to protect the finger while sewing; can be used as a small container', 'name': 'thimble'}, {'frequency': 'c', 'synset': 'thread.n.01', 'synonyms': ['thread', 'yarn'], 'id': 1085, 'def': 'a fine cord of twisted fibers (of cotton or silk or wool or nylon etc.) used in sewing and weaving', 'name': 'thread'}, {'frequency': 'c', 'synset': 'thumbtack.n.01', 'synonyms': ['thumbtack', 'drawing_pin', 'pushpin'], 'id': 1086, 'def': 'a tack for attaching papers to a bulletin board or drawing board', 'name': 'thumbtack'}, {'frequency': 'c', 'synset': 'tiara.n.01', 'synonyms': ['tiara'], 'id': 1087, 'def': 'a jeweled headdress worn by women on formal occasions', 'name': 'tiara'}, {'frequency': 'c', 'synset': 'tiger.n.02', 'synonyms': ['tiger'], 'id': 1088, 'def': 'large feline of forests in most of Asia having a tawny coat with black stripes', 'name': 'tiger'}, {'frequency': 'c', 'synset': 'tights.n.01', 'synonyms': ['tights_(clothing)', 'leotards'], 'id': 1089, 'def': 'skintight knit hose covering the body from the waist to the feet worn by acrobats and dancers and as stockings by women and girls', 'name': 'tights_(clothing)'}, {'frequency': 'c', 'synset': 'timer.n.01', 'synonyms': ['timer', 'stopwatch'], 'id': 1090, 'def': 'a timepiece that measures a time interval and signals its end', 'name': 'timer'}, {'frequency': 'f', 'synset': 'tinfoil.n.01', 'synonyms': ['tinfoil'], 'id': 1091, 'def': 'foil made of tin or an alloy of tin and lead', 'name': 'tinfoil'}, {'frequency': 'c', 'synset': 'tinsel.n.01', 'synonyms': ['tinsel'], 'id': 1092, 'def': 'a showy decoration that is basically valueless', 'name': 'tinsel'}, {'frequency': 'f', 'synset': 'tissue.n.02', 'synonyms': ['tissue_paper'], 'id': 1093, 'def': 'a soft thin (usually translucent) paper', 'name': 'tissue_paper'}, {'frequency': 'c', 'synset': 'toast.n.01', 'synonyms': ['toast_(food)'], 'id': 1094, 'def': 'slice of bread that has been toasted', 'name': 'toast_(food)'}, {'frequency': 'f', 'synset': 'toaster.n.02', 'synonyms': ['toaster'], 'id': 1095, 'def': 'a kitchen appliance (usually electric) for toasting bread', 'name': 'toaster'}, {'frequency': 'f', 'synset': 'toaster_oven.n.01', 'synonyms': ['toaster_oven'], 'id': 1096, 'def': 'kitchen appliance consisting of a small electric oven for toasting or warming food', 'name': 'toaster_oven'}, {'frequency': 'f', 'synset': 'toilet.n.02', 'synonyms': ['toilet'], 'id': 1097, 'def': 'a plumbing fixture for defecation and urination', 'name': 'toilet'}, {'frequency': 'f', 'synset': 'toilet_tissue.n.01', 'synonyms': ['toilet_tissue', 'toilet_paper', 'bathroom_tissue'], 'id': 1098, 'def': 'a soft thin absorbent paper for use in toilets', 'name': 'toilet_tissue'}, {'frequency': 'f', 'synset': 'tomato.n.01', 'synonyms': ['tomato'], 'id': 1099, 'def': 'mildly acid red or yellow pulpy fruit eaten as a vegetable', 'name': 'tomato'}, {'frequency': 'f', 'synset': 'tongs.n.01', 'synonyms': ['tongs'], 'id': 1100, 'def': 'any of various devices for taking hold of objects; usually have two hinged legs with handles above and pointed hooks below', 'name': 'tongs'}, {'frequency': 'c', 'synset': 'toolbox.n.01', 'synonyms': ['toolbox'], 'id': 1101, 'def': 'a box or chest or cabinet for holding hand tools', 'name': 'toolbox'}, {'frequency': 'f', 'synset': 'toothbrush.n.01', 'synonyms': ['toothbrush'], 'id': 1102, 'def': 'small brush; has long handle; used to clean teeth', 'name': 'toothbrush'}, {'frequency': 'f', 'synset': 'toothpaste.n.01', 'synonyms': ['toothpaste'], 'id': 1103, 'def': 'a dentifrice in the form of a paste', 'name': 'toothpaste'}, {'frequency': 'f', 'synset': 'toothpick.n.01', 'synonyms': ['toothpick'], 'id': 1104, 'def': 'pick consisting of a small strip of wood or plastic; used to pick food from between the teeth', 'name': 'toothpick'}, {'frequency': 'f', 'synset': 'top.n.09', 'synonyms': ['cover'], 'id': 1105, 'def': 'covering for a hole (especially a hole in the top of a container)', 'name': 'cover'}, {'frequency': 'c', 'synset': 'tortilla.n.01', 'synonyms': ['tortilla'], 'id': 1106, 'def': 'thin unleavened pancake made from cornmeal or wheat flour', 'name': 'tortilla'}, {'frequency': 'c', 'synset': 'tow_truck.n.01', 'synonyms': ['tow_truck'], 'id': 1107, 'def': 'a truck equipped to hoist and pull wrecked cars (or to remove cars from no-parking zones)', 'name': 'tow_truck'}, {'frequency': 'f', 'synset': 'towel.n.01', 'synonyms': ['towel'], 'id': 1108, 'def': 'a rectangular piece of absorbent cloth (or paper) for drying or wiping', 'name': 'towel'}, {'frequency': 'f', 'synset': 'towel_rack.n.01', 'synonyms': ['towel_rack', 'towel_rail', 'towel_bar'], 'id': 1109, 'def': 'a rack consisting of one or more bars on which towels can be hung', 'name': 'towel_rack'}, {'frequency': 'f', 'synset': 'toy.n.03', 'synonyms': ['toy'], 'id': 1110, 'def': 'a device regarded as providing amusement', 'name': 'toy'}, {'frequency': 'c', 'synset': 'tractor.n.01', 'synonyms': ['tractor_(farm_equipment)'], 'id': 1111, 'def': 'a wheeled vehicle with large wheels; used in farming and other applications', 'name': 'tractor_(farm_equipment)'}, {'frequency': 'f', 'synset': 'traffic_light.n.01', 'synonyms': ['traffic_light'], 'id': 1112, 'def': 'a device to control vehicle traffic often consisting of three or more lights', 'name': 'traffic_light'}, {'frequency': 'c', 'synset': 'trail_bike.n.01', 'synonyms': ['dirt_bike'], 'id': 1113, 'def': 'a lightweight motorcycle equipped with rugged tires and suspension for off-road use', 'name': 'dirt_bike'}, {'frequency': 'f', 'synset': 'trailer_truck.n.01', 'synonyms': ['trailer_truck', 'tractor_trailer', 'trucking_rig', 'articulated_lorry', 'semi_truck'], 'id': 1114, 'def': 'a truck consisting of a tractor and trailer together', 'name': 'trailer_truck'}, {'frequency': 'f', 'synset': 'train.n.01', 'synonyms': ['train_(railroad_vehicle)', 'railroad_train'], 'id': 1115, 'def': 'public or private transport provided by a line of railway cars coupled together and drawn by a locomotive', 'name': 'train_(railroad_vehicle)'}, {'frequency': 'r', 'synset': 'trampoline.n.01', 'synonyms': ['trampoline'], 'id': 1116, 'def': 'gymnastic apparatus consisting of a strong canvas sheet attached with springs to a metal frame', 'name': 'trampoline'}, {'frequency': 'f', 'synset': 'tray.n.01', 'synonyms': ['tray'], 'id': 1117, 'def': 'an open receptacle for holding or displaying or serving articles or food', 'name': 'tray'}, {'frequency': 'r', 'synset': 'trench_coat.n.01', 'synonyms': ['trench_coat'], 'id': 1118, 'def': 'a military style raincoat; belted with deep pockets', 'name': 'trench_coat'}, {'frequency': 'r', 'synset': 'triangle.n.05', 'synonyms': ['triangle_(musical_instrument)'], 'id': 1119, 'def': 'a percussion instrument consisting of a metal bar bent in the shape of an open triangle', 'name': 'triangle_(musical_instrument)'}, {'frequency': 'c', 'synset': 'tricycle.n.01', 'synonyms': ['tricycle'], 'id': 1120, 'def': 'a vehicle with three wheels that is moved by foot pedals', 'name': 'tricycle'}, {'frequency': 'f', 'synset': 'tripod.n.01', 'synonyms': ['tripod'], 'id': 1121, 'def': 'a three-legged rack used for support', 'name': 'tripod'}, {'frequency': 'f', 'synset': 'trouser.n.01', 'synonyms': ['trousers', 'pants_(clothing)'], 'id': 1122, 'def': 'a garment extending from the waist to the knee or ankle, covering each leg separately', 'name': 'trousers'}, {'frequency': 'f', 'synset': 'truck.n.01', 'synonyms': ['truck'], 'id': 1123, 'def': 'an automotive vehicle suitable for hauling', 'name': 'truck'}, {'frequency': 'r', 'synset': 'truffle.n.03', 'synonyms': ['truffle_(chocolate)', 'chocolate_truffle'], 'id': 1124, 'def': 'creamy chocolate candy', 'name': 'truffle_(chocolate)'}, {'frequency': 'c', 'synset': 'trunk.n.02', 'synonyms': ['trunk'], 'id': 1125, 'def': 'luggage consisting of a large strong case used when traveling or for storage', 'name': 'trunk'}, {'frequency': 'r', 'synset': 'tub.n.02', 'synonyms': ['vat'], 'id': 1126, 'def': 'a large vessel for holding or storing liquids', 'name': 'vat'}, {'frequency': 'c', 'synset': 'turban.n.01', 'synonyms': ['turban'], 'id': 1127, 'def': 'a traditional headdress consisting of a long scarf wrapped around the head', 'name': 'turban'}, {'frequency': 'c', 'synset': 'turkey.n.04', 'synonyms': ['turkey_(food)'], 'id': 1128, 'def': 'flesh of large domesticated fowl usually roasted', 'name': 'turkey_(food)'}, {'frequency': 'r', 'synset': 'turnip.n.01', 'synonyms': ['turnip'], 'id': 1129, 'def': 'widely cultivated plant having a large fleshy edible white or yellow root', 'name': 'turnip'}, {'frequency': 'c', 'synset': 'turtle.n.02', 'synonyms': ['turtle'], 'id': 1130, 'def': 'any of various aquatic and land reptiles having a bony shell and flipper-like limbs for swimming', 'name': 'turtle'}, {'frequency': 'c', 'synset': 'turtleneck.n.01', 'synonyms': ['turtleneck_(clothing)', 'polo-neck'], 'id': 1131, 'def': 'a sweater or jersey with a high close-fitting collar', 'name': 'turtleneck_(clothing)'}, {'frequency': 'c', 'synset': 'typewriter.n.01', 'synonyms': ['typewriter'], 'id': 1132, 'def': 'hand-operated character printer for printing written messages one character at a time', 'name': 'typewriter'}, {'frequency': 'f', 'synset': 'umbrella.n.01', 'synonyms': ['umbrella'], 'id': 1133, 'def': 'a lightweight handheld collapsible canopy', 'name': 'umbrella'}, {'frequency': 'f', 'synset': 'underwear.n.01', 'synonyms': ['underwear', 'underclothes', 'underclothing', 'underpants'], 'id': 1134, 'def': 'undergarment worn next to the skin and under the outer garments', 'name': 'underwear'}, {'frequency': 'r', 'synset': 'unicycle.n.01', 'synonyms': ['unicycle'], 'id': 1135, 'def': 'a vehicle with a single wheel that is driven by pedals', 'name': 'unicycle'}, {'frequency': 'f', 'synset': 'urinal.n.01', 'synonyms': ['urinal'], 'id': 1136, 'def': 'a plumbing fixture (usually attached to the wall) used by men to urinate', 'name': 'urinal'}, {'frequency': 'c', 'synset': 'urn.n.01', 'synonyms': ['urn'], 'id': 1137, 'def': 'a large vase that usually has a pedestal or feet', 'name': 'urn'}, {'frequency': 'c', 'synset': 'vacuum.n.04', 'synonyms': ['vacuum_cleaner'], 'id': 1138, 'def': 'an electrical home appliance that cleans by suction', 'name': 'vacuum_cleaner'}, {'frequency': 'f', 'synset': 'vase.n.01', 'synonyms': ['vase'], 'id': 1139, 'def': 'an open jar of glass or porcelain used as an ornament or to hold flowers', 'name': 'vase'}, {'frequency': 'c', 'synset': 'vending_machine.n.01', 'synonyms': ['vending_machine'], 'id': 1140, 'def': 'a slot machine for selling goods', 'name': 'vending_machine'}, {'frequency': 'f', 'synset': 'vent.n.01', 'synonyms': ['vent', 'blowhole', 'air_vent'], 'id': 1141, 'def': 'a hole for the escape of gas or air', 'name': 'vent'}, {'frequency': 'f', 'synset': 'vest.n.01', 'synonyms': ['vest', 'waistcoat'], 'id': 1142, 'def': "a man's sleeveless garment worn underneath a coat", 'name': 'vest'}, {'frequency': 'c', 'synset': 'videotape.n.01', 'synonyms': ['videotape'], 'id': 1143, 'def': 'a video recording made on magnetic tape', 'name': 'videotape'}, {'frequency': 'r', 'synset': 'vinegar.n.01', 'synonyms': ['vinegar'], 'id': 1144, 'def': 'sour-tasting liquid produced usually by oxidation of the alcohol in wine or cider and used as a condiment or food preservative', 'name': 'vinegar'}, {'frequency': 'r', 'synset': 'violin.n.01', 'synonyms': ['violin', 'fiddle'], 'id': 1145, 'def': 'bowed stringed instrument that is the highest member of the violin family', 'name': 'violin'}, {'frequency': 'r', 'synset': 'vodka.n.01', 'synonyms': ['vodka'], 'id': 1146, 'def': 'unaged colorless liquor originating in Russia', 'name': 'vodka'}, {'frequency': 'c', 'synset': 'volleyball.n.02', 'synonyms': ['volleyball'], 'id': 1147, 'def': 'an inflated ball used in playing volleyball', 'name': 'volleyball'}, {'frequency': 'r', 'synset': 'vulture.n.01', 'synonyms': ['vulture'], 'id': 1148, 'def': 'any of various large birds of prey having naked heads and weak claws and feeding chiefly on carrion', 'name': 'vulture'}, {'frequency': 'c', 'synset': 'waffle.n.01', 'synonyms': ['waffle'], 'id': 1149, 'def': 'pancake batter baked in a waffle iron', 'name': 'waffle'}, {'frequency': 'r', 'synset': 'waffle_iron.n.01', 'synonyms': ['waffle_iron'], 'id': 1150, 'def': 'a kitchen appliance for baking waffles', 'name': 'waffle_iron'}, {'frequency': 'c', 'synset': 'wagon.n.01', 'synonyms': ['wagon'], 'id': 1151, 'def': 'any of various kinds of wheeled vehicles drawn by an animal or a tractor', 'name': 'wagon'}, {'frequency': 'c', 'synset': 'wagon_wheel.n.01', 'synonyms': ['wagon_wheel'], 'id': 1152, 'def': 'a wheel of a wagon', 'name': 'wagon_wheel'}, {'frequency': 'c', 'synset': 'walking_stick.n.01', 'synonyms': ['walking_stick'], 'id': 1153, 'def': 'a stick carried in the hand for support in walking', 'name': 'walking_stick'}, {'frequency': 'c', 'synset': 'wall_clock.n.01', 'synonyms': ['wall_clock'], 'id': 1154, 'def': 'a clock mounted on a wall', 'name': 'wall_clock'}, {'frequency': 'f', 'synset': 'wall_socket.n.01', 'synonyms': ['wall_socket', 'wall_plug', 'electric_outlet', 'electrical_outlet', 'outlet', 'electric_receptacle'], 'id': 1155, 'def': 'receptacle providing a place in a wiring system where current can be taken to run electrical devices', 'name': 'wall_socket'}, {'frequency': 'f', 'synset': 'wallet.n.01', 'synonyms': ['wallet', 'billfold'], 'id': 1156, 'def': 'a pocket-size case for holding papers and paper money', 'name': 'wallet'}, {'frequency': 'r', 'synset': 'walrus.n.01', 'synonyms': ['walrus'], 'id': 1157, 'def': 'either of two large northern marine mammals having ivory tusks and tough hide over thick blubber', 'name': 'walrus'}, {'frequency': 'r', 'synset': 'wardrobe.n.01', 'synonyms': ['wardrobe'], 'id': 1158, 'def': 'a tall piece of furniture that provides storage space for clothes; has a door and rails or hooks for hanging clothes', 'name': 'wardrobe'}, {'frequency': 'r', 'synset': 'washbasin.n.01', 'synonyms': ['washbasin', 'basin_(for_washing)', 'washbowl', 'washstand', 'handbasin'], 'id': 1159, 'def': 'a bathroom sink that is permanently installed and connected to a water supply and drainpipe; where you can wash your hands and face', 'name': 'washbasin'}, {'frequency': 'c', 'synset': 'washer.n.03', 'synonyms': ['automatic_washer', 'washing_machine'], 'id': 1160, 'def': 'a home appliance for washing clothes and linens automatically', 'name': 'automatic_washer'}, {'frequency': 'f', 'synset': 'watch.n.01', 'synonyms': ['watch', 'wristwatch'], 'id': 1161, 'def': 'a small, portable timepiece', 'name': 'watch'}, {'frequency': 'f', 'synset': 'water_bottle.n.01', 'synonyms': ['water_bottle'], 'id': 1162, 'def': 'a bottle for holding water', 'name': 'water_bottle'}, {'frequency': 'c', 'synset': 'water_cooler.n.01', 'synonyms': ['water_cooler'], 'id': 1163, 'def': 'a device for cooling and dispensing drinking water', 'name': 'water_cooler'}, {'frequency': 'c', 'synset': 'water_faucet.n.01', 'synonyms': ['water_faucet', 'water_tap', 'tap_(water_faucet)'], 'id': 1164, 'def': 'a faucet for drawing water from a pipe or cask', 'name': 'water_faucet'}, {'frequency': 'r', 'synset': 'water_heater.n.01', 'synonyms': ['water_heater', 'hot-water_heater'], 'id': 1165, 'def': 'a heater and storage tank to supply heated water', 'name': 'water_heater'}, {'frequency': 'c', 'synset': 'water_jug.n.01', 'synonyms': ['water_jug'], 'id': 1166, 'def': 'a jug that holds water', 'name': 'water_jug'}, {'frequency': 'r', 'synset': 'water_pistol.n.01', 'synonyms': ['water_gun', 'squirt_gun'], 'id': 1167, 'def': 'plaything consisting of a toy pistol that squirts water', 'name': 'water_gun'}, {'frequency': 'c', 'synset': 'water_scooter.n.01', 'synonyms': ['water_scooter', 'sea_scooter', 'jet_ski'], 'id': 1168, 'def': 'a motorboat resembling a motor scooter (NOT A SURFBOARD OR WATER SKI)', 'name': 'water_scooter'}, {'frequency': 'c', 'synset': 'water_ski.n.01', 'synonyms': ['water_ski'], 'id': 1169, 'def': 'broad ski for skimming over water towed by a speedboat (DO NOT MARK WATER)', 'name': 'water_ski'}, {'frequency': 'c', 'synset': 'water_tower.n.01', 'synonyms': ['water_tower'], 'id': 1170, 'def': 'a large reservoir for water', 'name': 'water_tower'}, {'frequency': 'c', 'synset': 'watering_can.n.01', 'synonyms': ['watering_can'], 'id': 1171, 'def': 'a container with a handle and a spout with a perforated nozzle; used to sprinkle water over plants', 'name': 'watering_can'}, {'frequency': 'f', 'synset': 'watermelon.n.02', 'synonyms': ['watermelon'], 'id': 1172, 'def': 'large oblong or roundish melon with a hard green rind and sweet watery red or occasionally yellowish pulp', 'name': 'watermelon'}, {'frequency': 'f', 'synset': 'weathervane.n.01', 'synonyms': ['weathervane', 'vane_(weathervane)', 'wind_vane'], 'id': 1173, 'def': 'mechanical device attached to an elevated structure; rotates freely to show the direction of the wind', 'name': 'weathervane'}, {'frequency': 'c', 'synset': 'webcam.n.01', 'synonyms': ['webcam'], 'id': 1174, 'def': 'a digital camera designed to take digital photographs and transmit them over the internet', 'name': 'webcam'}, {'frequency': 'c', 'synset': 'wedding_cake.n.01', 'synonyms': ['wedding_cake', 'bridecake'], 'id': 1175, 'def': 'a rich cake with two or more tiers and covered with frosting and decorations; served at a wedding reception', 'name': 'wedding_cake'}, {'frequency': 'c', 'synset': 'wedding_ring.n.01', 'synonyms': ['wedding_ring', 'wedding_band'], 'id': 1176, 'def': 'a ring given to the bride and/or groom at the wedding', 'name': 'wedding_ring'}, {'frequency': 'f', 'synset': 'wet_suit.n.01', 'synonyms': ['wet_suit'], 'id': 1177, 'def': 'a close-fitting garment made of a permeable material; worn in cold water to retain body heat', 'name': 'wet_suit'}, {'frequency': 'f', 'synset': 'wheel.n.01', 'synonyms': ['wheel'], 'id': 1178, 'def': 'a circular frame with spokes (or a solid disc) that can rotate on a shaft or axle', 'name': 'wheel'}, {'frequency': 'c', 'synset': 'wheelchair.n.01', 'synonyms': ['wheelchair'], 'id': 1179, 'def': 'a movable chair mounted on large wheels', 'name': 'wheelchair'}, {'frequency': 'c', 'synset': 'whipped_cream.n.01', 'synonyms': ['whipped_cream'], 'id': 1180, 'def': 'cream that has been beaten until light and fluffy', 'name': 'whipped_cream'}, {'frequency': 'c', 'synset': 'whistle.n.03', 'synonyms': ['whistle'], 'id': 1181, 'def': 'a small wind instrument that produces a whistling sound by blowing into it', 'name': 'whistle'}, {'frequency': 'c', 'synset': 'wig.n.01', 'synonyms': ['wig'], 'id': 1182, 'def': 'hairpiece covering the head and made of real or synthetic hair', 'name': 'wig'}, {'frequency': 'c', 'synset': 'wind_chime.n.01', 'synonyms': ['wind_chime'], 'id': 1183, 'def': 'a decorative arrangement of pieces of metal or glass or pottery that hang together loosely so the wind can cause them to tinkle', 'name': 'wind_chime'}, {'frequency': 'c', 'synset': 'windmill.n.01', 'synonyms': ['windmill'], 'id': 1184, 'def': 'A mill or turbine that is powered by wind', 'name': 'windmill'}, {'frequency': 'c', 'synset': 'window_box.n.01', 'synonyms': ['window_box_(for_plants)'], 'id': 1185, 'def': 'a container for growing plants on a windowsill', 'name': 'window_box_(for_plants)'}, {'frequency': 'f', 'synset': 'windshield_wiper.n.01', 'synonyms': ['windshield_wiper', 'windscreen_wiper', 'wiper_(for_windshield/screen)'], 'id': 1186, 'def': 'a mechanical device that cleans the windshield', 'name': 'windshield_wiper'}, {'frequency': 'c', 'synset': 'windsock.n.01', 'synonyms': ['windsock', 'air_sock', 'air-sleeve', 'wind_sleeve', 'wind_cone'], 'id': 1187, 'def': 'a truncated cloth cone mounted on a mast/pole; shows wind direction', 'name': 'windsock'}, {'frequency': 'f', 'synset': 'wine_bottle.n.01', 'synonyms': ['wine_bottle'], 'id': 1188, 'def': 'a bottle for holding wine', 'name': 'wine_bottle'}, {'frequency': 'c', 'synset': 'wine_bucket.n.01', 'synonyms': ['wine_bucket', 'wine_cooler'], 'id': 1189, 'def': 'a bucket of ice used to chill a bottle of wine', 'name': 'wine_bucket'}, {'frequency': 'f', 'synset': 'wineglass.n.01', 'synonyms': ['wineglass'], 'id': 1190, 'def': 'a glass that has a stem and in which wine is served', 'name': 'wineglass'}, {'frequency': 'f', 'synset': 'winker.n.02', 'synonyms': ['blinder_(for_horses)'], 'id': 1191, 'def': 'blinds that prevent a horse from seeing something on either side', 'name': 'blinder_(for_horses)'}, {'frequency': 'c', 'synset': 'wok.n.01', 'synonyms': ['wok'], 'id': 1192, 'def': 'pan with a convex bottom; used for frying in Chinese cooking', 'name': 'wok'}, {'frequency': 'r', 'synset': 'wolf.n.01', 'synonyms': ['wolf'], 'id': 1193, 'def': 'a wild carnivorous mammal of the dog family, living and hunting in packs', 'name': 'wolf'}, {'frequency': 'c', 'synset': 'wooden_spoon.n.02', 'synonyms': ['wooden_spoon'], 'id': 1194, 'def': 'a spoon made of wood', 'name': 'wooden_spoon'}, {'frequency': 'c', 'synset': 'wreath.n.01', 'synonyms': ['wreath'], 'id': 1195, 'def': 'an arrangement of flowers, leaves, or stems fastened in a ring', 'name': 'wreath'}, {'frequency': 'c', 'synset': 'wrench.n.03', 'synonyms': ['wrench', 'spanner'], 'id': 1196, 'def': 'a hand tool that is used to hold or twist a nut or bolt', 'name': 'wrench'}, {'frequency': 'f', 'synset': 'wristband.n.01', 'synonyms': ['wristband'], 'id': 1197, 'def': 'band consisting of a part of a sleeve that covers the wrist', 'name': 'wristband'}, {'frequency': 'f', 'synset': 'wristlet.n.01', 'synonyms': ['wristlet', 'wrist_band'], 'id': 1198, 'def': 'a band or bracelet worn around the wrist', 'name': 'wristlet'}, {'frequency': 'c', 'synset': 'yacht.n.01', 'synonyms': ['yacht'], 'id': 1199, 'def': 'an expensive vessel propelled by sail or power and used for cruising or racing', 'name': 'yacht'}, {'frequency': 'c', 'synset': 'yogurt.n.01', 'synonyms': ['yogurt', 'yoghurt', 'yoghourt'], 'id': 1200, 'def': 'a custard-like food made from curdled milk', 'name': 'yogurt'}, {'frequency': 'c', 'synset': 'yoke.n.07', 'synonyms': ['yoke_(animal_equipment)'], 'id': 1201, 'def': 'gear joining two animals at the neck; NOT egg yolk', 'name': 'yoke_(animal_equipment)'}, {'frequency': 'f', 'synset': 'zebra.n.01', 'synonyms': ['zebra'], 'id': 1202, 'def': 'any of several fleet black-and-white striped African equines', 'name': 'zebra'}, {'frequency': 'c', 'synset': 'zucchini.n.02', 'synonyms': ['zucchini', 'courgette'], 'id': 1203, 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}]  # noqa
+# fmt: on
diff --git a/detectron2/data/datasets/lvis_v1_category_image_count.py b/detectron2/data/datasets/lvis_v1_category_image_count.py
new file mode 100644
index 0000000000000000000000000000000000000000..31bf0cfcd5096ab87835db86a28671d474514c40
--- /dev/null
+++ b/detectron2/data/datasets/lvis_v1_category_image_count.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Autogen with
+# with open("lvis_v1_train.json", "r") as f:
+#     a = json.load(f)
+# c = a["categories"]
+# for x in c:
+# del x["name"]
+# del x["instance_count"]
+# del x["def"]
+# del x["synonyms"]
+# del x["frequency"]
+# del x["synset"]
+# LVIS_CATEGORY_IMAGE_COUNT = repr(c) + "  # noqa"
+# with open("/tmp/lvis_category_image_count.py", "wt") as f:
+#     f.write(f"LVIS_CATEGORY_IMAGE_COUNT = {LVIS_CATEGORY_IMAGE_COUNT}")
+# Then paste the contents of that file below
+
+# fmt: off
+LVIS_CATEGORY_IMAGE_COUNT = [{'id': 1, 'image_count': 64}, {'id': 2, 'image_count': 364}, {'id': 3, 'image_count': 1911}, {'id': 4, 'image_count': 149}, {'id': 5, 'image_count': 29}, {'id': 6, 'image_count': 26}, {'id': 7, 'image_count': 59}, {'id': 8, 'image_count': 22}, {'id': 9, 'image_count': 12}, {'id': 10, 'image_count': 28}, {'id': 11, 'image_count': 505}, {'id': 12, 'image_count': 1207}, {'id': 13, 'image_count': 4}, {'id': 14, 'image_count': 10}, {'id': 15, 'image_count': 500}, {'id': 16, 'image_count': 33}, {'id': 17, 'image_count': 3}, {'id': 18, 'image_count': 44}, {'id': 19, 'image_count': 561}, {'id': 20, 'image_count': 8}, {'id': 21, 'image_count': 9}, {'id': 22, 'image_count': 33}, {'id': 23, 'image_count': 1883}, {'id': 24, 'image_count': 98}, {'id': 25, 'image_count': 70}, {'id': 26, 'image_count': 46}, {'id': 27, 'image_count': 117}, {'id': 28, 'image_count': 41}, {'id': 29, 'image_count': 1395}, {'id': 30, 'image_count': 7}, {'id': 31, 'image_count': 1}, {'id': 32, 'image_count': 314}, {'id': 33, 'image_count': 31}, {'id': 34, 'image_count': 1905}, {'id': 35, 'image_count': 1859}, {'id': 36, 'image_count': 1623}, {'id': 37, 'image_count': 47}, {'id': 38, 'image_count': 3}, {'id': 39, 'image_count': 3}, {'id': 40, 'image_count': 1}, {'id': 41, 'image_count': 305}, {'id': 42, 'image_count': 6}, {'id': 43, 'image_count': 210}, {'id': 44, 'image_count': 36}, {'id': 45, 'image_count': 1787}, {'id': 46, 'image_count': 17}, {'id': 47, 'image_count': 51}, {'id': 48, 'image_count': 138}, {'id': 49, 'image_count': 3}, {'id': 50, 'image_count': 1470}, {'id': 51, 'image_count': 3}, {'id': 52, 'image_count': 2}, {'id': 53, 'image_count': 186}, {'id': 54, 'image_count': 76}, {'id': 55, 'image_count': 26}, {'id': 56, 'image_count': 303}, {'id': 57, 'image_count': 738}, {'id': 58, 'image_count': 1799}, {'id': 59, 'image_count': 1934}, {'id': 60, 'image_count': 1609}, {'id': 61, 'image_count': 1622}, {'id': 62, 'image_count': 41}, {'id': 63, 'image_count': 4}, {'id': 64, 'image_count': 11}, {'id': 65, 'image_count': 270}, {'id': 66, 'image_count': 349}, {'id': 67, 'image_count': 42}, {'id': 68, 'image_count': 823}, {'id': 69, 'image_count': 6}, {'id': 70, 'image_count': 48}, {'id': 71, 'image_count': 3}, {'id': 72, 'image_count': 42}, {'id': 73, 'image_count': 24}, {'id': 74, 'image_count': 16}, {'id': 75, 'image_count': 605}, {'id': 76, 'image_count': 646}, {'id': 77, 'image_count': 1765}, {'id': 78, 'image_count': 2}, {'id': 79, 'image_count': 125}, {'id': 80, 'image_count': 1420}, {'id': 81, 'image_count': 140}, {'id': 82, 'image_count': 4}, {'id': 83, 'image_count': 322}, {'id': 84, 'image_count': 60}, {'id': 85, 'image_count': 2}, {'id': 86, 'image_count': 231}, {'id': 87, 'image_count': 333}, {'id': 88, 'image_count': 1941}, {'id': 89, 'image_count': 367}, {'id': 90, 'image_count': 1922}, {'id': 91, 'image_count': 18}, {'id': 92, 'image_count': 81}, {'id': 93, 'image_count': 1}, {'id': 94, 'image_count': 1852}, {'id': 95, 'image_count': 430}, {'id': 96, 'image_count': 247}, {'id': 97, 'image_count': 94}, {'id': 98, 'image_count': 21}, {'id': 99, 'image_count': 1821}, {'id': 100, 'image_count': 16}, {'id': 101, 'image_count': 12}, {'id': 102, 'image_count': 25}, {'id': 103, 'image_count': 41}, {'id': 104, 'image_count': 244}, {'id': 105, 'image_count': 7}, {'id': 106, 'image_count': 1}, {'id': 107, 'image_count': 40}, {'id': 108, 'image_count': 40}, {'id': 109, 'image_count': 104}, {'id': 110, 'image_count': 1671}, {'id': 111, 'image_count': 49}, {'id': 112, 'image_count': 243}, {'id': 113, 'image_count': 2}, {'id': 114, 'image_count': 242}, {'id': 115, 'image_count': 271}, {'id': 116, 'image_count': 104}, {'id': 117, 'image_count': 8}, {'id': 118, 'image_count': 1758}, {'id': 119, 'image_count': 1}, {'id': 120, 'image_count': 48}, {'id': 121, 'image_count': 14}, {'id': 122, 'image_count': 40}, {'id': 123, 'image_count': 1}, {'id': 124, 'image_count': 37}, {'id': 125, 'image_count': 1510}, {'id': 126, 'image_count': 6}, {'id': 127, 'image_count': 1903}, {'id': 128, 'image_count': 70}, {'id': 129, 'image_count': 86}, {'id': 130, 'image_count': 7}, {'id': 131, 'image_count': 5}, {'id': 132, 'image_count': 1406}, {'id': 133, 'image_count': 1901}, {'id': 134, 'image_count': 15}, {'id': 135, 'image_count': 28}, {'id': 136, 'image_count': 6}, {'id': 137, 'image_count': 494}, {'id': 138, 'image_count': 234}, {'id': 139, 'image_count': 1922}, {'id': 140, 'image_count': 1}, {'id': 141, 'image_count': 35}, {'id': 142, 'image_count': 5}, {'id': 143, 'image_count': 1828}, {'id': 144, 'image_count': 8}, {'id': 145, 'image_count': 63}, {'id': 146, 'image_count': 1668}, {'id': 147, 'image_count': 4}, {'id': 148, 'image_count': 95}, {'id': 149, 'image_count': 17}, {'id': 150, 'image_count': 1567}, {'id': 151, 'image_count': 2}, {'id': 152, 'image_count': 103}, {'id': 153, 'image_count': 50}, {'id': 154, 'image_count': 1309}, {'id': 155, 'image_count': 6}, {'id': 156, 'image_count': 92}, {'id': 157, 'image_count': 19}, {'id': 158, 'image_count': 37}, {'id': 159, 'image_count': 4}, {'id': 160, 'image_count': 709}, {'id': 161, 'image_count': 9}, {'id': 162, 'image_count': 82}, {'id': 163, 'image_count': 15}, {'id': 164, 'image_count': 3}, {'id': 165, 'image_count': 61}, {'id': 166, 'image_count': 51}, {'id': 167, 'image_count': 5}, {'id': 168, 'image_count': 13}, {'id': 169, 'image_count': 642}, {'id': 170, 'image_count': 24}, {'id': 171, 'image_count': 255}, {'id': 172, 'image_count': 9}, {'id': 173, 'image_count': 1808}, {'id': 174, 'image_count': 31}, {'id': 175, 'image_count': 158}, {'id': 176, 'image_count': 80}, {'id': 177, 'image_count': 1884}, {'id': 178, 'image_count': 158}, {'id': 179, 'image_count': 2}, {'id': 180, 'image_count': 12}, {'id': 181, 'image_count': 1659}, {'id': 182, 'image_count': 7}, {'id': 183, 'image_count': 834}, {'id': 184, 'image_count': 57}, {'id': 185, 'image_count': 174}, {'id': 186, 'image_count': 95}, {'id': 187, 'image_count': 27}, {'id': 188, 'image_count': 22}, {'id': 189, 'image_count': 1391}, {'id': 190, 'image_count': 90}, {'id': 191, 'image_count': 40}, {'id': 192, 'image_count': 445}, {'id': 193, 'image_count': 21}, {'id': 194, 'image_count': 1132}, {'id': 195, 'image_count': 177}, {'id': 196, 'image_count': 4}, {'id': 197, 'image_count': 17}, {'id': 198, 'image_count': 84}, {'id': 199, 'image_count': 55}, {'id': 200, 'image_count': 30}, {'id': 201, 'image_count': 25}, {'id': 202, 'image_count': 2}, {'id': 203, 'image_count': 125}, {'id': 204, 'image_count': 1135}, {'id': 205, 'image_count': 19}, {'id': 206, 'image_count': 72}, {'id': 207, 'image_count': 1926}, {'id': 208, 'image_count': 159}, {'id': 209, 'image_count': 7}, {'id': 210, 'image_count': 1}, {'id': 211, 'image_count': 13}, {'id': 212, 'image_count': 35}, {'id': 213, 'image_count': 18}, {'id': 214, 'image_count': 8}, {'id': 215, 'image_count': 6}, {'id': 216, 'image_count': 35}, {'id': 217, 'image_count': 1222}, {'id': 218, 'image_count': 103}, {'id': 219, 'image_count': 28}, {'id': 220, 'image_count': 63}, {'id': 221, 'image_count': 28}, {'id': 222, 'image_count': 5}, {'id': 223, 'image_count': 7}, {'id': 224, 'image_count': 14}, {'id': 225, 'image_count': 1918}, {'id': 226, 'image_count': 133}, {'id': 227, 'image_count': 16}, {'id': 228, 'image_count': 27}, {'id': 229, 'image_count': 110}, {'id': 230, 'image_count': 1895}, {'id': 231, 'image_count': 4}, {'id': 232, 'image_count': 1927}, {'id': 233, 'image_count': 8}, {'id': 234, 'image_count': 1}, {'id': 235, 'image_count': 263}, {'id': 236, 'image_count': 10}, {'id': 237, 'image_count': 2}, {'id': 238, 'image_count': 3}, {'id': 239, 'image_count': 87}, {'id': 240, 'image_count': 9}, {'id': 241, 'image_count': 71}, {'id': 242, 'image_count': 13}, {'id': 243, 'image_count': 18}, {'id': 244, 'image_count': 2}, {'id': 245, 'image_count': 5}, {'id': 246, 'image_count': 45}, {'id': 247, 'image_count': 1}, {'id': 248, 'image_count': 23}, {'id': 249, 'image_count': 32}, {'id': 250, 'image_count': 4}, {'id': 251, 'image_count': 1}, {'id': 252, 'image_count': 858}, {'id': 253, 'image_count': 661}, {'id': 254, 'image_count': 168}, {'id': 255, 'image_count': 210}, {'id': 256, 'image_count': 65}, {'id': 257, 'image_count': 4}, {'id': 258, 'image_count': 2}, {'id': 259, 'image_count': 159}, {'id': 260, 'image_count': 31}, {'id': 261, 'image_count': 811}, {'id': 262, 'image_count': 1}, {'id': 263, 'image_count': 42}, {'id': 264, 'image_count': 27}, {'id': 265, 'image_count': 2}, {'id': 266, 'image_count': 5}, {'id': 267, 'image_count': 95}, {'id': 268, 'image_count': 32}, {'id': 269, 'image_count': 1}, {'id': 270, 'image_count': 1}, {'id': 271, 'image_count': 1844}, {'id': 272, 'image_count': 897}, {'id': 273, 'image_count': 31}, {'id': 274, 'image_count': 23}, {'id': 275, 'image_count': 1}, {'id': 276, 'image_count': 202}, {'id': 277, 'image_count': 746}, {'id': 278, 'image_count': 44}, {'id': 279, 'image_count': 14}, {'id': 280, 'image_count': 26}, {'id': 281, 'image_count': 1}, {'id': 282, 'image_count': 2}, {'id': 283, 'image_count': 25}, {'id': 284, 'image_count': 238}, {'id': 285, 'image_count': 592}, {'id': 286, 'image_count': 26}, {'id': 287, 'image_count': 5}, {'id': 288, 'image_count': 42}, {'id': 289, 'image_count': 13}, {'id': 290, 'image_count': 46}, {'id': 291, 'image_count': 1}, {'id': 292, 'image_count': 8}, {'id': 293, 'image_count': 34}, {'id': 294, 'image_count': 5}, {'id': 295, 'image_count': 1}, {'id': 296, 'image_count': 1871}, {'id': 297, 'image_count': 717}, {'id': 298, 'image_count': 1010}, {'id': 299, 'image_count': 679}, {'id': 300, 'image_count': 3}, {'id': 301, 'image_count': 4}, {'id': 302, 'image_count': 1}, {'id': 303, 'image_count': 166}, {'id': 304, 'image_count': 2}, {'id': 305, 'image_count': 266}, {'id': 306, 'image_count': 101}, {'id': 307, 'image_count': 6}, {'id': 308, 'image_count': 14}, {'id': 309, 'image_count': 133}, {'id': 310, 'image_count': 2}, {'id': 311, 'image_count': 38}, {'id': 312, 'image_count': 95}, {'id': 313, 'image_count': 1}, {'id': 314, 'image_count': 12}, {'id': 315, 'image_count': 49}, {'id': 316, 'image_count': 5}, {'id': 317, 'image_count': 5}, {'id': 318, 'image_count': 16}, {'id': 319, 'image_count': 216}, {'id': 320, 'image_count': 12}, {'id': 321, 'image_count': 1}, {'id': 322, 'image_count': 54}, {'id': 323, 'image_count': 5}, {'id': 324, 'image_count': 245}, {'id': 325, 'image_count': 12}, {'id': 326, 'image_count': 7}, {'id': 327, 'image_count': 35}, {'id': 328, 'image_count': 36}, {'id': 329, 'image_count': 32}, {'id': 330, 'image_count': 1027}, {'id': 331, 'image_count': 10}, {'id': 332, 'image_count': 12}, {'id': 333, 'image_count': 1}, {'id': 334, 'image_count': 67}, {'id': 335, 'image_count': 71}, {'id': 336, 'image_count': 30}, {'id': 337, 'image_count': 48}, {'id': 338, 'image_count': 249}, {'id': 339, 'image_count': 13}, {'id': 340, 'image_count': 29}, {'id': 341, 'image_count': 14}, {'id': 342, 'image_count': 236}, {'id': 343, 'image_count': 15}, {'id': 344, 'image_count': 1521}, {'id': 345, 'image_count': 25}, {'id': 346, 'image_count': 249}, {'id': 347, 'image_count': 139}, {'id': 348, 'image_count': 2}, {'id': 349, 'image_count': 2}, {'id': 350, 'image_count': 1890}, {'id': 351, 'image_count': 1240}, {'id': 352, 'image_count': 1}, {'id': 353, 'image_count': 9}, {'id': 354, 'image_count': 1}, {'id': 355, 'image_count': 3}, {'id': 356, 'image_count': 11}, {'id': 357, 'image_count': 4}, {'id': 358, 'image_count': 236}, {'id': 359, 'image_count': 44}, {'id': 360, 'image_count': 19}, {'id': 361, 'image_count': 1100}, {'id': 362, 'image_count': 7}, {'id': 363, 'image_count': 69}, {'id': 364, 'image_count': 2}, {'id': 365, 'image_count': 8}, {'id': 366, 'image_count': 5}, {'id': 367, 'image_count': 227}, {'id': 368, 'image_count': 6}, {'id': 369, 'image_count': 106}, {'id': 370, 'image_count': 81}, {'id': 371, 'image_count': 17}, {'id': 372, 'image_count': 134}, {'id': 373, 'image_count': 312}, {'id': 374, 'image_count': 8}, {'id': 375, 'image_count': 271}, {'id': 376, 'image_count': 2}, {'id': 377, 'image_count': 103}, {'id': 378, 'image_count': 1938}, {'id': 379, 'image_count': 574}, {'id': 380, 'image_count': 120}, {'id': 381, 'image_count': 2}, {'id': 382, 'image_count': 2}, {'id': 383, 'image_count': 13}, {'id': 384, 'image_count': 29}, {'id': 385, 'image_count': 1710}, {'id': 386, 'image_count': 66}, {'id': 387, 'image_count': 1008}, {'id': 388, 'image_count': 1}, {'id': 389, 'image_count': 3}, {'id': 390, 'image_count': 1942}, {'id': 391, 'image_count': 19}, {'id': 392, 'image_count': 1488}, {'id': 393, 'image_count': 46}, {'id': 394, 'image_count': 106}, {'id': 395, 'image_count': 115}, {'id': 396, 'image_count': 19}, {'id': 397, 'image_count': 2}, {'id': 398, 'image_count': 1}, {'id': 399, 'image_count': 28}, {'id': 400, 'image_count': 9}, {'id': 401, 'image_count': 192}, {'id': 402, 'image_count': 12}, {'id': 403, 'image_count': 21}, {'id': 404, 'image_count': 247}, {'id': 405, 'image_count': 6}, {'id': 406, 'image_count': 64}, {'id': 407, 'image_count': 7}, {'id': 408, 'image_count': 40}, {'id': 409, 'image_count': 542}, {'id': 410, 'image_count': 2}, {'id': 411, 'image_count': 1898}, {'id': 412, 'image_count': 36}, {'id': 413, 'image_count': 4}, {'id': 414, 'image_count': 1}, {'id': 415, 'image_count': 191}, {'id': 416, 'image_count': 6}, {'id': 417, 'image_count': 41}, {'id': 418, 'image_count': 39}, {'id': 419, 'image_count': 46}, {'id': 420, 'image_count': 1}, {'id': 421, 'image_count': 1451}, {'id': 422, 'image_count': 1878}, {'id': 423, 'image_count': 11}, {'id': 424, 'image_count': 82}, {'id': 425, 'image_count': 18}, {'id': 426, 'image_count': 1}, {'id': 427, 'image_count': 7}, {'id': 428, 'image_count': 3}, {'id': 429, 'image_count': 575}, {'id': 430, 'image_count': 1907}, {'id': 431, 'image_count': 8}, {'id': 432, 'image_count': 4}, {'id': 433, 'image_count': 32}, {'id': 434, 'image_count': 11}, {'id': 435, 'image_count': 4}, {'id': 436, 'image_count': 54}, {'id': 437, 'image_count': 202}, {'id': 438, 'image_count': 32}, {'id': 439, 'image_count': 3}, {'id': 440, 'image_count': 130}, {'id': 441, 'image_count': 119}, {'id': 442, 'image_count': 141}, {'id': 443, 'image_count': 29}, {'id': 444, 'image_count': 525}, {'id': 445, 'image_count': 1323}, {'id': 446, 'image_count': 2}, {'id': 447, 'image_count': 113}, {'id': 448, 'image_count': 16}, {'id': 449, 'image_count': 7}, {'id': 450, 'image_count': 35}, {'id': 451, 'image_count': 1908}, {'id': 452, 'image_count': 353}, {'id': 453, 'image_count': 18}, {'id': 454, 'image_count': 14}, {'id': 455, 'image_count': 77}, {'id': 456, 'image_count': 8}, {'id': 457, 'image_count': 37}, {'id': 458, 'image_count': 1}, {'id': 459, 'image_count': 346}, {'id': 460, 'image_count': 19}, {'id': 461, 'image_count': 1779}, {'id': 462, 'image_count': 23}, {'id': 463, 'image_count': 25}, {'id': 464, 'image_count': 67}, {'id': 465, 'image_count': 19}, {'id': 466, 'image_count': 28}, {'id': 467, 'image_count': 4}, {'id': 468, 'image_count': 27}, {'id': 469, 'image_count': 1861}, {'id': 470, 'image_count': 11}, {'id': 471, 'image_count': 13}, {'id': 472, 'image_count': 13}, {'id': 473, 'image_count': 32}, {'id': 474, 'image_count': 1767}, {'id': 475, 'image_count': 42}, {'id': 476, 'image_count': 17}, {'id': 477, 'image_count': 128}, {'id': 478, 'image_count': 1}, {'id': 479, 'image_count': 9}, {'id': 480, 'image_count': 10}, {'id': 481, 'image_count': 4}, {'id': 482, 'image_count': 9}, {'id': 483, 'image_count': 18}, {'id': 484, 'image_count': 41}, {'id': 485, 'image_count': 28}, {'id': 486, 'image_count': 3}, {'id': 487, 'image_count': 65}, {'id': 488, 'image_count': 9}, {'id': 489, 'image_count': 23}, {'id': 490, 'image_count': 24}, {'id': 491, 'image_count': 1}, {'id': 492, 'image_count': 2}, {'id': 493, 'image_count': 59}, {'id': 494, 'image_count': 48}, {'id': 495, 'image_count': 17}, {'id': 496, 'image_count': 1877}, {'id': 497, 'image_count': 18}, {'id': 498, 'image_count': 1920}, {'id': 499, 'image_count': 50}, {'id': 500, 'image_count': 1890}, {'id': 501, 'image_count': 99}, {'id': 502, 'image_count': 1530}, {'id': 503, 'image_count': 3}, {'id': 504, 'image_count': 11}, {'id': 505, 'image_count': 19}, {'id': 506, 'image_count': 3}, {'id': 507, 'image_count': 63}, {'id': 508, 'image_count': 5}, {'id': 509, 'image_count': 6}, {'id': 510, 'image_count': 233}, {'id': 511, 'image_count': 54}, {'id': 512, 'image_count': 36}, {'id': 513, 'image_count': 10}, {'id': 514, 'image_count': 124}, {'id': 515, 'image_count': 101}, {'id': 516, 'image_count': 3}, {'id': 517, 'image_count': 363}, {'id': 518, 'image_count': 3}, {'id': 519, 'image_count': 30}, {'id': 520, 'image_count': 18}, {'id': 521, 'image_count': 199}, {'id': 522, 'image_count': 97}, {'id': 523, 'image_count': 32}, {'id': 524, 'image_count': 121}, {'id': 525, 'image_count': 16}, {'id': 526, 'image_count': 12}, {'id': 527, 'image_count': 2}, {'id': 528, 'image_count': 214}, {'id': 529, 'image_count': 48}, {'id': 530, 'image_count': 26}, {'id': 531, 'image_count': 13}, {'id': 532, 'image_count': 4}, {'id': 533, 'image_count': 11}, {'id': 534, 'image_count': 123}, {'id': 535, 'image_count': 7}, {'id': 536, 'image_count': 200}, {'id': 537, 'image_count': 91}, {'id': 538, 'image_count': 9}, {'id': 539, 'image_count': 72}, {'id': 540, 'image_count': 1886}, {'id': 541, 'image_count': 4}, {'id': 542, 'image_count': 1}, {'id': 543, 'image_count': 1}, {'id': 544, 'image_count': 1932}, {'id': 545, 'image_count': 4}, {'id': 546, 'image_count': 56}, {'id': 547, 'image_count': 854}, {'id': 548, 'image_count': 755}, {'id': 549, 'image_count': 1843}, {'id': 550, 'image_count': 96}, {'id': 551, 'image_count': 7}, {'id': 552, 'image_count': 74}, {'id': 553, 'image_count': 66}, {'id': 554, 'image_count': 57}, {'id': 555, 'image_count': 44}, {'id': 556, 'image_count': 1905}, {'id': 557, 'image_count': 4}, {'id': 558, 'image_count': 90}, {'id': 559, 'image_count': 1635}, {'id': 560, 'image_count': 8}, {'id': 561, 'image_count': 5}, {'id': 562, 'image_count': 50}, {'id': 563, 'image_count': 545}, {'id': 564, 'image_count': 20}, {'id': 565, 'image_count': 193}, {'id': 566, 'image_count': 285}, {'id': 567, 'image_count': 3}, {'id': 568, 'image_count': 1}, {'id': 569, 'image_count': 1904}, {'id': 570, 'image_count': 294}, {'id': 571, 'image_count': 3}, {'id': 572, 'image_count': 5}, {'id': 573, 'image_count': 24}, {'id': 574, 'image_count': 2}, {'id': 575, 'image_count': 2}, {'id': 576, 'image_count': 16}, {'id': 577, 'image_count': 8}, {'id': 578, 'image_count': 154}, {'id': 579, 'image_count': 66}, {'id': 580, 'image_count': 1}, {'id': 581, 'image_count': 24}, {'id': 582, 'image_count': 1}, {'id': 583, 'image_count': 4}, {'id': 584, 'image_count': 75}, {'id': 585, 'image_count': 6}, {'id': 586, 'image_count': 126}, {'id': 587, 'image_count': 24}, {'id': 588, 'image_count': 22}, {'id': 589, 'image_count': 1872}, {'id': 590, 'image_count': 16}, {'id': 591, 'image_count': 423}, {'id': 592, 'image_count': 1927}, {'id': 593, 'image_count': 38}, {'id': 594, 'image_count': 3}, {'id': 595, 'image_count': 1945}, {'id': 596, 'image_count': 35}, {'id': 597, 'image_count': 1}, {'id': 598, 'image_count': 13}, {'id': 599, 'image_count': 9}, {'id': 600, 'image_count': 14}, {'id': 601, 'image_count': 37}, {'id': 602, 'image_count': 3}, {'id': 603, 'image_count': 4}, {'id': 604, 'image_count': 100}, {'id': 605, 'image_count': 195}, {'id': 606, 'image_count': 1}, {'id': 607, 'image_count': 12}, {'id': 608, 'image_count': 24}, {'id': 609, 'image_count': 489}, {'id': 610, 'image_count': 10}, {'id': 611, 'image_count': 1689}, {'id': 612, 'image_count': 42}, {'id': 613, 'image_count': 81}, {'id': 614, 'image_count': 894}, {'id': 615, 'image_count': 1868}, {'id': 616, 'image_count': 7}, {'id': 617, 'image_count': 1567}, {'id': 618, 'image_count': 10}, {'id': 619, 'image_count': 8}, {'id': 620, 'image_count': 7}, {'id': 621, 'image_count': 629}, {'id': 622, 'image_count': 89}, {'id': 623, 'image_count': 15}, {'id': 624, 'image_count': 134}, {'id': 625, 'image_count': 4}, {'id': 626, 'image_count': 1802}, {'id': 627, 'image_count': 595}, {'id': 628, 'image_count': 1210}, {'id': 629, 'image_count': 48}, {'id': 630, 'image_count': 418}, {'id': 631, 'image_count': 1846}, {'id': 632, 'image_count': 5}, {'id': 633, 'image_count': 221}, {'id': 634, 'image_count': 10}, {'id': 635, 'image_count': 7}, {'id': 636, 'image_count': 76}, {'id': 637, 'image_count': 22}, {'id': 638, 'image_count': 10}, {'id': 639, 'image_count': 341}, {'id': 640, 'image_count': 1}, {'id': 641, 'image_count': 705}, {'id': 642, 'image_count': 1900}, {'id': 643, 'image_count': 188}, {'id': 644, 'image_count': 227}, {'id': 645, 'image_count': 861}, {'id': 646, 'image_count': 6}, {'id': 647, 'image_count': 115}, {'id': 648, 'image_count': 5}, {'id': 649, 'image_count': 43}, {'id': 650, 'image_count': 14}, {'id': 651, 'image_count': 6}, {'id': 652, 'image_count': 15}, {'id': 653, 'image_count': 1167}, {'id': 654, 'image_count': 15}, {'id': 655, 'image_count': 994}, {'id': 656, 'image_count': 28}, {'id': 657, 'image_count': 2}, {'id': 658, 'image_count': 338}, {'id': 659, 'image_count': 334}, {'id': 660, 'image_count': 15}, {'id': 661, 'image_count': 102}, {'id': 662, 'image_count': 1}, {'id': 663, 'image_count': 8}, {'id': 664, 'image_count': 1}, {'id': 665, 'image_count': 1}, {'id': 666, 'image_count': 28}, {'id': 667, 'image_count': 91}, {'id': 668, 'image_count': 260}, {'id': 669, 'image_count': 131}, {'id': 670, 'image_count': 128}, {'id': 671, 'image_count': 3}, {'id': 672, 'image_count': 10}, {'id': 673, 'image_count': 39}, {'id': 674, 'image_count': 2}, {'id': 675, 'image_count': 925}, {'id': 676, 'image_count': 354}, {'id': 677, 'image_count': 31}, {'id': 678, 'image_count': 10}, {'id': 679, 'image_count': 215}, {'id': 680, 'image_count': 71}, {'id': 681, 'image_count': 43}, {'id': 682, 'image_count': 28}, {'id': 683, 'image_count': 34}, {'id': 684, 'image_count': 16}, {'id': 685, 'image_count': 273}, {'id': 686, 'image_count': 2}, {'id': 687, 'image_count': 999}, {'id': 688, 'image_count': 4}, {'id': 689, 'image_count': 107}, {'id': 690, 'image_count': 2}, {'id': 691, 'image_count': 1}, {'id': 692, 'image_count': 454}, {'id': 693, 'image_count': 9}, {'id': 694, 'image_count': 1901}, {'id': 695, 'image_count': 61}, {'id': 696, 'image_count': 91}, {'id': 697, 'image_count': 46}, {'id': 698, 'image_count': 1402}, {'id': 699, 'image_count': 74}, {'id': 700, 'image_count': 421}, {'id': 701, 'image_count': 226}, {'id': 702, 'image_count': 10}, {'id': 703, 'image_count': 1720}, {'id': 704, 'image_count': 261}, {'id': 705, 'image_count': 1337}, {'id': 706, 'image_count': 293}, {'id': 707, 'image_count': 62}, {'id': 708, 'image_count': 814}, {'id': 709, 'image_count': 407}, {'id': 710, 'image_count': 6}, {'id': 711, 'image_count': 16}, {'id': 712, 'image_count': 7}, {'id': 713, 'image_count': 1791}, {'id': 714, 'image_count': 2}, {'id': 715, 'image_count': 1915}, {'id': 716, 'image_count': 1940}, {'id': 717, 'image_count': 13}, {'id': 718, 'image_count': 16}, {'id': 719, 'image_count': 448}, {'id': 720, 'image_count': 12}, {'id': 721, 'image_count': 18}, {'id': 722, 'image_count': 4}, {'id': 723, 'image_count': 71}, {'id': 724, 'image_count': 189}, {'id': 725, 'image_count': 74}, {'id': 726, 'image_count': 103}, {'id': 727, 'image_count': 3}, {'id': 728, 'image_count': 110}, {'id': 729, 'image_count': 5}, {'id': 730, 'image_count': 9}, {'id': 731, 'image_count': 15}, {'id': 732, 'image_count': 25}, {'id': 733, 'image_count': 7}, {'id': 734, 'image_count': 647}, {'id': 735, 'image_count': 824}, {'id': 736, 'image_count': 100}, {'id': 737, 'image_count': 47}, {'id': 738, 'image_count': 121}, {'id': 739, 'image_count': 731}, {'id': 740, 'image_count': 73}, {'id': 741, 'image_count': 49}, {'id': 742, 'image_count': 23}, {'id': 743, 'image_count': 4}, {'id': 744, 'image_count': 62}, {'id': 745, 'image_count': 118}, {'id': 746, 'image_count': 99}, {'id': 747, 'image_count': 40}, {'id': 748, 'image_count': 1036}, {'id': 749, 'image_count': 105}, {'id': 750, 'image_count': 21}, {'id': 751, 'image_count': 229}, {'id': 752, 'image_count': 7}, {'id': 753, 'image_count': 72}, {'id': 754, 'image_count': 9}, {'id': 755, 'image_count': 10}, {'id': 756, 'image_count': 328}, {'id': 757, 'image_count': 468}, {'id': 758, 'image_count': 1}, {'id': 759, 'image_count': 2}, {'id': 760, 'image_count': 24}, {'id': 761, 'image_count': 11}, {'id': 762, 'image_count': 72}, {'id': 763, 'image_count': 17}, {'id': 764, 'image_count': 10}, {'id': 765, 'image_count': 17}, {'id': 766, 'image_count': 489}, {'id': 767, 'image_count': 47}, {'id': 768, 'image_count': 93}, {'id': 769, 'image_count': 1}, {'id': 770, 'image_count': 12}, {'id': 771, 'image_count': 228}, {'id': 772, 'image_count': 5}, {'id': 773, 'image_count': 76}, {'id': 774, 'image_count': 71}, {'id': 775, 'image_count': 30}, {'id': 776, 'image_count': 109}, {'id': 777, 'image_count': 14}, {'id': 778, 'image_count': 1}, {'id': 779, 'image_count': 8}, {'id': 780, 'image_count': 26}, {'id': 781, 'image_count': 339}, {'id': 782, 'image_count': 153}, {'id': 783, 'image_count': 2}, {'id': 784, 'image_count': 3}, {'id': 785, 'image_count': 8}, {'id': 786, 'image_count': 47}, {'id': 787, 'image_count': 8}, {'id': 788, 'image_count': 6}, {'id': 789, 'image_count': 116}, {'id': 790, 'image_count': 69}, {'id': 791, 'image_count': 13}, {'id': 792, 'image_count': 6}, {'id': 793, 'image_count': 1928}, {'id': 794, 'image_count': 79}, {'id': 795, 'image_count': 14}, {'id': 796, 'image_count': 7}, {'id': 797, 'image_count': 20}, {'id': 798, 'image_count': 114}, {'id': 799, 'image_count': 221}, {'id': 800, 'image_count': 502}, {'id': 801, 'image_count': 62}, {'id': 802, 'image_count': 87}, {'id': 803, 'image_count': 4}, {'id': 804, 'image_count': 1912}, {'id': 805, 'image_count': 7}, {'id': 806, 'image_count': 186}, {'id': 807, 'image_count': 18}, {'id': 808, 'image_count': 4}, {'id': 809, 'image_count': 3}, {'id': 810, 'image_count': 7}, {'id': 811, 'image_count': 1413}, {'id': 812, 'image_count': 7}, {'id': 813, 'image_count': 12}, {'id': 814, 'image_count': 248}, {'id': 815, 'image_count': 4}, {'id': 816, 'image_count': 1881}, {'id': 817, 'image_count': 529}, {'id': 818, 'image_count': 1932}, {'id': 819, 'image_count': 50}, {'id': 820, 'image_count': 3}, {'id': 821, 'image_count': 28}, {'id': 822, 'image_count': 10}, {'id': 823, 'image_count': 5}, {'id': 824, 'image_count': 5}, {'id': 825, 'image_count': 18}, {'id': 826, 'image_count': 14}, {'id': 827, 'image_count': 1890}, {'id': 828, 'image_count': 660}, {'id': 829, 'image_count': 8}, {'id': 830, 'image_count': 25}, {'id': 831, 'image_count': 10}, {'id': 832, 'image_count': 218}, {'id': 833, 'image_count': 36}, {'id': 834, 'image_count': 16}, {'id': 835, 'image_count': 808}, {'id': 836, 'image_count': 479}, {'id': 837, 'image_count': 1404}, {'id': 838, 'image_count': 307}, {'id': 839, 'image_count': 57}, {'id': 840, 'image_count': 28}, {'id': 841, 'image_count': 80}, {'id': 842, 'image_count': 11}, {'id': 843, 'image_count': 92}, {'id': 844, 'image_count': 20}, {'id': 845, 'image_count': 194}, {'id': 846, 'image_count': 23}, {'id': 847, 'image_count': 52}, {'id': 848, 'image_count': 673}, {'id': 849, 'image_count': 2}, {'id': 850, 'image_count': 2}, {'id': 851, 'image_count': 1}, {'id': 852, 'image_count': 2}, {'id': 853, 'image_count': 8}, {'id': 854, 'image_count': 80}, {'id': 855, 'image_count': 3}, {'id': 856, 'image_count': 3}, {'id': 857, 'image_count': 15}, {'id': 858, 'image_count': 2}, {'id': 859, 'image_count': 10}, {'id': 860, 'image_count': 386}, {'id': 861, 'image_count': 65}, {'id': 862, 'image_count': 3}, {'id': 863, 'image_count': 35}, {'id': 864, 'image_count': 5}, {'id': 865, 'image_count': 180}, {'id': 866, 'image_count': 99}, {'id': 867, 'image_count': 49}, {'id': 868, 'image_count': 28}, {'id': 869, 'image_count': 1}, {'id': 870, 'image_count': 52}, {'id': 871, 'image_count': 36}, {'id': 872, 'image_count': 70}, {'id': 873, 'image_count': 6}, {'id': 874, 'image_count': 29}, {'id': 875, 'image_count': 24}, {'id': 876, 'image_count': 1115}, {'id': 877, 'image_count': 61}, {'id': 878, 'image_count': 18}, {'id': 879, 'image_count': 18}, {'id': 880, 'image_count': 665}, {'id': 881, 'image_count': 1096}, {'id': 882, 'image_count': 29}, {'id': 883, 'image_count': 8}, {'id': 884, 'image_count': 14}, {'id': 885, 'image_count': 1622}, {'id': 886, 'image_count': 2}, {'id': 887, 'image_count': 3}, {'id': 888, 'image_count': 32}, {'id': 889, 'image_count': 55}, {'id': 890, 'image_count': 1}, {'id': 891, 'image_count': 10}, {'id': 892, 'image_count': 10}, {'id': 893, 'image_count': 47}, {'id': 894, 'image_count': 3}, {'id': 895, 'image_count': 29}, {'id': 896, 'image_count': 342}, {'id': 897, 'image_count': 25}, {'id': 898, 'image_count': 1469}, {'id': 899, 'image_count': 521}, {'id': 900, 'image_count': 347}, {'id': 901, 'image_count': 35}, {'id': 902, 'image_count': 7}, {'id': 903, 'image_count': 207}, {'id': 904, 'image_count': 108}, {'id': 905, 'image_count': 2}, {'id': 906, 'image_count': 34}, {'id': 907, 'image_count': 12}, {'id': 908, 'image_count': 10}, {'id': 909, 'image_count': 13}, {'id': 910, 'image_count': 361}, {'id': 911, 'image_count': 1023}, {'id': 912, 'image_count': 782}, {'id': 913, 'image_count': 2}, {'id': 914, 'image_count': 5}, {'id': 915, 'image_count': 247}, {'id': 916, 'image_count': 221}, {'id': 917, 'image_count': 4}, {'id': 918, 'image_count': 8}, {'id': 919, 'image_count': 158}, {'id': 920, 'image_count': 3}, {'id': 921, 'image_count': 752}, {'id': 922, 'image_count': 64}, {'id': 923, 'image_count': 707}, {'id': 924, 'image_count': 143}, {'id': 925, 'image_count': 1}, {'id': 926, 'image_count': 49}, {'id': 927, 'image_count': 126}, {'id': 928, 'image_count': 76}, {'id': 929, 'image_count': 11}, {'id': 930, 'image_count': 11}, {'id': 931, 'image_count': 4}, {'id': 932, 'image_count': 39}, {'id': 933, 'image_count': 11}, {'id': 934, 'image_count': 13}, {'id': 935, 'image_count': 91}, {'id': 936, 'image_count': 14}, {'id': 937, 'image_count': 5}, {'id': 938, 'image_count': 3}, {'id': 939, 'image_count': 10}, {'id': 940, 'image_count': 18}, {'id': 941, 'image_count': 9}, {'id': 942, 'image_count': 6}, {'id': 943, 'image_count': 951}, {'id': 944, 'image_count': 2}, {'id': 945, 'image_count': 1}, {'id': 946, 'image_count': 19}, {'id': 947, 'image_count': 1942}, {'id': 948, 'image_count': 1916}, {'id': 949, 'image_count': 139}, {'id': 950, 'image_count': 43}, {'id': 951, 'image_count': 1969}, {'id': 952, 'image_count': 5}, {'id': 953, 'image_count': 134}, {'id': 954, 'image_count': 74}, {'id': 955, 'image_count': 381}, {'id': 956, 'image_count': 1}, {'id': 957, 'image_count': 381}, {'id': 958, 'image_count': 6}, {'id': 959, 'image_count': 1826}, {'id': 960, 'image_count': 28}, {'id': 961, 'image_count': 1635}, {'id': 962, 'image_count': 1967}, {'id': 963, 'image_count': 16}, {'id': 964, 'image_count': 1926}, {'id': 965, 'image_count': 1789}, {'id': 966, 'image_count': 401}, {'id': 967, 'image_count': 1968}, {'id': 968, 'image_count': 1167}, {'id': 969, 'image_count': 1}, {'id': 970, 'image_count': 56}, {'id': 971, 'image_count': 17}, {'id': 972, 'image_count': 1}, {'id': 973, 'image_count': 58}, {'id': 974, 'image_count': 9}, {'id': 975, 'image_count': 8}, {'id': 976, 'image_count': 1124}, {'id': 977, 'image_count': 31}, {'id': 978, 'image_count': 16}, {'id': 979, 'image_count': 491}, {'id': 980, 'image_count': 432}, {'id': 981, 'image_count': 1945}, {'id': 982, 'image_count': 1899}, {'id': 983, 'image_count': 5}, {'id': 984, 'image_count': 28}, {'id': 985, 'image_count': 7}, {'id': 986, 'image_count': 146}, {'id': 987, 'image_count': 1}, {'id': 988, 'image_count': 25}, {'id': 989, 'image_count': 22}, {'id': 990, 'image_count': 1}, {'id': 991, 'image_count': 10}, {'id': 992, 'image_count': 9}, {'id': 993, 'image_count': 308}, {'id': 994, 'image_count': 4}, {'id': 995, 'image_count': 1969}, {'id': 996, 'image_count': 45}, {'id': 997, 'image_count': 12}, {'id': 998, 'image_count': 1}, {'id': 999, 'image_count': 85}, {'id': 1000, 'image_count': 1127}, {'id': 1001, 'image_count': 11}, {'id': 1002, 'image_count': 60}, {'id': 1003, 'image_count': 1}, {'id': 1004, 'image_count': 16}, {'id': 1005, 'image_count': 1}, {'id': 1006, 'image_count': 65}, {'id': 1007, 'image_count': 13}, {'id': 1008, 'image_count': 655}, {'id': 1009, 'image_count': 51}, {'id': 1010, 'image_count': 1}, {'id': 1011, 'image_count': 673}, {'id': 1012, 'image_count': 5}, {'id': 1013, 'image_count': 36}, {'id': 1014, 'image_count': 54}, {'id': 1015, 'image_count': 5}, {'id': 1016, 'image_count': 8}, {'id': 1017, 'image_count': 305}, {'id': 1018, 'image_count': 297}, {'id': 1019, 'image_count': 1053}, {'id': 1020, 'image_count': 223}, {'id': 1021, 'image_count': 1037}, {'id': 1022, 'image_count': 63}, {'id': 1023, 'image_count': 1881}, {'id': 1024, 'image_count': 507}, {'id': 1025, 'image_count': 333}, {'id': 1026, 'image_count': 1911}, {'id': 1027, 'image_count': 1765}, {'id': 1028, 'image_count': 1}, {'id': 1029, 'image_count': 5}, {'id': 1030, 'image_count': 1}, {'id': 1031, 'image_count': 9}, {'id': 1032, 'image_count': 2}, {'id': 1033, 'image_count': 151}, {'id': 1034, 'image_count': 82}, {'id': 1035, 'image_count': 1931}, {'id': 1036, 'image_count': 41}, {'id': 1037, 'image_count': 1895}, {'id': 1038, 'image_count': 24}, {'id': 1039, 'image_count': 22}, {'id': 1040, 'image_count': 35}, {'id': 1041, 'image_count': 69}, {'id': 1042, 'image_count': 962}, {'id': 1043, 'image_count': 588}, {'id': 1044, 'image_count': 21}, {'id': 1045, 'image_count': 825}, {'id': 1046, 'image_count': 52}, {'id': 1047, 'image_count': 5}, {'id': 1048, 'image_count': 5}, {'id': 1049, 'image_count': 5}, {'id': 1050, 'image_count': 1860}, {'id': 1051, 'image_count': 56}, {'id': 1052, 'image_count': 1582}, {'id': 1053, 'image_count': 7}, {'id': 1054, 'image_count': 2}, {'id': 1055, 'image_count': 1562}, {'id': 1056, 'image_count': 1885}, {'id': 1057, 'image_count': 1}, {'id': 1058, 'image_count': 5}, {'id': 1059, 'image_count': 137}, {'id': 1060, 'image_count': 1094}, {'id': 1061, 'image_count': 134}, {'id': 1062, 'image_count': 29}, {'id': 1063, 'image_count': 22}, {'id': 1064, 'image_count': 522}, {'id': 1065, 'image_count': 50}, {'id': 1066, 'image_count': 68}, {'id': 1067, 'image_count': 16}, {'id': 1068, 'image_count': 40}, {'id': 1069, 'image_count': 35}, {'id': 1070, 'image_count': 135}, {'id': 1071, 'image_count': 1413}, {'id': 1072, 'image_count': 772}, {'id': 1073, 'image_count': 50}, {'id': 1074, 'image_count': 1015}, {'id': 1075, 'image_count': 1}, {'id': 1076, 'image_count': 65}, {'id': 1077, 'image_count': 1900}, {'id': 1078, 'image_count': 1302}, {'id': 1079, 'image_count': 1977}, {'id': 1080, 'image_count': 2}, {'id': 1081, 'image_count': 29}, {'id': 1082, 'image_count': 36}, {'id': 1083, 'image_count': 138}, {'id': 1084, 'image_count': 4}, {'id': 1085, 'image_count': 67}, {'id': 1086, 'image_count': 26}, {'id': 1087, 'image_count': 25}, {'id': 1088, 'image_count': 33}, {'id': 1089, 'image_count': 37}, {'id': 1090, 'image_count': 50}, {'id': 1091, 'image_count': 270}, {'id': 1092, 'image_count': 12}, {'id': 1093, 'image_count': 316}, {'id': 1094, 'image_count': 41}, {'id': 1095, 'image_count': 224}, {'id': 1096, 'image_count': 105}, {'id': 1097, 'image_count': 1925}, {'id': 1098, 'image_count': 1021}, {'id': 1099, 'image_count': 1213}, {'id': 1100, 'image_count': 172}, {'id': 1101, 'image_count': 28}, {'id': 1102, 'image_count': 745}, {'id': 1103, 'image_count': 187}, {'id': 1104, 'image_count': 147}, {'id': 1105, 'image_count': 136}, {'id': 1106, 'image_count': 34}, {'id': 1107, 'image_count': 41}, {'id': 1108, 'image_count': 636}, {'id': 1109, 'image_count': 570}, {'id': 1110, 'image_count': 1149}, {'id': 1111, 'image_count': 61}, {'id': 1112, 'image_count': 1890}, {'id': 1113, 'image_count': 18}, {'id': 1114, 'image_count': 143}, {'id': 1115, 'image_count': 1517}, {'id': 1116, 'image_count': 7}, {'id': 1117, 'image_count': 943}, {'id': 1118, 'image_count': 6}, {'id': 1119, 'image_count': 1}, {'id': 1120, 'image_count': 11}, {'id': 1121, 'image_count': 101}, {'id': 1122, 'image_count': 1909}, {'id': 1123, 'image_count': 800}, {'id': 1124, 'image_count': 1}, {'id': 1125, 'image_count': 44}, {'id': 1126, 'image_count': 3}, {'id': 1127, 'image_count': 44}, {'id': 1128, 'image_count': 31}, {'id': 1129, 'image_count': 7}, {'id': 1130, 'image_count': 20}, {'id': 1131, 'image_count': 11}, {'id': 1132, 'image_count': 13}, {'id': 1133, 'image_count': 1924}, {'id': 1134, 'image_count': 113}, {'id': 1135, 'image_count': 2}, {'id': 1136, 'image_count': 139}, {'id': 1137, 'image_count': 12}, {'id': 1138, 'image_count': 37}, {'id': 1139, 'image_count': 1866}, {'id': 1140, 'image_count': 47}, {'id': 1141, 'image_count': 1468}, {'id': 1142, 'image_count': 729}, {'id': 1143, 'image_count': 24}, {'id': 1144, 'image_count': 1}, {'id': 1145, 'image_count': 10}, {'id': 1146, 'image_count': 3}, {'id': 1147, 'image_count': 14}, {'id': 1148, 'image_count': 4}, {'id': 1149, 'image_count': 29}, {'id': 1150, 'image_count': 4}, {'id': 1151, 'image_count': 70}, {'id': 1152, 'image_count': 46}, {'id': 1153, 'image_count': 14}, {'id': 1154, 'image_count': 48}, {'id': 1155, 'image_count': 1855}, {'id': 1156, 'image_count': 113}, {'id': 1157, 'image_count': 1}, {'id': 1158, 'image_count': 1}, {'id': 1159, 'image_count': 10}, {'id': 1160, 'image_count': 54}, {'id': 1161, 'image_count': 1923}, {'id': 1162, 'image_count': 630}, {'id': 1163, 'image_count': 31}, {'id': 1164, 'image_count': 69}, {'id': 1165, 'image_count': 7}, {'id': 1166, 'image_count': 11}, {'id': 1167, 'image_count': 1}, {'id': 1168, 'image_count': 30}, {'id': 1169, 'image_count': 50}, {'id': 1170, 'image_count': 45}, {'id': 1171, 'image_count': 28}, {'id': 1172, 'image_count': 114}, {'id': 1173, 'image_count': 193}, {'id': 1174, 'image_count': 21}, {'id': 1175, 'image_count': 91}, {'id': 1176, 'image_count': 31}, {'id': 1177, 'image_count': 1469}, {'id': 1178, 'image_count': 1924}, {'id': 1179, 'image_count': 87}, {'id': 1180, 'image_count': 77}, {'id': 1181, 'image_count': 11}, {'id': 1182, 'image_count': 47}, {'id': 1183, 'image_count': 21}, {'id': 1184, 'image_count': 47}, {'id': 1185, 'image_count': 70}, {'id': 1186, 'image_count': 1838}, {'id': 1187, 'image_count': 19}, {'id': 1188, 'image_count': 531}, {'id': 1189, 'image_count': 11}, {'id': 1190, 'image_count': 941}, {'id': 1191, 'image_count': 113}, {'id': 1192, 'image_count': 26}, {'id': 1193, 'image_count': 5}, {'id': 1194, 'image_count': 56}, {'id': 1195, 'image_count': 73}, {'id': 1196, 'image_count': 32}, {'id': 1197, 'image_count': 128}, {'id': 1198, 'image_count': 623}, {'id': 1199, 'image_count': 12}, {'id': 1200, 'image_count': 52}, {'id': 1201, 'image_count': 11}, {'id': 1202, 'image_count': 1674}, {'id': 1203, 'image_count': 81}]  # noqa
+# fmt: on
diff --git a/detectron2/data/datasets/pascal_voc.py b/detectron2/data/datasets/pascal_voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..46f8536ad26f4d47a53a95bed62548d8aff5047e
--- /dev/null
+++ b/detectron2/data/datasets/pascal_voc.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import numpy as np
+import os
+import xml.etree.ElementTree as ET
+from typing import List, Tuple, Union
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from detectron2.utils.file_io import PathManager
+
+__all__ = ["load_voc_instances", "register_pascal_voc"]
+
+
+# fmt: off
+CLASS_NAMES = (
+    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
+    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
+    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
+)
+# fmt: on
+
+
+def load_voc_instances(dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
+    """
+    Load Pascal VOC detection annotations to Detectron2 format.
+
+    Args:
+        dirname: Contain "Annotations", "ImageSets", "JPEGImages"
+        split (str): one of "train", "test", "val", "trainval"
+        class_names: list or tuple of class names
+    """
+    with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f:
+        fileids = np.loadtxt(f, dtype=str)
+
+    # Needs to read many small annotation files. Makes sense at local
+    annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/"))
+    dicts = []
+    for fileid in fileids:
+        anno_file = os.path.join(annotation_dirname, fileid + ".xml")
+        jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg")
+
+        with PathManager.open(anno_file) as f:
+            tree = ET.parse(f)
+
+        r = {
+            "file_name": jpeg_file,
+            "image_id": fileid,
+            "height": int(tree.findall("./size/height")[0].text),
+            "width": int(tree.findall("./size/width")[0].text),
+        }
+        instances = []
+
+        for obj in tree.findall("object"):
+            cls = obj.find("name").text
+            # We include "difficult" samples in training.
+            # Based on limited experiments, they don't hurt accuracy.
+            # difficult = int(obj.find("difficult").text)
+            # if difficult == 1:
+            # continue
+            bbox = obj.find("bndbox")
+            bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]]
+            # Original annotations are integers in the range [1, W or H]
+            # Assuming they mean 1-based pixel indices (inclusive),
+            # a box with annotation (xmin=1, xmax=W) covers the whole image.
+            # In coordinate space this is represented by (xmin=0, xmax=W)
+            bbox[0] -= 1.0
+            bbox[1] -= 1.0
+            instances.append(
+                {"category_id": class_names.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS}
+            )
+        r["annotations"] = instances
+        dicts.append(r)
+    return dicts
+
+
+def register_pascal_voc(name, dirname, split, year, class_names=CLASS_NAMES):
+    DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split, class_names))
+    MetadataCatalog.get(name).set(
+        thing_classes=list(class_names), dirname=dirname, year=year, split=split
+    )
diff --git a/detectron2/data/datasets/register_coco.py b/detectron2/data/datasets/register_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e564438d5bf016bcdbb65b4bbdc215d79f579f8a
--- /dev/null
+++ b/detectron2/data/datasets/register_coco.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .coco import register_coco_instances  # noqa
+from .coco_panoptic import register_coco_panoptic_separated  # noqa
diff --git a/detectron2/data/detection_utils.py b/detectron2/data/detection_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ada19bdb4a2aa74874da4dba5d179ce38201c85d
--- /dev/null
+++ b/detectron2/data/detection_utils.py
@@ -0,0 +1,659 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+Common data processing utilities that are used in a
+typical object detection data pipeline.
+"""
+import logging
+import numpy as np
+from typing import List, Union
+import pycocotools.mask as mask_util
+import torch
+from PIL import Image
+
+from detectron2.structures import (
+    BitMasks,
+    Boxes,
+    BoxMode,
+    Instances,
+    Keypoints,
+    PolygonMasks,
+    RotatedBoxes,
+    polygons_to_bitmask,
+)
+from detectron2.utils.file_io import PathManager
+
+from . import transforms as T
+from .catalog import MetadataCatalog
+
+__all__ = [
+    "SizeMismatchError",
+    "convert_image_to_rgb",
+    "check_image_size",
+    "transform_proposals",
+    "transform_instance_annotations",
+    "annotations_to_instances",
+    "annotations_to_instances_rotated",
+    "build_augmentation",
+    "build_transform_gen",
+    "create_keypoint_hflip_indices",
+    "filter_empty_instances",
+    "read_image",
+]
+
+
+class SizeMismatchError(ValueError):
+    """
+    When loaded image has difference width/height compared with annotation.
+    """
+
+
+# https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601
+_M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]]
+_M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]]
+
+# https://www.exiv2.org/tags.html
+_EXIF_ORIENT = 274  # exif 'Orientation' tag
+
+
+def convert_PIL_to_numpy(image, format):
+    """
+    Convert PIL image to numpy array of target format.
+
+    Args:
+        image (PIL.Image): a PIL image
+        format (str): the format of output image
+
+    Returns:
+        (np.ndarray): also see `read_image`
+    """
+    if format is not None:
+        # PIL only supports RGB, so convert to RGB and flip channels over below
+        conversion_format = format
+        if format in ["BGR", "YUV-BT.601"]:
+            conversion_format = "RGB"
+        image = image.convert(conversion_format)
+    image = np.asarray(image)
+    # PIL squeezes out the channel dimension for "L", so make it HWC
+    if format == "L":
+        image = np.expand_dims(image, -1)
+
+    # handle formats not supported by PIL
+    elif format == "BGR":
+        # flip channels if needed
+        image = image[:, :, ::-1]
+    elif format == "YUV-BT.601":
+        image = image / 255.0
+        image = np.dot(image, np.array(_M_RGB2YUV).T)
+
+    return image
+
+
+def convert_image_to_rgb(image, format):
+    """
+    Convert an image from given format to RGB.
+
+    Args:
+        image (np.ndarray or Tensor): an HWC image
+        format (str): the format of input image, also see `read_image`
+
+    Returns:
+        (np.ndarray): (H,W,3) RGB image in 0-255 range, can be either float or uint8
+    """
+    if isinstance(image, torch.Tensor):
+        image = image.cpu().numpy()
+    if format == "BGR":
+        image = image[:, :, [2, 1, 0]]
+    elif format == "YUV-BT.601":
+        image = np.dot(image, np.array(_M_YUV2RGB).T)
+        image = image * 255.0
+    else:
+        if format == "L":
+            image = image[:, :, 0]
+        image = image.astype(np.uint8)
+        image = np.asarray(Image.fromarray(image, mode=format).convert("RGB"))
+    return image
+
+
+def _apply_exif_orientation(image):
+    """
+    Applies the exif orientation correctly.
+
+    This code exists per the bug:
+      https://github.com/python-pillow/Pillow/issues/3973
+    with the function `ImageOps.exif_transpose`. The Pillow source raises errors with
+    various methods, especially `tobytes`
+
+    Function based on:
+      https://github.com/wkentaro/labelme/blob/v4.5.4/labelme/utils/image.py#L59
+      https://github.com/python-pillow/Pillow/blob/7.1.2/src/PIL/ImageOps.py#L527
+
+    Args:
+        image (PIL.Image): a PIL image
+
+    Returns:
+        (PIL.Image): the PIL image with exif orientation applied, if applicable
+    """
+    if not hasattr(image, "getexif"):
+        return image
+
+    try:
+        exif = image.getexif()
+    except Exception:  # https://github.com/facebookresearch/detectron2/issues/1885
+        exif = None
+
+    if exif is None:
+        return image
+
+    orientation = exif.get(_EXIF_ORIENT)
+
+    method = {
+        2: Image.FLIP_LEFT_RIGHT,
+        3: Image.ROTATE_180,
+        4: Image.FLIP_TOP_BOTTOM,
+        5: Image.TRANSPOSE,
+        6: Image.ROTATE_270,
+        7: Image.TRANSVERSE,
+        8: Image.ROTATE_90,
+    }.get(orientation)
+
+    if method is not None:
+        return image.transpose(method)
+    return image
+
+
+def read_image(file_name, format=None):
+    """
+    Read an image into the given format.
+    Will apply rotation and flipping if the image has such exif information.
+
+    Args:
+        file_name (str): image file path
+        format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601".
+
+    Returns:
+        image (np.ndarray):
+            an HWC image in the given format, which is 0-255, uint8 for
+            supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601.
+    """
+    with PathManager.open(file_name, "rb") as f:
+        image = Image.open(f)
+
+        # work around this bug: https://github.com/python-pillow/Pillow/issues/3973
+        image = _apply_exif_orientation(image)
+        return convert_PIL_to_numpy(image, format)
+
+
+def check_image_size(dataset_dict, image):
+    """
+    Raise an error if the image does not match the size specified in the dict.
+    """
+    if "width" in dataset_dict or "height" in dataset_dict:
+        image_wh = (image.shape[1], image.shape[0])
+        expected_wh = (dataset_dict["width"], dataset_dict["height"])
+        if not image_wh == expected_wh:
+            raise SizeMismatchError(
+                "Mismatched image shape{}, got {}, expect {}.".format(
+                    " for image " + dataset_dict["file_name"]
+                    if "file_name" in dataset_dict
+                    else "",
+                    image_wh,
+                    expected_wh,
+                )
+                + " Please check the width/height in your annotation."
+            )
+
+    # To ensure bbox always remap to original image size
+    if "width" not in dataset_dict:
+        dataset_dict["width"] = image.shape[1]
+    if "height" not in dataset_dict:
+        dataset_dict["height"] = image.shape[0]
+
+
+def transform_proposals(dataset_dict, image_shape, transforms, *, proposal_topk, min_box_size=0):
+    """
+    Apply transformations to the proposals in dataset_dict, if any.
+
+    Args:
+        dataset_dict (dict): a dict read from the dataset, possibly
+            contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode"
+        image_shape (tuple): height, width
+        transforms (TransformList):
+        proposal_topk (int): only keep top-K scoring proposals
+        min_box_size (int): proposals with either side smaller than this
+            threshold are removed
+
+    The input dict is modified in-place, with abovementioned keys removed. A new
+    key "proposals" will be added. Its value is an `Instances`
+    object which contains the transformed proposals in its field
+    "proposal_boxes" and "objectness_logits".
+    """
+    if "proposal_boxes" in dataset_dict:
+        # Transform proposal boxes
+        boxes = transforms.apply_box(
+            BoxMode.convert(
+                dataset_dict.pop("proposal_boxes"),
+                dataset_dict.pop("proposal_bbox_mode"),
+                BoxMode.XYXY_ABS,
+            )
+        )
+        boxes = Boxes(boxes)
+        objectness_logits = torch.as_tensor(
+            dataset_dict.pop("proposal_objectness_logits").astype("float32")
+        )
+
+        boxes.clip(image_shape)
+        keep = boxes.nonempty(threshold=min_box_size)
+        boxes = boxes[keep]
+        objectness_logits = objectness_logits[keep]
+
+        proposals = Instances(image_shape)
+        proposals.proposal_boxes = boxes[:proposal_topk]
+        proposals.objectness_logits = objectness_logits[:proposal_topk]
+        dataset_dict["proposals"] = proposals
+
+
+def get_bbox(annotation):
+    """
+    Get bbox from data
+    Args:
+        annotation (dict): dict of instance annotations for a single instance.
+    Returns:
+        bbox (ndarray): x1, y1, x2, y2 coordinates
+    """
+    # bbox is 1d (per-instance bounding box)
+    bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
+    return bbox
+
+
+def transform_instance_annotations(
+    annotation, transforms, image_size, *, keypoint_hflip_indices=None
+):
+    """
+    Apply transforms to box, segmentation and keypoints annotations of a single instance.
+
+    It will use `transforms.apply_box` for the box, and
+    `transforms.apply_coords` for segmentation polygons & keypoints.
+    If you need anything more specially designed for each data structure,
+    you'll need to implement your own version of this function or the transforms.
+
+    Args:
+        annotation (dict): dict of instance annotations for a single instance.
+            It will be modified in-place.
+        transforms (TransformList or list[Transform]):
+        image_size (tuple): the height, width of the transformed image
+        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
+
+    Returns:
+        dict:
+            the same input dict with fields "bbox", "segmentation", "keypoints"
+            transformed according to `transforms`.
+            The "bbox_mode" field will be set to XYXY_ABS.
+    """
+    if isinstance(transforms, (tuple, list)):
+        transforms = T.TransformList(transforms)
+    # bbox is 1d (per-instance bounding box)
+    bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
+    # clip transformed bbox to image size
+    bbox = transforms.apply_box(np.array([bbox]))[0].clip(min=0)
+    annotation["bbox"] = np.minimum(bbox, list(image_size + image_size)[::-1])
+    annotation["bbox_mode"] = BoxMode.XYXY_ABS
+
+    if "segmentation" in annotation:
+        # each instance contains 1 or more polygons
+        segm = annotation["segmentation"]
+        if isinstance(segm, list):
+            # polygons
+            polygons = [np.asarray(p).reshape(-1, 2) for p in segm]
+            annotation["segmentation"] = [
+                p.reshape(-1) for p in transforms.apply_polygons(polygons)
+            ]
+        elif isinstance(segm, dict):
+            # RLE
+            mask = mask_util.decode(segm)
+            mask = transforms.apply_segmentation(mask)
+            assert tuple(mask.shape[:2]) == image_size
+            annotation["segmentation"] = mask
+        else:
+            raise ValueError(
+                "Cannot transform segmentation of type '{}'!"
+                "Supported types are: polygons as list[list[float] or ndarray],"
+                " COCO-style RLE as a dict.".format(type(segm))
+            )
+
+    if "keypoints" in annotation:
+        keypoints = transform_keypoint_annotations(
+            annotation["keypoints"], transforms, image_size, keypoint_hflip_indices
+        )
+        annotation["keypoints"] = keypoints
+
+    return annotation
+
+
+def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None):
+    """
+    Transform keypoint annotations of an image.
+    If a keypoint is transformed out of image boundary, it will be marked "unlabeled" (visibility=0)
+
+    Args:
+        keypoints (list[float]): Nx3 float in Detectron2's Dataset format.
+            Each point is represented by (x, y, visibility).
+        transforms (TransformList):
+        image_size (tuple): the height, width of the transformed image
+        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
+            When `transforms` includes horizontal flip, will use the index
+            mapping to flip keypoints.
+    """
+    # (N*3,) -> (N, 3)
+    keypoints = np.asarray(keypoints, dtype="float64").reshape(-1, 3)
+    keypoints_xy = transforms.apply_coords(keypoints[:, :2])
+
+    # Set all out-of-boundary points to "unlabeled"
+    inside = (keypoints_xy >= np.array([0, 0])) & (keypoints_xy <= np.array(image_size[::-1]))
+    inside = inside.all(axis=1)
+    keypoints[:, :2] = keypoints_xy
+    keypoints[:, 2][~inside] = 0
+
+    # This assumes that HorizFlipTransform is the only one that does flip
+    do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+
+    # Alternative way: check if probe points was horizontally flipped.
+    # probe = np.asarray([[0.0, 0.0], [image_width, 0.0]])
+    # probe_aug = transforms.apply_coords(probe.copy())
+    # do_hflip = np.sign(probe[1][0] - probe[0][0]) != np.sign(probe_aug[1][0] - probe_aug[0][0])  # noqa
+
+    # If flipped, swap each keypoint with its opposite-handed equivalent
+    if do_hflip:
+        if keypoint_hflip_indices is None:
+            raise ValueError("Cannot flip keypoints without providing flip indices!")
+        if len(keypoints) != len(keypoint_hflip_indices):
+            raise ValueError(
+                "Keypoint data has {} points, but metadata "
+                "contains {} points!".format(len(keypoints), len(keypoint_hflip_indices))
+            )
+        keypoints = keypoints[np.asarray(keypoint_hflip_indices, dtype=np.int32), :]
+
+    # Maintain COCO convention that if visibility == 0 (unlabeled), then x, y = 0
+    keypoints[keypoints[:, 2] == 0] = 0
+    return keypoints
+
+
+def annotations_to_instances(annos, image_size, mask_format="polygon"):
+    """
+    Create an :class:`Instances` object used by the models,
+    from instance annotations in the dataset dict.
+
+    Args:
+        annos (list[dict]): a list of instance annotations in one image, each
+            element for one instance.
+        image_size (tuple): height, width
+
+    Returns:
+        Instances:
+            It will contain fields "gt_boxes", "gt_classes",
+            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
+            This is the format that builtin models expect.
+    """
+    boxes = (
+        np.stack(
+            [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
+        )
+        if len(annos)
+        else np.zeros((0, 4))
+    )
+    target = Instances(image_size)
+    target.gt_boxes = Boxes(boxes)
+
+    classes = [int(obj["category_id"]) for obj in annos]
+    classes = torch.tensor(classes, dtype=torch.int64)
+    target.gt_classes = classes
+
+    if len(annos) and "segmentation" in annos[0]:
+        segms = [obj["segmentation"] for obj in annos]
+        if mask_format == "polygon":
+            try:
+                masks = PolygonMasks(segms)
+            except ValueError as e:
+                raise ValueError(
+                    "Failed to use mask_format=='polygon' from the given annotations!"
+                ) from e
+        else:
+            assert mask_format == "bitmask", mask_format
+            masks = []
+            for segm in segms:
+                if isinstance(segm, list):
+                    # polygon
+                    masks.append(polygons_to_bitmask(segm, *image_size))
+                elif isinstance(segm, dict):
+                    # COCO RLE
+                    masks.append(mask_util.decode(segm))
+                elif isinstance(segm, np.ndarray):
+                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
+                        segm.ndim
+                    )
+                    # mask array
+                    masks.append(segm)
+                else:
+                    raise ValueError(
+                        "Cannot convert segmentation of type '{}' to BitMasks!"
+                        "Supported types are: polygons as list[list[float] or ndarray],"
+                        " COCO-style RLE as a dict, or a binary segmentation mask "
+                        " in a 2D numpy array of shape HxW.".format(type(segm))
+                    )
+            # torch.from_numpy does not support array with negative stride.
+            masks = BitMasks(
+                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
+            )
+        target.gt_masks = masks
+
+    if len(annos) and "keypoints" in annos[0]:
+        kpts = [obj.get("keypoints", []) for obj in annos]
+        target.gt_keypoints = Keypoints(kpts)
+
+    return target
+
+
+def annotations_to_instances_rotated(annos, image_size):
+    """
+    Create an :class:`Instances` object used by the models,
+    from instance annotations in the dataset dict.
+    Compared to `annotations_to_instances`, this function is for rotated boxes only
+
+    Args:
+        annos (list[dict]): a list of instance annotations in one image, each
+            element for one instance.
+        image_size (tuple): height, width
+
+    Returns:
+        Instances:
+            Containing fields "gt_boxes", "gt_classes",
+            if they can be obtained from `annos`.
+            This is the format that builtin models expect.
+    """
+    boxes = [obj["bbox"] for obj in annos]
+    target = Instances(image_size)
+    boxes = target.gt_boxes = RotatedBoxes(boxes)
+    boxes.clip(image_size)
+
+    classes = [obj["category_id"] for obj in annos]
+    classes = torch.tensor(classes, dtype=torch.int64)
+    target.gt_classes = classes
+
+    return target
+
+
+def filter_empty_instances(
+    instances, by_box=True, by_mask=True, box_threshold=1e-5, return_mask=False
+):
+    """
+    Filter out empty instances in an `Instances` object.
+
+    Args:
+        instances (Instances):
+        by_box (bool): whether to filter out instances with empty boxes
+        by_mask (bool): whether to filter out instances with empty masks
+        box_threshold (float): minimum width and height to be considered non-empty
+        return_mask (bool): whether to return boolean mask of filtered instances
+
+    Returns:
+        Instances: the filtered instances.
+        tensor[bool], optional: boolean mask of filtered instances
+    """
+    assert by_box or by_mask
+    r = []
+    if by_box:
+        r.append(instances.gt_boxes.nonempty(threshold=box_threshold))
+    if instances.has("gt_masks") and by_mask:
+        r.append(instances.gt_masks.nonempty())
+
+    # TODO: can also filter visible keypoints
+
+    if not r:
+        return instances
+    m = r[0]
+    for x in r[1:]:
+        m = m & x
+    if return_mask:
+        return instances[m], m
+    return instances[m]
+
+
+def create_keypoint_hflip_indices(dataset_names: Union[str, List[str]]) -> List[int]:
+    """
+    Args:
+        dataset_names: list of dataset names
+
+    Returns:
+        list[int]: a list of size=#keypoints, storing the
+        horizontally-flipped keypoint indices.
+    """
+    if isinstance(dataset_names, str):
+        dataset_names = [dataset_names]
+
+    check_metadata_consistency("keypoint_names", dataset_names)
+    check_metadata_consistency("keypoint_flip_map", dataset_names)
+
+    meta = MetadataCatalog.get(dataset_names[0])
+    names = meta.keypoint_names
+    # TODO flip -> hflip
+    flip_map = dict(meta.keypoint_flip_map)
+    flip_map.update({v: k for k, v in flip_map.items()})
+    flipped_names = [i if i not in flip_map else flip_map[i] for i in names]
+    flip_indices = [names.index(i) for i in flipped_names]
+    return flip_indices
+
+
+def get_fed_loss_cls_weights(dataset_names: Union[str, List[str]], freq_weight_power=1.0):
+    """
+    Get frequency weight for each class sorted by class id.
+    We now calcualte freqency weight using image_count to the power freq_weight_power.
+
+    Args:
+        dataset_names: list of dataset names
+        freq_weight_power: power value
+    """
+    if isinstance(dataset_names, str):
+        dataset_names = [dataset_names]
+
+    check_metadata_consistency("class_image_count", dataset_names)
+
+    meta = MetadataCatalog.get(dataset_names[0])
+    class_freq_meta = meta.class_image_count
+    class_freq = torch.tensor(
+        [c["image_count"] for c in sorted(class_freq_meta, key=lambda x: x["id"])]
+    )
+    class_freq_weight = class_freq.float() ** freq_weight_power
+    return class_freq_weight
+
+
+def gen_crop_transform_with_instance(crop_size, image_size, instance):
+    """
+    Generate a CropTransform so that the cropping region contains
+    the center of the given instance.
+
+    Args:
+        crop_size (tuple): h, w in pixels
+        image_size (tuple): h, w
+        instance (dict): an annotation dict of one instance, in Detectron2's
+            dataset format.
+    """
+    crop_size = np.asarray(crop_size, dtype=np.int32)
+    bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS)
+    center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5
+    assert (
+        image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1]
+    ), "The annotation bounding box is outside of the image!"
+    assert (
+        image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1]
+    ), "Crop size is larger than image size!"
+
+    min_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0)
+    max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0)
+    max_yx = np.minimum(max_yx, np.ceil(center_yx).astype(np.int32))
+
+    y0 = np.random.randint(min_yx[0], max_yx[0] + 1)
+    x0 = np.random.randint(min_yx[1], max_yx[1] + 1)
+    return T.CropTransform(x0, y0, crop_size[1], crop_size[0])
+
+
+def check_metadata_consistency(key, dataset_names):
+    """
+    Check that the datasets have consistent metadata.
+
+    Args:
+        key (str): a metadata key
+        dataset_names (list[str]): a list of dataset names
+
+    Raises:
+        AttributeError: if the key does not exist in the metadata
+        ValueError: if the given datasets do not have the same metadata values defined by key
+    """
+    if len(dataset_names) == 0:
+        return
+    logger = logging.getLogger(__name__)
+    entries_per_dataset = [getattr(MetadataCatalog.get(d), key) for d in dataset_names]
+    for idx, entry in enumerate(entries_per_dataset):
+        if entry != entries_per_dataset[0]:
+            logger.error(
+                "Metadata '{}' for dataset '{}' is '{}'".format(key, dataset_names[idx], str(entry))
+            )
+            logger.error(
+                "Metadata '{}' for dataset '{}' is '{}'".format(
+                    key, dataset_names[0], str(entries_per_dataset[0])
+                )
+            )
+            raise ValueError("Datasets have different metadata '{}'!".format(key))
+
+
+def build_augmentation(cfg, is_train):
+    """
+    Create a list of default :class:`Augmentation` from config.
+    Now it includes resizing and flipping.
+
+    Returns:
+        list[Augmentation]
+    """
+    if is_train:
+        min_size = cfg.INPUT.MIN_SIZE_TRAIN
+        max_size = cfg.INPUT.MAX_SIZE_TRAIN
+        sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
+    else:
+        min_size = cfg.INPUT.MIN_SIZE_TEST
+        max_size = cfg.INPUT.MAX_SIZE_TEST
+        sample_style = "choice"
+    augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
+    if is_train and cfg.INPUT.RANDOM_FLIP != "none":
+        augmentation.append(
+            T.RandomFlip(
+                horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
+                vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
+            )
+        )
+    return augmentation
+
+
+build_transform_gen = build_augmentation
+"""
+Alias for backward-compatibility.
+"""
diff --git a/detectron2/data/samplers/__init__.py b/detectron2/data/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..85c9f1a9df8a4038fbd4246239b699402e382309
--- /dev/null
+++ b/detectron2/data/samplers/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .distributed_sampler import (
+    InferenceSampler,
+    RandomSubsetTrainingSampler,
+    RepeatFactorTrainingSampler,
+    TrainingSampler,
+)
+
+from .grouped_batch_sampler import GroupedBatchSampler
+
+__all__ = [
+    "GroupedBatchSampler",
+    "TrainingSampler",
+    "RandomSubsetTrainingSampler",
+    "InferenceSampler",
+    "RepeatFactorTrainingSampler",
+]
diff --git a/detectron2/data/samplers/__pycache__/__init__.cpython-310.pyc b/detectron2/data/samplers/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2894c9958ab3e409fb823fe1cd2caaab49601f46
Binary files /dev/null and b/detectron2/data/samplers/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/data/samplers/__pycache__/__init__.cpython-39.pyc b/detectron2/data/samplers/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..153d0f85ab5c41f8411433c413949180b9147361
Binary files /dev/null and b/detectron2/data/samplers/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/data/samplers/__pycache__/distributed_sampler.cpython-310.pyc b/detectron2/data/samplers/__pycache__/distributed_sampler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f8c9dc92baebc6263c8199a601d87b1e263392e
Binary files /dev/null and b/detectron2/data/samplers/__pycache__/distributed_sampler.cpython-310.pyc differ
diff --git a/detectron2/data/samplers/__pycache__/distributed_sampler.cpython-39.pyc b/detectron2/data/samplers/__pycache__/distributed_sampler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54fe61f3ad319db1e497856b3226a323c29f3f2d
Binary files /dev/null and b/detectron2/data/samplers/__pycache__/distributed_sampler.cpython-39.pyc differ
diff --git a/detectron2/data/samplers/__pycache__/grouped_batch_sampler.cpython-310.pyc b/detectron2/data/samplers/__pycache__/grouped_batch_sampler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c76d67b442cad260a07c4cdc9d50ecc2225f14a
Binary files /dev/null and b/detectron2/data/samplers/__pycache__/grouped_batch_sampler.cpython-310.pyc differ
diff --git a/detectron2/data/samplers/__pycache__/grouped_batch_sampler.cpython-39.pyc b/detectron2/data/samplers/__pycache__/grouped_batch_sampler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7378c80de6f89e3721c110395949a953b8ad9e2a
Binary files /dev/null and b/detectron2/data/samplers/__pycache__/grouped_batch_sampler.cpython-39.pyc differ
diff --git a/detectron2/data/samplers/distributed_sampler.py b/detectron2/data/samplers/distributed_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..a098e6ac07c1b193fddcb69e6e54aced82e6081c
--- /dev/null
+++ b/detectron2/data/samplers/distributed_sampler.py
@@ -0,0 +1,278 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+import math
+from collections import defaultdict
+from typing import Optional
+import torch
+from torch.utils.data.sampler import Sampler
+
+from detectron2.utils import comm
+
+logger = logging.getLogger(__name__)
+
+
+class TrainingSampler(Sampler):
+    """
+    In training, we only care about the "infinite stream" of training data.
+    So this sampler produces an infinite stream of indices and
+    all workers cooperate to correctly shuffle the indices and sample different indices.
+
+    The samplers in each worker effectively produces `indices[worker_id::num_workers]`
+    where `indices` is an infinite stream of indices consisting of
+    `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True)
+    or `range(size) + range(size) + ...` (if shuffle is False)
+
+    Note that this sampler does not shard based on pytorch DataLoader worker id.
+    A sampler passed to pytorch DataLoader is used only with map-style dataset
+    and will not be executed inside workers.
+    But if this sampler is used in a way that it gets execute inside a dataloader
+    worker, then extra work needs to be done to shard its outputs based on worker id.
+    This is required so that workers don't produce identical data.
+    :class:`ToIterableDataset` implements this logic.
+    This note is true for all samplers in detectron2.
+    """
+
+    def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+            shuffle (bool): whether to shuffle the indices or not
+            seed (int): the initial seed of the shuffle. Must be the same
+                across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+        """
+        if not isinstance(size, int):
+            raise TypeError(f"TrainingSampler(size=) expects an int. Got type {type(size)}.")
+        if size <= 0:
+            raise ValueError(f"TrainingSampler(size=) expects a positive int. Got {size}.")
+        self._size = size
+        self._shuffle = shuffle
+        if seed is None:
+            seed = comm.shared_random_seed()
+        self._seed = int(seed)
+
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+
+    def __iter__(self):
+        start = self._rank
+        yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
+
+    def _infinite_indices(self):
+        g = torch.Generator()
+        g.manual_seed(self._seed)
+        while True:
+            if self._shuffle:
+                yield from torch.randperm(self._size, generator=g).tolist()
+            else:
+                yield from torch.arange(self._size).tolist()
+
+
+class RandomSubsetTrainingSampler(TrainingSampler):
+    """
+    Similar to TrainingSampler, but only sample a random subset of indices.
+    This is useful when you want to estimate the accuracy vs data-number curves by
+      training the model with different subset_ratio.
+    """
+
+    def __init__(
+        self,
+        size: int,
+        subset_ratio: float,
+        shuffle: bool = True,
+        seed_shuffle: Optional[int] = None,
+        seed_subset: Optional[int] = None,
+    ):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+            subset_ratio (float): the ratio of subset data to sample from the underlying dataset
+            shuffle (bool): whether to shuffle the indices or not
+            seed_shuffle (int): the initial seed of the shuffle. Must be the same
+                across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+            seed_subset (int): the seed to randomize the subset to be sampled.
+                Must be the same across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+        """
+        super().__init__(size=size, shuffle=shuffle, seed=seed_shuffle)
+
+        assert 0.0 < subset_ratio <= 1.0
+        self._size_subset = int(size * subset_ratio)
+        assert self._size_subset > 0
+        if seed_subset is None:
+            seed_subset = comm.shared_random_seed()
+        self._seed_subset = int(seed_subset)
+
+        # randomly generate the subset indexes to be sampled from
+        g = torch.Generator()
+        g.manual_seed(self._seed_subset)
+        indexes_randperm = torch.randperm(self._size, generator=g)
+        self._indexes_subset = indexes_randperm[: self._size_subset]
+
+        logger.info("Using RandomSubsetTrainingSampler......")
+        logger.info(f"Randomly sample {self._size_subset} data from the original {self._size} data")
+
+    def _infinite_indices(self):
+        g = torch.Generator()
+        g.manual_seed(self._seed)  # self._seed equals seed_shuffle from __init__()
+        while True:
+            if self._shuffle:
+                # generate a random permutation to shuffle self._indexes_subset
+                randperm = torch.randperm(self._size_subset, generator=g)
+                yield from self._indexes_subset[randperm].tolist()
+            else:
+                yield from self._indexes_subset.tolist()
+
+
+class RepeatFactorTrainingSampler(Sampler):
+    """
+    Similar to TrainingSampler, but a sample may appear more times than others based
+    on its "repeat factor". This is suitable for training on class imbalanced datasets like LVIS.
+    """
+
+    def __init__(self, repeat_factors, *, shuffle=True, seed=None):
+        """
+        Args:
+            repeat_factors (Tensor): a float vector, the repeat factor for each indice. When it's
+                full of ones, it is equivalent to ``TrainingSampler(len(repeat_factors), ...)``.
+            shuffle (bool): whether to shuffle the indices or not
+            seed (int): the initial seed of the shuffle. Must be the same
+                across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+        """
+        self._shuffle = shuffle
+        if seed is None:
+            seed = comm.shared_random_seed()
+        self._seed = int(seed)
+
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+
+        # Split into whole number (_int_part) and fractional (_frac_part) parts.
+        self._int_part = torch.trunc(repeat_factors)
+        self._frac_part = repeat_factors - self._int_part
+
+    @staticmethod
+    def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh):
+        """
+        Compute (fractional) per-image repeat factors based on category frequency.
+        The repeat factor for an image is a function of the frequency of the rarest
+        category labeled in that image. The "frequency of category c" in [0, 1] is defined
+        as the fraction of images in the training set (without repeats) in which category c
+        appears.
+        See :paper:`lvis` (>= v2) Appendix B.2.
+
+        Args:
+            dataset_dicts (list[dict]): annotations in Detectron2 dataset format.
+            repeat_thresh (float): frequency threshold below which data is repeated.
+                If the frequency is half of `repeat_thresh`, the image will be
+                repeated twice.
+
+        Returns:
+            torch.Tensor:
+                the i-th element is the repeat factor for the dataset image at index i.
+        """
+        # 1. For each category c, compute the fraction of images that contain it: f(c)
+        category_freq = defaultdict(int)
+        for dataset_dict in dataset_dicts:  # For each image (without repeats)
+            cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+            for cat_id in cat_ids:
+                category_freq[cat_id] += 1
+        num_images = len(dataset_dicts)
+        for k, v in category_freq.items():
+            category_freq[k] = v / num_images
+
+        # 2. For each category c, compute the category-level repeat factor:
+        #    r(c) = max(1, sqrt(t / f(c)))
+        category_rep = {
+            cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq))
+            for cat_id, cat_freq in category_freq.items()
+        }
+
+        # 3. For each image I, compute the image-level repeat factor:
+        #    r(I) = max_{c in I} r(c)
+        rep_factors = []
+        for dataset_dict in dataset_dicts:
+            cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+            rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0)
+            rep_factors.append(rep_factor)
+
+        return torch.tensor(rep_factors, dtype=torch.float32)
+
+    def _get_epoch_indices(self, generator):
+        """
+        Create a list of dataset indices (with repeats) to use for one epoch.
+
+        Args:
+            generator (torch.Generator): pseudo random number generator used for
+                stochastic rounding.
+
+        Returns:
+            torch.Tensor: list of dataset indices to use in one epoch. Each index
+                is repeated based on its calculated repeat factor.
+        """
+        # Since repeat factors are fractional, we use stochastic rounding so
+        # that the target repeat factor is achieved in expectation over the
+        # course of training
+        rands = torch.rand(len(self._frac_part), generator=generator)
+        rep_factors = self._int_part + (rands < self._frac_part).float()
+        # Construct a list of indices in which we repeat images as specified
+        indices = []
+        for dataset_index, rep_factor in enumerate(rep_factors):
+            indices.extend([dataset_index] * int(rep_factor.item()))
+        return torch.tensor(indices, dtype=torch.int64)
+
+    def __iter__(self):
+        start = self._rank
+        yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
+
+    def _infinite_indices(self):
+        g = torch.Generator()
+        g.manual_seed(self._seed)
+        while True:
+            # Sample indices with repeats determined by stochastic rounding; each
+            # "epoch" may have a slightly different size due to the rounding.
+            indices = self._get_epoch_indices(g)
+            if self._shuffle:
+                randperm = torch.randperm(len(indices), generator=g)
+                yield from indices[randperm].tolist()
+            else:
+                yield from indices.tolist()
+
+
+class InferenceSampler(Sampler):
+    """
+    Produce indices for inference across all workers.
+    Inference needs to run on the __exact__ set of samples,
+    therefore when the total number of samples is not divisible by the number of workers,
+    this sampler produces different number of samples on different workers.
+    """
+
+    def __init__(self, size: int):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+        """
+        self._size = size
+        assert size > 0
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+        self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
+
+    @staticmethod
+    def _get_local_indices(total_size, world_size, rank):
+        shard_size = total_size // world_size
+        left = total_size % world_size
+        shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
+
+        begin = sum(shard_sizes[:rank])
+        end = min(sum(shard_sizes[: rank + 1]), total_size)
+        return range(begin, end)
+
+    def __iter__(self):
+        yield from self._local_indices
+
+    def __len__(self):
+        return len(self._local_indices)
diff --git a/detectron2/data/samplers/grouped_batch_sampler.py b/detectron2/data/samplers/grouped_batch_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b247730aacd04dd0c752664acde3257c4eddd71
--- /dev/null
+++ b/detectron2/data/samplers/grouped_batch_sampler.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from torch.utils.data.sampler import BatchSampler, Sampler
+
+
+class GroupedBatchSampler(BatchSampler):
+    """
+    Wraps another sampler to yield a mini-batch of indices.
+    It enforces that the batch only contain elements from the same group.
+    It also tries to provide mini-batches which follows an ordering which is
+    as close as possible to the ordering from the original sampler.
+    """
+
+    def __init__(self, sampler, group_ids, batch_size):
+        """
+        Args:
+            sampler (Sampler): Base sampler.
+            group_ids (list[int]): If the sampler produces indices in range [0, N),
+                `group_ids` must be a list of `N` ints which contains the group id of each sample.
+                The group ids must be a set of integers in the range [0, num_groups).
+            batch_size (int): Size of mini-batch.
+        """
+        if not isinstance(sampler, Sampler):
+            raise ValueError(
+                "sampler should be an instance of "
+                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+            )
+        self.sampler = sampler
+        self.group_ids = np.asarray(group_ids)
+        assert self.group_ids.ndim == 1
+        self.batch_size = batch_size
+        groups = np.unique(self.group_ids).tolist()
+
+        # buffer the indices of each group until batch size is reached
+        self.buffer_per_group = {k: [] for k in groups}
+
+    def __iter__(self):
+        for idx in self.sampler:
+            group_id = self.group_ids[idx]
+            group_buffer = self.buffer_per_group[group_id]
+            group_buffer.append(idx)
+            if len(group_buffer) == self.batch_size:
+                yield group_buffer[:]  # yield a copy of the list
+                del group_buffer[:]
+
+    def __len__(self):
+        raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.")
diff --git a/detectron2/data/transforms/__init__.py b/detectron2/data/transforms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab3c63b5b456a7fb878757e25768a3634f76ae5b
--- /dev/null
+++ b/detectron2/data/transforms/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from fvcore.transforms.transform import Transform, TransformList  # order them first
+from fvcore.transforms.transform import *
+from .transform import *
+from .augmentation import *
+from .augmentation_impl import *
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
+
+
+from detectron2.utils.env import fixup_module_metadata
+
+fixup_module_metadata(__name__, globals(), __all__)
+del fixup_module_metadata
diff --git a/detectron2/data/transforms/__pycache__/__init__.cpython-310.pyc b/detectron2/data/transforms/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b9ad666455ae1cb54bd9efa0e365a82a67f5336
Binary files /dev/null and b/detectron2/data/transforms/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/data/transforms/__pycache__/__init__.cpython-39.pyc b/detectron2/data/transforms/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef4061a9f437b7445972d5274d7769c8408f132d
Binary files /dev/null and b/detectron2/data/transforms/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/data/transforms/__pycache__/augmentation.cpython-310.pyc b/detectron2/data/transforms/__pycache__/augmentation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c64c9d69f6ca106ba86d21f64b4d8a65599f00e
Binary files /dev/null and b/detectron2/data/transforms/__pycache__/augmentation.cpython-310.pyc differ
diff --git a/detectron2/data/transforms/__pycache__/augmentation.cpython-39.pyc b/detectron2/data/transforms/__pycache__/augmentation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b17390ebdaa132c63db299d41a590d0f7b50a51f
Binary files /dev/null and b/detectron2/data/transforms/__pycache__/augmentation.cpython-39.pyc differ
diff --git a/detectron2/data/transforms/__pycache__/augmentation_impl.cpython-310.pyc b/detectron2/data/transforms/__pycache__/augmentation_impl.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3a1f2681d5c9cc9e854e99324348214753d34d4
Binary files /dev/null and b/detectron2/data/transforms/__pycache__/augmentation_impl.cpython-310.pyc differ
diff --git a/detectron2/data/transforms/__pycache__/augmentation_impl.cpython-39.pyc b/detectron2/data/transforms/__pycache__/augmentation_impl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a1c4dcd91c0305f2ad86fda82c47bc4030df11f
Binary files /dev/null and b/detectron2/data/transforms/__pycache__/augmentation_impl.cpython-39.pyc differ
diff --git a/detectron2/data/transforms/__pycache__/transform.cpython-310.pyc b/detectron2/data/transforms/__pycache__/transform.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e29da4c86646606ce8a125a394ee46f8fa1b9184
Binary files /dev/null and b/detectron2/data/transforms/__pycache__/transform.cpython-310.pyc differ
diff --git a/detectron2/data/transforms/__pycache__/transform.cpython-39.pyc b/detectron2/data/transforms/__pycache__/transform.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab3e3f46967ea6afe59612a952885cddc787c6fd
Binary files /dev/null and b/detectron2/data/transforms/__pycache__/transform.cpython-39.pyc differ
diff --git a/detectron2/data/transforms/augmentation.py b/detectron2/data/transforms/augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..63dd41aef658c9b51c7246880399405a029c5580
--- /dev/null
+++ b/detectron2/data/transforms/augmentation.py
@@ -0,0 +1,380 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import inspect
+import numpy as np
+import pprint
+from typing import Any, List, Optional, Tuple, Union
+from fvcore.transforms.transform import Transform, TransformList
+
+"""
+See "Data Augmentation" tutorial for an overview of the system:
+https://detectron2.readthedocs.io/tutorials/augmentation.html
+"""
+
+
+__all__ = [
+    "Augmentation",
+    "AugmentationList",
+    "AugInput",
+    "TransformGen",
+    "apply_transform_gens",
+    "StandardAugInput",
+    "apply_augmentations",
+]
+
+
+def _check_img_dtype(img):
+    assert isinstance(img, np.ndarray), "[Augmentation] Needs an numpy array, but got a {}!".format(
+        type(img)
+    )
+    assert not isinstance(img.dtype, np.integer) or (
+        img.dtype == np.uint8
+    ), "[Augmentation] Got image of type {}, use uint8 or floating points instead!".format(
+        img.dtype
+    )
+    assert img.ndim in [2, 3], img.ndim
+
+
+def _get_aug_input_args(aug, aug_input) -> List[Any]:
+    """
+    Get the arguments to be passed to ``aug.get_transform`` from the input ``aug_input``.
+    """
+    if aug.input_args is None:
+        # Decide what attributes are needed automatically
+        prms = list(inspect.signature(aug.get_transform).parameters.items())
+        # The default behavior is: if there is one parameter, then its "image"
+        # (work automatically for majority of use cases, and also avoid BC breaking),
+        # Otherwise, use the argument names.
+        if len(prms) == 1:
+            names = ("image",)
+        else:
+            names = []
+            for name, prm in prms:
+                if prm.kind in (
+                    inspect.Parameter.VAR_POSITIONAL,
+                    inspect.Parameter.VAR_KEYWORD,
+                ):
+                    raise TypeError(
+                        f""" \
+The default implementation of `{type(aug)}.__call__` does not allow \
+`{type(aug)}.get_transform` to use variable-length arguments (*args, **kwargs)! \
+If arguments are unknown, reimplement `__call__` instead. \
+"""
+                    )
+                names.append(name)
+        aug.input_args = tuple(names)
+
+    args = []
+    for f in aug.input_args:
+        try:
+            args.append(getattr(aug_input, f))
+        except AttributeError as e:
+            raise AttributeError(
+                f"{type(aug)}.get_transform needs input attribute '{f}', "
+                f"but it is not an attribute of {type(aug_input)}!"
+            ) from e
+    return args
+
+
+class Augmentation:
+    """
+    Augmentation defines (often random) policies/strategies to generate :class:`Transform`
+    from data. It is often used for pre-processing of input data.
+
+    A "policy" that generates a :class:`Transform` may, in the most general case,
+    need arbitrary information from input data in order to determine what transforms
+    to apply. Therefore, each :class:`Augmentation` instance defines the arguments
+    needed by its :meth:`get_transform` method. When called with the positional arguments,
+    the :meth:`get_transform` method executes the policy.
+
+    Note that :class:`Augmentation` defines the policies to create a :class:`Transform`,
+    but not how to execute the actual transform operations to those data.
+    Its :meth:`__call__` method will use :meth:`AugInput.transform` to execute the transform.
+
+    The returned `Transform` object is meant to describe deterministic transformation, which means
+    it can be re-applied on associated data, e.g. the geometry of an image and its segmentation
+    masks need to be transformed together.
+    (If such re-application is not needed, then determinism is not a crucial requirement.)
+    """
+
+    input_args: Optional[Tuple[str]] = None
+    """
+    Stores the attribute names needed by :meth:`get_transform`, e.g.  ``("image", "sem_seg")``.
+    By default, it is just a tuple of argument names in :meth:`self.get_transform`, which often only
+    contain "image". As long as the argument name convention is followed, there is no need for
+    users to touch this attribute.
+    """
+
+    def _init(self, params=None):
+        if params:
+            for k, v in params.items():
+                if k != "self" and not k.startswith("_"):
+                    setattr(self, k, v)
+
+    def get_transform(self, *args) -> Transform:
+        """
+        Execute the policy based on input data, and decide what transform to apply to inputs.
+
+        Args:
+            args: Any fixed-length positional arguments. By default, the name of the arguments
+                should exist in the :class:`AugInput` to be used.
+
+        Returns:
+            Transform: Returns the deterministic transform to apply to the input.
+
+        Examples:
+        ::
+            class MyAug:
+                # if a policy needs to know both image and semantic segmentation
+                def get_transform(image, sem_seg) -> T.Transform:
+                    pass
+            tfm: Transform = MyAug().get_transform(image, sem_seg)
+            new_image = tfm.apply_image(image)
+
+        Notes:
+            Users can freely use arbitrary new argument names in custom
+            :meth:`get_transform` method, as long as they are available in the
+            input data. In detectron2 we use the following convention:
+
+            * image: (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or
+              floating point in range [0, 1] or [0, 255].
+            * boxes: (N,4) ndarray of float32. It represents the instance bounding boxes
+              of N instances. Each is in XYXY format in unit of absolute coordinates.
+            * sem_seg: (H,W) ndarray of type uint8. Each element is an integer label of pixel.
+
+            We do not specify convention for other types and do not include builtin
+            :class:`Augmentation` that uses other types in detectron2.
+        """
+        raise NotImplementedError
+
+    def __call__(self, aug_input) -> Transform:
+        """
+        Augment the given `aug_input` **in-place**, and return the transform that's used.
+
+        This method will be called to apply the augmentation. In most augmentation, it
+        is enough to use the default implementation, which calls :meth:`get_transform`
+        using the inputs. But a subclass can overwrite it to have more complicated logic.
+
+        Args:
+            aug_input (AugInput): an object that has attributes needed by this augmentation
+                (defined by ``self.get_transform``). Its ``transform`` method will be called
+                to in-place transform it.
+
+        Returns:
+            Transform: the transform that is applied on the input.
+        """
+        args = _get_aug_input_args(self, aug_input)
+        tfm = self.get_transform(*args)
+        assert isinstance(tfm, (Transform, TransformList)), (
+            f"{type(self)}.get_transform must return an instance of Transform! "
+            f"Got {type(tfm)} instead."
+        )
+        aug_input.transform(tfm)
+        return tfm
+
+    def _rand_range(self, low=1.0, high=None, size=None):
+        """
+        Uniform float random number between low and high.
+        """
+        if high is None:
+            low, high = 0, low
+        if size is None:
+            size = []
+        return np.random.uniform(low, high, size)
+
+    def __repr__(self):
+        """
+        Produce something like:
+        "MyAugmentation(field1={self.field1}, field2={self.field2})"
+        """
+        try:
+            sig = inspect.signature(self.__init__)
+            classname = type(self).__name__
+            argstr = []
+            for name, param in sig.parameters.items():
+                assert (
+                    param.kind != param.VAR_POSITIONAL and param.kind != param.VAR_KEYWORD
+                ), "The default __repr__ doesn't support *args or **kwargs"
+                assert hasattr(self, name), (
+                    "Attribute {} not found! "
+                    "Default __repr__ only works if attributes match the constructor.".format(name)
+                )
+                attr = getattr(self, name)
+                default = param.default
+                if default is attr:
+                    continue
+                attr_str = pprint.pformat(attr)
+                if "\n" in attr_str:
+                    # don't show it if pformat decides to use >1 lines
+                    attr_str = "..."
+                argstr.append("{}={}".format(name, attr_str))
+            return "{}({})".format(classname, ", ".join(argstr))
+        except AssertionError:
+            return super().__repr__()
+
+    __str__ = __repr__
+
+
+class _TransformToAug(Augmentation):
+    def __init__(self, tfm: Transform):
+        self.tfm = tfm
+
+    def get_transform(self, *args):
+        return self.tfm
+
+    def __repr__(self):
+        return repr(self.tfm)
+
+    __str__ = __repr__
+
+
+def _transform_to_aug(tfm_or_aug):
+    """
+    Wrap Transform into Augmentation.
+    Private, used internally to implement augmentations.
+    """
+    assert isinstance(tfm_or_aug, (Transform, Augmentation)), tfm_or_aug
+    if isinstance(tfm_or_aug, Augmentation):
+        return tfm_or_aug
+    else:
+        return _TransformToAug(tfm_or_aug)
+
+
+class AugmentationList(Augmentation):
+    """
+    Apply a sequence of augmentations.
+
+    It has ``__call__`` method to apply the augmentations.
+
+    Note that :meth:`get_transform` method is impossible (will throw error if called)
+    for :class:`AugmentationList`, because in order to apply a sequence of augmentations,
+    the kth augmentation must be applied first, to provide inputs needed by the (k+1)th
+    augmentation.
+    """
+
+    def __init__(self, augs):
+        """
+        Args:
+            augs (list[Augmentation or Transform]):
+        """
+        super().__init__()
+        self.augs = [_transform_to_aug(x) for x in augs]
+
+    def __call__(self, aug_input) -> TransformList:
+        tfms = []
+        for x in self.augs:
+            tfm = x(aug_input)
+            tfms.append(tfm)
+        return TransformList(tfms)
+
+    def __repr__(self):
+        msgs = [str(x) for x in self.augs]
+        return "AugmentationList[{}]".format(", ".join(msgs))
+
+    __str__ = __repr__
+
+
+class AugInput:
+    """
+    Input that can be used with :meth:`Augmentation.__call__`.
+    This is a standard implementation for the majority of use cases.
+    This class provides the standard attributes **"image", "boxes", "sem_seg"**
+    defined in :meth:`__init__` and they may be needed by different augmentations.
+    Most augmentation policies do not need attributes beyond these three.
+
+    After applying augmentations to these attributes (using :meth:`AugInput.transform`),
+    the returned transforms can then be used to transform other data structures that users have.
+
+    Examples:
+    ::
+        input = AugInput(image, boxes=boxes)
+        tfms = augmentation(input)
+        transformed_image = input.image
+        transformed_boxes = input.boxes
+        transformed_other_data = tfms.apply_other(other_data)
+
+    An extended project that works with new data types may implement augmentation policies
+    that need other inputs. An algorithm may need to transform inputs in a way different
+    from the standard approach defined in this class. In those rare situations, users can
+    implement a class similar to this class, that satify the following condition:
+
+    * The input must provide access to these data in the form of attribute access
+      (``getattr``).  For example, if an :class:`Augmentation` to be applied needs "image"
+      and "sem_seg" arguments, its input must have the attribute "image" and "sem_seg".
+    * The input must have a ``transform(tfm: Transform) -> None`` method which
+      in-place transforms all its attributes.
+    """
+
+    # TODO maybe should support more builtin data types here
+    def __init__(
+        self,
+        image: np.ndarray,
+        *,
+        boxes: Optional[np.ndarray] = None,
+        sem_seg: Optional[np.ndarray] = None,
+    ):
+        """
+        Args:
+            image (ndarray): (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or
+                floating point in range [0, 1] or [0, 255]. The meaning of C is up
+                to users.
+            boxes (ndarray or None): Nx4 float32 boxes in XYXY_ABS mode
+            sem_seg (ndarray or None): HxW uint8 semantic segmentation mask. Each element
+                is an integer label of pixel.
+        """
+        _check_img_dtype(image)
+        self.image = image
+        self.boxes = boxes
+        self.sem_seg = sem_seg
+
+    def transform(self, tfm: Transform) -> None:
+        """
+        In-place transform all attributes of this class.
+
+        By "in-place", it means after calling this method, accessing an attribute such
+        as ``self.image`` will return transformed data.
+        """
+        self.image = tfm.apply_image(self.image)
+        if self.boxes is not None:
+            self.boxes = tfm.apply_box(self.boxes)
+        if self.sem_seg is not None:
+            self.sem_seg = tfm.apply_segmentation(self.sem_seg)
+
+    def apply_augmentations(
+        self, augmentations: List[Union[Augmentation, Transform]]
+    ) -> TransformList:
+        """
+        Equivalent of ``AugmentationList(augmentations)(self)``
+        """
+        return AugmentationList(augmentations)(self)
+
+
+def apply_augmentations(augmentations: List[Union[Transform, Augmentation]], inputs):
+    """
+    Use ``T.AugmentationList(augmentations)(inputs)`` instead.
+    """
+    if isinstance(inputs, np.ndarray):
+        # handle the common case of image-only Augmentation, also for backward compatibility
+        image_only = True
+        inputs = AugInput(inputs)
+    else:
+        image_only = False
+    tfms = inputs.apply_augmentations(augmentations)
+    return inputs.image if image_only else inputs, tfms
+
+
+apply_transform_gens = apply_augmentations
+"""
+Alias for backward-compatibility.
+"""
+
+TransformGen = Augmentation
+"""
+Alias for Augmentation, since it is something that generates :class:`Transform`s
+"""
+
+StandardAugInput = AugInput
+"""
+Alias for compatibility. It's not worth the complexity to have two classes.
+"""
diff --git a/detectron2/data/transforms/augmentation_impl.py b/detectron2/data/transforms/augmentation_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cc7b28be66cdf14bff493745c6c567da55aeb34
--- /dev/null
+++ b/detectron2/data/transforms/augmentation_impl.py
@@ -0,0 +1,736 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+Implement many useful :class:`Augmentation`.
+"""
+import numpy as np
+import sys
+from numpy import random
+from typing import Tuple
+import torch
+from fvcore.transforms.transform import (
+    BlendTransform,
+    CropTransform,
+    HFlipTransform,
+    NoOpTransform,
+    PadTransform,
+    Transform,
+    TransformList,
+    VFlipTransform,
+)
+from PIL import Image
+
+from detectron2.structures import Boxes, pairwise_iou
+
+from .augmentation import Augmentation, _transform_to_aug
+from .transform import ExtentTransform, ResizeTransform, RotationTransform
+
+__all__ = [
+    "FixedSizeCrop",
+    "RandomApply",
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomCrop",
+    "RandomExtent",
+    "RandomFlip",
+    "RandomSaturation",
+    "RandomLighting",
+    "RandomRotation",
+    "Resize",
+    "ResizeScale",
+    "ResizeShortestEdge",
+    "RandomCrop_CategoryAreaConstraint",
+    "RandomResize",
+    "MinIoURandomCrop",
+]
+
+
+class RandomApply(Augmentation):
+    """
+    Randomly apply an augmentation with a given probability.
+    """
+
+    def __init__(self, tfm_or_aug, prob=0.5):
+        """
+        Args:
+            tfm_or_aug (Transform, Augmentation): the transform or augmentation
+                to be applied. It can either be a `Transform` or `Augmentation`
+                instance.
+            prob (float): probability between 0.0 and 1.0 that
+                the wrapper transformation is applied
+        """
+        super().__init__()
+        self.aug = _transform_to_aug(tfm_or_aug)
+        assert 0.0 <= prob <= 1.0, f"Probablity must be between 0.0 and 1.0 (given: {prob})"
+        self.prob = prob
+
+    def get_transform(self, *args):
+        do = self._rand_range() < self.prob
+        if do:
+            return self.aug.get_transform(*args)
+        else:
+            return NoOpTransform()
+
+    def __call__(self, aug_input):
+        do = self._rand_range() < self.prob
+        if do:
+            return self.aug(aug_input)
+        else:
+            return NoOpTransform()
+
+
+class RandomFlip(Augmentation):
+    """
+    Flip the image horizontally or vertically with the given probability.
+    """
+
+    def __init__(self, prob=0.5, *, horizontal=True, vertical=False):
+        """
+        Args:
+            prob (float): probability of flip.
+            horizontal (boolean): whether to apply horizontal flipping
+            vertical (boolean): whether to apply vertical flipping
+        """
+        super().__init__()
+
+        if horizontal and vertical:
+            raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.")
+        if not horizontal and not vertical:
+            raise ValueError("At least one of horiz or vert has to be True!")
+        self._init(locals())
+
+    def get_transform(self, image):
+        h, w = image.shape[:2]
+        do = self._rand_range() < self.prob
+        if do:
+            if self.horizontal:
+                return HFlipTransform(w)
+            elif self.vertical:
+                return VFlipTransform(h)
+        else:
+            return NoOpTransform()
+
+
+class Resize(Augmentation):
+    """Resize image to a fixed target size"""
+
+    def __init__(self, shape, interp=Image.BILINEAR):
+        """
+        Args:
+            shape: (h, w) tuple or a int
+            interp: PIL interpolation method
+        """
+        if isinstance(shape, int):
+            shape = (shape, shape)
+        shape = tuple(shape)
+        self._init(locals())
+
+    def get_transform(self, image):
+        return ResizeTransform(
+            image.shape[0], image.shape[1], self.shape[0], self.shape[1], self.interp
+        )
+
+
+class ResizeShortestEdge(Augmentation):
+    """
+    Resize the image while keeping the aspect ratio unchanged.
+    It attempts to scale the shorter edge to the given `short_edge_length`,
+    as long as the longer edge does not exceed `max_size`.
+    If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
+    """
+
+    @torch.jit.unused
+    def __init__(
+        self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR
+    ):
+        """
+        Args:
+            short_edge_length (list[int]): If ``sample_style=="range"``,
+                a [min, max] interval from which to sample the shortest edge length.
+                If ``sample_style=="choice"``, a list of shortest edge lengths to sample from.
+            max_size (int): maximum allowed longest edge length.
+            sample_style (str): either "range" or "choice".
+        """
+        super().__init__()
+        assert sample_style in ["range", "choice"], sample_style
+
+        self.is_range = sample_style == "range"
+        if isinstance(short_edge_length, int):
+            short_edge_length = (short_edge_length, short_edge_length)
+        if self.is_range:
+            assert len(short_edge_length) == 2, (
+                "short_edge_length must be two values using 'range' sample style."
+                f" Got {short_edge_length}!"
+            )
+        self._init(locals())
+
+    @torch.jit.unused
+    def get_transform(self, image):
+        h, w = image.shape[:2]
+        if self.is_range:
+            size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1)
+        else:
+            size = np.random.choice(self.short_edge_length)
+        if size == 0:
+            return NoOpTransform()
+
+        newh, neww = ResizeShortestEdge.get_output_shape(h, w, size, self.max_size)
+        return ResizeTransform(h, w, newh, neww, self.interp)
+
+    @staticmethod
+    def get_output_shape(
+        oldh: int, oldw: int, short_edge_length: int, max_size: int
+    ) -> Tuple[int, int]:
+        """
+        Compute the output size given input size and target short edge length.
+        """
+        h, w = oldh, oldw
+        size = short_edge_length * 1.0
+        scale = size / min(h, w)
+        if h < w:
+            newh, neww = size, scale * w
+        else:
+            newh, neww = scale * h, size
+        if max(newh, neww) > max_size:
+            scale = max_size * 1.0 / max(newh, neww)
+            newh = newh * scale
+            neww = neww * scale
+        neww = int(neww + 0.5)
+        newh = int(newh + 0.5)
+        return (newh, neww)
+
+
+class ResizeScale(Augmentation):
+    """
+    Takes target size as input and randomly scales the given target size between `min_scale`
+    and `max_scale`. It then scales the input image such that it fits inside the scaled target
+    box, keeping the aspect ratio constant.
+    This implements the resize part of the Google's 'resize_and_crop' data augmentation:
+    https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/input_utils.py#L127
+    """
+
+    def __init__(
+        self,
+        min_scale: float,
+        max_scale: float,
+        target_height: int,
+        target_width: int,
+        interp: int = Image.BILINEAR,
+    ):
+        """
+        Args:
+            min_scale: minimum image scale range.
+            max_scale: maximum image scale range.
+            target_height: target image height.
+            target_width: target image width.
+            interp: image interpolation method.
+        """
+        super().__init__()
+        self._init(locals())
+
+    def _get_resize(self, image: np.ndarray, scale: float) -> Transform:
+        input_size = image.shape[:2]
+
+        # Compute new target size given a scale.
+        target_size = (self.target_height, self.target_width)
+        target_scale_size = np.multiply(target_size, scale)
+
+        # Compute actual rescaling applied to input image and output size.
+        output_scale = np.minimum(
+            target_scale_size[0] / input_size[0], target_scale_size[1] / input_size[1]
+        )
+        output_size = np.round(np.multiply(input_size, output_scale)).astype(int)
+
+        return ResizeTransform(
+            input_size[0], input_size[1], int(output_size[0]), int(output_size[1]), self.interp
+        )
+
+    def get_transform(self, image: np.ndarray) -> Transform:
+        random_scale = np.random.uniform(self.min_scale, self.max_scale)
+        return self._get_resize(image, random_scale)
+
+
+class RandomRotation(Augmentation):
+    """
+    This method returns a copy of this image, rotated the given
+    number of degrees counter clockwise around the given center.
+    """
+
+    def __init__(self, angle, expand=True, center=None, sample_style="range", interp=None):
+        """
+        Args:
+            angle (list[float]): If ``sample_style=="range"``,
+                a [min, max] interval from which to sample the angle (in degrees).
+                If ``sample_style=="choice"``, a list of angles to sample from
+            expand (bool): choose if the image should be resized to fit the whole
+                rotated image (default), or simply cropped
+            center (list[[float, float]]):  If ``sample_style=="range"``,
+                a [[minx, miny], [maxx, maxy]] relative interval from which to sample the center,
+                [0, 0] being the top left of the image and [1, 1] the bottom right.
+                If ``sample_style=="choice"``, a list of centers to sample from
+                Default: None, which means that the center of rotation is the center of the image
+                center has no effect if expand=True because it only affects shifting
+        """
+        super().__init__()
+        assert sample_style in ["range", "choice"], sample_style
+        self.is_range = sample_style == "range"
+        if isinstance(angle, (float, int)):
+            angle = (angle, angle)
+        if center is not None and isinstance(center[0], (float, int)):
+            center = (center, center)
+        self._init(locals())
+
+    def get_transform(self, image):
+        h, w = image.shape[:2]
+        center = None
+        if self.is_range:
+            angle = np.random.uniform(self.angle[0], self.angle[1])
+            if self.center is not None:
+                center = (
+                    np.random.uniform(self.center[0][0], self.center[1][0]),
+                    np.random.uniform(self.center[0][1], self.center[1][1]),
+                )
+        else:
+            angle = np.random.choice(self.angle)
+            if self.center is not None:
+                center = np.random.choice(self.center)
+
+        if center is not None:
+            center = (w * center[0], h * center[1])  # Convert to absolute coordinates
+
+        if angle % 360 == 0:
+            return NoOpTransform()
+
+        return RotationTransform(h, w, angle, expand=self.expand, center=center, interp=self.interp)
+
+
+class FixedSizeCrop(Augmentation):
+    """
+    If `crop_size` is smaller than the input image size, then it uses a random crop of
+    the crop size. If `crop_size` is larger than the input image size, then it pads
+    the right and the bottom of the image to the crop size if `pad` is True, otherwise
+    it returns the smaller image.
+    """
+
+    def __init__(
+        self,
+        crop_size: Tuple[int],
+        pad: bool = True,
+        pad_value: float = 128.0,
+        seg_pad_value: int = 255,
+    ):
+        """
+        Args:
+            crop_size: target image (height, width).
+            pad: if True, will pad images smaller than `crop_size` up to `crop_size`
+            pad_value: the padding value to the image.
+            seg_pad_value: the padding value to the segmentation mask.
+        """
+        super().__init__()
+        self._init(locals())
+
+    def _get_crop(self, image: np.ndarray) -> Transform:
+        # Compute the image scale and scaled size.
+        input_size = image.shape[:2]
+        output_size = self.crop_size
+
+        # Add random crop if the image is scaled up.
+        max_offset = np.subtract(input_size, output_size)
+        max_offset = np.maximum(max_offset, 0)
+        offset = np.multiply(max_offset, np.random.uniform(0.0, 1.0))
+        offset = np.round(offset).astype(int)
+        return CropTransform(
+            offset[1], offset[0], output_size[1], output_size[0], input_size[1], input_size[0]
+        )
+
+    def _get_pad(self, image: np.ndarray) -> Transform:
+        # Compute the image scale and scaled size.
+        input_size = image.shape[:2]
+        output_size = self.crop_size
+
+        # Add padding if the image is scaled down.
+        pad_size = np.subtract(output_size, input_size)
+        pad_size = np.maximum(pad_size, 0)
+        original_size = np.minimum(input_size, output_size)
+        return PadTransform(
+            0,
+            0,
+            pad_size[1],
+            pad_size[0],
+            original_size[1],
+            original_size[0],
+            self.pad_value,
+            self.seg_pad_value,
+        )
+
+    def get_transform(self, image: np.ndarray) -> TransformList:
+        transforms = [self._get_crop(image)]
+        if self.pad:
+            transforms.append(self._get_pad(image))
+        return TransformList(transforms)
+
+
+class RandomCrop(Augmentation):
+    """
+    Randomly crop a rectangle region out of an image.
+    """
+
+    def __init__(self, crop_type: str, crop_size):
+        """
+        Args:
+            crop_type (str): one of "relative_range", "relative", "absolute", "absolute_range".
+            crop_size (tuple[float, float]): two floats, explained below.
+
+        - "relative": crop a (H * crop_size[0], W * crop_size[1]) region from an input image of
+          size (H, W). crop size should be in (0, 1]
+        - "relative_range": uniformly sample two values from [crop_size[0], 1]
+          and [crop_size[1]], 1], and use them as in "relative" crop type.
+        - "absolute" crop a (crop_size[0], crop_size[1]) region from input image.
+          crop_size must be smaller than the input image size.
+        - "absolute_range", for an input of size (H, W), uniformly sample H_crop in
+          [crop_size[0], min(H, crop_size[1])] and W_crop in [crop_size[0], min(W, crop_size[1])].
+          Then crop a region (H_crop, W_crop).
+        """
+        # TODO style of relative_range and absolute_range are not consistent:
+        # one takes (h, w) but another takes (min, max)
+        super().__init__()
+        assert crop_type in ["relative_range", "relative", "absolute", "absolute_range"]
+        self._init(locals())
+
+    def get_transform(self, image):
+        h, w = image.shape[:2]
+        croph, cropw = self.get_crop_size((h, w))
+        assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self)
+        h0 = np.random.randint(h - croph + 1)
+        w0 = np.random.randint(w - cropw + 1)
+        return CropTransform(w0, h0, cropw, croph)
+
+    def get_crop_size(self, image_size):
+        """
+        Args:
+            image_size (tuple): height, width
+
+        Returns:
+            crop_size (tuple): height, width in absolute pixels
+        """
+        h, w = image_size
+        if self.crop_type == "relative":
+            ch, cw = self.crop_size
+            return int(h * ch + 0.5), int(w * cw + 0.5)
+        elif self.crop_type == "relative_range":
+            crop_size = np.asarray(self.crop_size, dtype=np.float32)
+            ch, cw = crop_size + np.random.rand(2) * (1 - crop_size)
+            return int(h * ch + 0.5), int(w * cw + 0.5)
+        elif self.crop_type == "absolute":
+            return (min(self.crop_size[0], h), min(self.crop_size[1], w))
+        elif self.crop_type == "absolute_range":
+            assert self.crop_size[0] <= self.crop_size[1]
+            ch = np.random.randint(min(h, self.crop_size[0]), min(h, self.crop_size[1]) + 1)
+            cw = np.random.randint(min(w, self.crop_size[0]), min(w, self.crop_size[1]) + 1)
+            return ch, cw
+        else:
+            raise NotImplementedError("Unknown crop type {}".format(self.crop_type))
+
+
+class RandomCrop_CategoryAreaConstraint(Augmentation):
+    """
+    Similar to :class:`RandomCrop`, but find a cropping window such that no single category
+    occupies a ratio of more than `single_category_max_area` in semantic segmentation ground
+    truth, which can cause unstability in training. The function attempts to find such a valid
+    cropping window for at most 10 times.
+    """
+
+    def __init__(
+        self,
+        crop_type: str,
+        crop_size,
+        single_category_max_area: float = 1.0,
+        ignored_category: int = None,
+    ):
+        """
+        Args:
+            crop_type, crop_size: same as in :class:`RandomCrop`
+            single_category_max_area: the maximum allowed area ratio of a
+                category. Set to 1.0 to disable
+            ignored_category: allow this category in the semantic segmentation
+                ground truth to exceed the area ratio. Usually set to the category
+                that's ignored in training.
+        """
+        self.crop_aug = RandomCrop(crop_type, crop_size)
+        self._init(locals())
+
+    def get_transform(self, image, sem_seg):
+        if self.single_category_max_area >= 1.0:
+            return self.crop_aug.get_transform(image)
+        else:
+            h, w = sem_seg.shape
+            for _ in range(10):
+                crop_size = self.crop_aug.get_crop_size((h, w))
+                y0 = np.random.randint(h - crop_size[0] + 1)
+                x0 = np.random.randint(w - crop_size[1] + 1)
+                sem_seg_temp = sem_seg[y0 : y0 + crop_size[0], x0 : x0 + crop_size[1]]
+                labels, cnt = np.unique(sem_seg_temp, return_counts=True)
+                if self.ignored_category is not None:
+                    cnt = cnt[labels != self.ignored_category]
+                if len(cnt) > 1 and np.max(cnt) < np.sum(cnt) * self.single_category_max_area:
+                    break
+            crop_tfm = CropTransform(x0, y0, crop_size[1], crop_size[0])
+            return crop_tfm
+
+
+class RandomExtent(Augmentation):
+    """
+    Outputs an image by cropping a random "subrect" of the source image.
+
+    The subrect can be parameterized to include pixels outside the source image,
+    in which case they will be set to zeros (i.e. black). The size of the output
+    image will vary with the size of the random subrect.
+    """
+
+    def __init__(self, scale_range, shift_range):
+        """
+        Args:
+            output_size (h, w): Dimensions of output image
+            scale_range (l, h): Range of input-to-output size scaling factor
+            shift_range (x, y): Range of shifts of the cropped subrect. The rect
+                is shifted by [w / 2 * Uniform(-x, x), h / 2 * Uniform(-y, y)],
+                where (w, h) is the (width, height) of the input image. Set each
+                component to zero to crop at the image's center.
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, image):
+        img_h, img_w = image.shape[:2]
+
+        # Initialize src_rect to fit the input image.
+        src_rect = np.array([-0.5 * img_w, -0.5 * img_h, 0.5 * img_w, 0.5 * img_h])
+
+        # Apply a random scaling to the src_rect.
+        src_rect *= np.random.uniform(self.scale_range[0], self.scale_range[1])
+
+        # Apply a random shift to the coordinates origin.
+        src_rect[0::2] += self.shift_range[0] * img_w * (np.random.rand() - 0.5)
+        src_rect[1::2] += self.shift_range[1] * img_h * (np.random.rand() - 0.5)
+
+        # Map src_rect coordinates into image coordinates (center at corner).
+        src_rect[0::2] += 0.5 * img_w
+        src_rect[1::2] += 0.5 * img_h
+
+        return ExtentTransform(
+            src_rect=(src_rect[0], src_rect[1], src_rect[2], src_rect[3]),
+            output_size=(int(src_rect[3] - src_rect[1]), int(src_rect[2] - src_rect[0])),
+        )
+
+
+class RandomContrast(Augmentation):
+    """
+    Randomly transforms image contrast.
+
+    Contrast intensity is uniformly sampled in (intensity_min, intensity_max).
+    - intensity < 1 will reduce contrast
+    - intensity = 1 will preserve the input image
+    - intensity > 1 will increase contrast
+
+    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+    """
+
+    def __init__(self, intensity_min, intensity_max):
+        """
+        Args:
+            intensity_min (float): Minimum augmentation
+            intensity_max (float): Maximum augmentation
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, image):
+        w = np.random.uniform(self.intensity_min, self.intensity_max)
+        return BlendTransform(src_image=image.mean(), src_weight=1 - w, dst_weight=w)
+
+
+class RandomBrightness(Augmentation):
+    """
+    Randomly transforms image brightness.
+
+    Brightness intensity is uniformly sampled in (intensity_min, intensity_max).
+    - intensity < 1 will reduce brightness
+    - intensity = 1 will preserve the input image
+    - intensity > 1 will increase brightness
+
+    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+    """
+
+    def __init__(self, intensity_min, intensity_max):
+        """
+        Args:
+            intensity_min (float): Minimum augmentation
+            intensity_max (float): Maximum augmentation
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, image):
+        w = np.random.uniform(self.intensity_min, self.intensity_max)
+        return BlendTransform(src_image=0, src_weight=1 - w, dst_weight=w)
+
+
+class RandomSaturation(Augmentation):
+    """
+    Randomly transforms saturation of an RGB image.
+    Input images are assumed to have 'RGB' channel order.
+
+    Saturation intensity is uniformly sampled in (intensity_min, intensity_max).
+    - intensity < 1 will reduce saturation (make the image more grayscale)
+    - intensity = 1 will preserve the input image
+    - intensity > 1 will increase saturation
+
+    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+    """
+
+    def __init__(self, intensity_min, intensity_max):
+        """
+        Args:
+            intensity_min (float): Minimum augmentation (1 preserves input).
+            intensity_max (float): Maximum augmentation (1 preserves input).
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, image):
+        assert image.shape[-1] == 3, "RandomSaturation only works on RGB images"
+        w = np.random.uniform(self.intensity_min, self.intensity_max)
+        grayscale = image.dot([0.299, 0.587, 0.114])[:, :, np.newaxis]
+        return BlendTransform(src_image=grayscale, src_weight=1 - w, dst_weight=w)
+
+
+class RandomLighting(Augmentation):
+    """
+    The "lighting" augmentation described in AlexNet, using fixed PCA over ImageNet.
+    Input images are assumed to have 'RGB' channel order.
+
+    The degree of color jittering is randomly sampled via a normal distribution,
+    with standard deviation given by the scale parameter.
+    """
+
+    def __init__(self, scale):
+        """
+        Args:
+            scale (float): Standard deviation of principal component weighting.
+        """
+        super().__init__()
+        self._init(locals())
+        self.eigen_vecs = np.array(
+            [[-0.5675, 0.7192, 0.4009], [-0.5808, -0.0045, -0.8140], [-0.5836, -0.6948, 0.4203]]
+        )
+        self.eigen_vals = np.array([0.2175, 0.0188, 0.0045])
+
+    def get_transform(self, image):
+        assert image.shape[-1] == 3, "RandomLighting only works on RGB images"
+        weights = np.random.normal(scale=self.scale, size=3)
+        return BlendTransform(
+            src_image=self.eigen_vecs.dot(weights * self.eigen_vals), src_weight=1.0, dst_weight=1.0
+        )
+
+
+class RandomResize(Augmentation):
+    """Randomly resize image to a target size in shape_list"""
+
+    def __init__(self, shape_list, interp=Image.BILINEAR):
+        """
+        Args:
+            shape_list: a list of shapes in (h, w)
+            interp: PIL interpolation method
+        """
+        self.shape_list = shape_list
+        self._init(locals())
+
+    def get_transform(self, image):
+        shape_idx = np.random.randint(low=0, high=len(self.shape_list))
+        h, w = self.shape_list[shape_idx]
+        return ResizeTransform(image.shape[0], image.shape[1], h, w, self.interp)
+
+
+class MinIoURandomCrop(Augmentation):
+    """Random crop the image & bboxes, the cropped patches have minimum IoU
+    requirement with original image & bboxes, the IoU threshold is randomly
+    selected from min_ious.
+
+    Args:
+        min_ious (tuple): minimum IoU threshold for all intersections with
+        bounding boxes
+        min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
+        where a >= min_crop_size)
+        mode_trials: number of trials for sampling min_ious threshold
+        crop_trials: number of trials for sampling crop_size after cropping
+    """
+
+    def __init__(
+        self,
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3,
+        mode_trials=1000,
+        crop_trials=50,
+    ):
+        self.min_ious = min_ious
+        self.sample_mode = (1, *min_ious, 0)
+        self.min_crop_size = min_crop_size
+        self.mode_trials = mode_trials
+        self.crop_trials = crop_trials
+
+    def get_transform(self, image, boxes):
+        """Call function to crop images and bounding boxes with minimum IoU
+        constraint.
+
+        Args:
+            boxes: ground truth boxes in (x1, y1, x2, y2) format
+        """
+        if boxes is None:
+            return NoOpTransform()
+        h, w, c = image.shape
+        for _ in range(self.mode_trials):
+            mode = random.choice(self.sample_mode)
+            self.mode = mode
+            if mode == 1:
+                return NoOpTransform()
+
+            min_iou = mode
+            for _ in range(self.crop_trials):
+                new_w = random.uniform(self.min_crop_size * w, w)
+                new_h = random.uniform(self.min_crop_size * h, h)
+
+                # h / w in [0.5, 2]
+                if new_h / new_w < 0.5 or new_h / new_w > 2:
+                    continue
+
+                left = random.uniform(w - new_w)
+                top = random.uniform(h - new_h)
+
+                patch = np.array((int(left), int(top), int(left + new_w), int(top + new_h)))
+                # Line or point crop is not allowed
+                if patch[2] == patch[0] or patch[3] == patch[1]:
+                    continue
+                overlaps = pairwise_iou(
+                    Boxes(patch.reshape(-1, 4)), Boxes(boxes.reshape(-1, 4))
+                ).reshape(-1)
+                if len(overlaps) > 0 and overlaps.min() < min_iou:
+                    continue
+
+                # center of boxes should inside the crop img
+                # only adjust boxes and instance masks when the gt is not empty
+                if len(overlaps) > 0:
+                    # adjust boxes
+                    def is_center_of_bboxes_in_patch(boxes, patch):
+                        center = (boxes[:, :2] + boxes[:, 2:]) / 2
+                        mask = (
+                            (center[:, 0] > patch[0])
+                            * (center[:, 1] > patch[1])
+                            * (center[:, 0] < patch[2])
+                            * (center[:, 1] < patch[3])
+                        )
+                        return mask
+
+                    mask = is_center_of_bboxes_in_patch(boxes, patch)
+                    if not mask.any():
+                        continue
+                return CropTransform(int(left), int(top), int(new_w), int(new_h))
diff --git a/detectron2/data/transforms/transform.py b/detectron2/data/transforms/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..46769a2569ffc6223a95990f8db5973757e7d23f
--- /dev/null
+++ b/detectron2/data/transforms/transform.py
@@ -0,0 +1,351 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+See "Data Augmentation" tutorial for an overview of the system:
+https://detectron2.readthedocs.io/tutorials/augmentation.html
+"""
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from fvcore.transforms.transform import (
+    CropTransform,
+    HFlipTransform,
+    NoOpTransform,
+    Transform,
+    TransformList,
+)
+from PIL import Image
+
+try:
+    import cv2  # noqa
+except ImportError:
+    # OpenCV is an optional dependency at the moment
+    pass
+
+__all__ = [
+    "ExtentTransform",
+    "ResizeTransform",
+    "RotationTransform",
+    "ColorTransform",
+    "PILColorTransform",
+]
+
+
+class ExtentTransform(Transform):
+    """
+    Extracts a subregion from the source image and scales it to the output size.
+
+    The fill color is used to map pixels from the source rect that fall outside
+    the source image.
+
+    See: https://pillow.readthedocs.io/en/latest/PIL.html#PIL.ImageTransform.ExtentTransform
+    """
+
+    def __init__(self, src_rect, output_size, interp=Image.BILINEAR, fill=0):
+        """
+        Args:
+            src_rect (x0, y0, x1, y1): src coordinates
+            output_size (h, w): dst image size
+            interp: PIL interpolation methods
+            fill: Fill color used when src_rect extends outside image
+        """
+        super().__init__()
+        self._set_attributes(locals())
+
+    def apply_image(self, img, interp=None):
+        h, w = self.output_size
+        if len(img.shape) > 2 and img.shape[2] == 1:
+            pil_image = Image.fromarray(img[:, :, 0], mode="L")
+        else:
+            pil_image = Image.fromarray(img)
+        pil_image = pil_image.transform(
+            size=(w, h),
+            method=Image.EXTENT,
+            data=self.src_rect,
+            resample=interp if interp else self.interp,
+            fill=self.fill,
+        )
+        ret = np.asarray(pil_image)
+        if len(img.shape) > 2 and img.shape[2] == 1:
+            ret = np.expand_dims(ret, -1)
+        return ret
+
+    def apply_coords(self, coords):
+        # Transform image center from source coordinates into output coordinates
+        # and then map the new origin to the corner of the output image.
+        h, w = self.output_size
+        x0, y0, x1, y1 = self.src_rect
+        new_coords = coords.astype(np.float32)
+        new_coords[:, 0] -= 0.5 * (x0 + x1)
+        new_coords[:, 1] -= 0.5 * (y0 + y1)
+        new_coords[:, 0] *= w / (x1 - x0)
+        new_coords[:, 1] *= h / (y1 - y0)
+        new_coords[:, 0] += 0.5 * w
+        new_coords[:, 1] += 0.5 * h
+        return new_coords
+
+    def apply_segmentation(self, segmentation):
+        segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
+        return segmentation
+
+
+class ResizeTransform(Transform):
+    """
+    Resize the image to a target size.
+    """
+
+    def __init__(self, h, w, new_h, new_w, interp=None):
+        """
+        Args:
+            h, w (int): original image size
+            new_h, new_w (int): new image size
+            interp: PIL interpolation methods, defaults to bilinear.
+        """
+        # TODO decide on PIL vs opencv
+        super().__init__()
+        if interp is None:
+            interp = Image.BILINEAR
+        self._set_attributes(locals())
+
+    def apply_image(self, img, interp=None):
+        assert img.shape[:2] == (self.h, self.w)
+        assert len(img.shape) <= 4
+        interp_method = interp if interp is not None else self.interp
+
+        if img.dtype == np.uint8:
+            if len(img.shape) > 2 and img.shape[2] == 1:
+                pil_image = Image.fromarray(img[:, :, 0], mode="L")
+            else:
+                pil_image = Image.fromarray(img)
+            pil_image = pil_image.resize((self.new_w, self.new_h), interp_method)
+            ret = np.asarray(pil_image)
+            if len(img.shape) > 2 and img.shape[2] == 1:
+                ret = np.expand_dims(ret, -1)
+        else:
+            # PIL only supports uint8
+            if any(x < 0 for x in img.strides):
+                img = np.ascontiguousarray(img)
+            img = torch.from_numpy(img)
+            shape = list(img.shape)
+            shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:]
+            img = img.view(shape_4d).permute(2, 3, 0, 1)  # hw(c) -> nchw
+            _PIL_RESIZE_TO_INTERPOLATE_MODE = {
+                Image.NEAREST: "nearest",
+                Image.BILINEAR: "bilinear",
+                Image.BICUBIC: "bicubic",
+            }
+            mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[interp_method]
+            align_corners = None if mode == "nearest" else False
+            img = F.interpolate(
+                img, (self.new_h, self.new_w), mode=mode, align_corners=align_corners
+            )
+            shape[:2] = (self.new_h, self.new_w)
+            ret = img.permute(2, 3, 0, 1).view(shape).numpy()  # nchw -> hw(c)
+
+        return ret
+
+    def apply_coords(self, coords):
+        coords[:, 0] = coords[:, 0] * (self.new_w * 1.0 / self.w)
+        coords[:, 1] = coords[:, 1] * (self.new_h * 1.0 / self.h)
+        return coords
+
+    def apply_segmentation(self, segmentation):
+        segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
+        return segmentation
+
+    def inverse(self):
+        return ResizeTransform(self.new_h, self.new_w, self.h, self.w, self.interp)
+
+
+class RotationTransform(Transform):
+    """
+    This method returns a copy of this image, rotated the given
+    number of degrees counter clockwise around its center.
+    """
+
+    def __init__(self, h, w, angle, expand=True, center=None, interp=None):
+        """
+        Args:
+            h, w (int): original image size
+            angle (float): degrees for rotation
+            expand (bool): choose if the image should be resized to fit the whole
+                rotated image (default), or simply cropped
+            center (tuple (width, height)): coordinates of the rotation center
+                if left to None, the center will be fit to the center of each image
+                center has no effect if expand=True because it only affects shifting
+            interp: cv2 interpolation method, default cv2.INTER_LINEAR
+        """
+        super().__init__()
+        image_center = np.array((w / 2, h / 2))
+        if center is None:
+            center = image_center
+        if interp is None:
+            interp = cv2.INTER_LINEAR
+        abs_cos, abs_sin = (abs(np.cos(np.deg2rad(angle))), abs(np.sin(np.deg2rad(angle))))
+        if expand:
+            # find the new width and height bounds
+            bound_w, bound_h = np.rint(
+                [h * abs_sin + w * abs_cos, h * abs_cos + w * abs_sin]
+            ).astype(int)
+        else:
+            bound_w, bound_h = w, h
+
+        self._set_attributes(locals())
+        self.rm_coords = self.create_rotation_matrix()
+        # Needed because of this problem https://github.com/opencv/opencv/issues/11784
+        self.rm_image = self.create_rotation_matrix(offset=-0.5)
+
+    def apply_image(self, img, interp=None):
+        """
+        img should be a numpy array, formatted as Height * Width * Nchannels
+        """
+        if len(img) == 0 or self.angle % 360 == 0:
+            return img
+        assert img.shape[:2] == (self.h, self.w)
+        interp = interp if interp is not None else self.interp
+        return cv2.warpAffine(img, self.rm_image, (self.bound_w, self.bound_h), flags=interp)
+
+    def apply_coords(self, coords):
+        """
+        coords should be a N * 2 array-like, containing N couples of (x, y) points
+        """
+        coords = np.asarray(coords, dtype=float)
+        if len(coords) == 0 or self.angle % 360 == 0:
+            return coords
+        return cv2.transform(coords[:, np.newaxis, :], self.rm_coords)[:, 0, :]
+
+    def apply_segmentation(self, segmentation):
+        segmentation = self.apply_image(segmentation, interp=cv2.INTER_NEAREST)
+        return segmentation
+
+    def create_rotation_matrix(self, offset=0):
+        center = (self.center[0] + offset, self.center[1] + offset)
+        rm = cv2.getRotationMatrix2D(tuple(center), self.angle, 1)
+        if self.expand:
+            # Find the coordinates of the center of rotation in the new image
+            # The only point for which we know the future coordinates is the center of the image
+            rot_im_center = cv2.transform(self.image_center[None, None, :] + offset, rm)[0, 0, :]
+            new_center = np.array([self.bound_w / 2, self.bound_h / 2]) + offset - rot_im_center
+            # shift the rotation center to the new coordinates
+            rm[:, 2] += new_center
+        return rm
+
+    def inverse(self):
+        """
+        The inverse is to rotate it back with expand, and crop to get the original shape.
+        """
+        if not self.expand:  # Not possible to inverse if a part of the image is lost
+            raise NotImplementedError()
+        rotation = RotationTransform(
+            self.bound_h, self.bound_w, -self.angle, True, None, self.interp
+        )
+        crop = CropTransform(
+            (rotation.bound_w - self.w) // 2, (rotation.bound_h - self.h) // 2, self.w, self.h
+        )
+        return TransformList([rotation, crop])
+
+
+class ColorTransform(Transform):
+    """
+    Generic wrapper for any photometric transforms.
+    These transformations should only affect the color space and
+        not the coordinate space of the image (e.g. annotation
+        coordinates such as bounding boxes should not be changed)
+    """
+
+    def __init__(self, op):
+        """
+        Args:
+            op (Callable): operation to be applied to the image,
+                which takes in an ndarray and returns an ndarray.
+        """
+        if not callable(op):
+            raise ValueError("op parameter should be callable")
+        super().__init__()
+        self._set_attributes(locals())
+
+    def apply_image(self, img):
+        return self.op(img)
+
+    def apply_coords(self, coords):
+        return coords
+
+    def inverse(self):
+        return NoOpTransform()
+
+    def apply_segmentation(self, segmentation):
+        return segmentation
+
+
+class PILColorTransform(ColorTransform):
+    """
+    Generic wrapper for PIL Photometric image transforms,
+        which affect the color space and not the coordinate
+        space of the image
+    """
+
+    def __init__(self, op):
+        """
+        Args:
+            op (Callable): operation to be applied to the image,
+                which takes in a PIL Image and returns a transformed
+                PIL Image.
+                For reference on possible operations see:
+                - https://pillow.readthedocs.io/en/stable/
+        """
+        if not callable(op):
+            raise ValueError("op parameter should be callable")
+        super().__init__(op)
+
+    def apply_image(self, img):
+        img = Image.fromarray(img)
+        return np.asarray(super().apply_image(img))
+
+
+def HFlip_rotated_box(transform, rotated_boxes):
+    """
+    Apply the horizontal flip transform on rotated boxes.
+
+    Args:
+        rotated_boxes (ndarray): Nx5 floating point array of
+            (x_center, y_center, width, height, angle_degrees) format
+            in absolute coordinates.
+    """
+    # Transform x_center
+    rotated_boxes[:, 0] = transform.width - rotated_boxes[:, 0]
+    # Transform angle
+    rotated_boxes[:, 4] = -rotated_boxes[:, 4]
+    return rotated_boxes
+
+
+def Resize_rotated_box(transform, rotated_boxes):
+    """
+    Apply the resizing transform on rotated boxes. For details of how these (approximation)
+    formulas are derived, please refer to :meth:`RotatedBoxes.scale`.
+
+    Args:
+        rotated_boxes (ndarray): Nx5 floating point array of
+            (x_center, y_center, width, height, angle_degrees) format
+            in absolute coordinates.
+    """
+    scale_factor_x = transform.new_w * 1.0 / transform.w
+    scale_factor_y = transform.new_h * 1.0 / transform.h
+    rotated_boxes[:, 0] *= scale_factor_x
+    rotated_boxes[:, 1] *= scale_factor_y
+    theta = rotated_boxes[:, 4] * np.pi / 180.0
+    c = np.cos(theta)
+    s = np.sin(theta)
+    rotated_boxes[:, 2] *= np.sqrt(np.square(scale_factor_x * c) + np.square(scale_factor_y * s))
+    rotated_boxes[:, 3] *= np.sqrt(np.square(scale_factor_x * s) + np.square(scale_factor_y * c))
+    rotated_boxes[:, 4] = np.arctan2(scale_factor_x * s, scale_factor_y * c) * 180 / np.pi
+
+    return rotated_boxes
+
+
+HFlipTransform.register_type("rotated_box", HFlip_rotated_box)
+ResizeTransform.register_type("rotated_box", Resize_rotated_box)
+
+# not necessary any more with latest fvcore
+NoOpTransform.register_type("rotated_box", lambda t, x: x)
diff --git a/detectron2/engine/__init__.py b/detectron2/engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6e4d673dedd10419b612755cfcb9744fc4999f8
--- /dev/null
+++ b/detectron2/engine/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .launch import *
+from .train_loop import *
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
+
+
+# prefer to let hooks and defaults live in separate namespaces (therefore not in __all__)
+# but still make them available here
+from .hooks import *
+from .defaults import (
+    create_ddp_model,
+    default_argument_parser,
+    default_setup,
+    default_writers,
+    DefaultPredictor,
+    DefaultTrainer,
+)
diff --git a/detectron2/engine/__pycache__/__init__.cpython-310.pyc b/detectron2/engine/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ff09fdaa78caf5745f063543b51787af8201431
Binary files /dev/null and b/detectron2/engine/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/engine/__pycache__/__init__.cpython-39.pyc b/detectron2/engine/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2dad0182d5698ff824b421a71c76849c5438e0fc
Binary files /dev/null and b/detectron2/engine/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/engine/__pycache__/defaults.cpython-310.pyc b/detectron2/engine/__pycache__/defaults.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56da925e87ad0990942b2fc35be5c020ab8e6a57
Binary files /dev/null and b/detectron2/engine/__pycache__/defaults.cpython-310.pyc differ
diff --git a/detectron2/engine/__pycache__/defaults.cpython-39.pyc b/detectron2/engine/__pycache__/defaults.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ad75b68d5d650cf15a6511bc8aebe910fe5eab7
Binary files /dev/null and b/detectron2/engine/__pycache__/defaults.cpython-39.pyc differ
diff --git a/detectron2/engine/__pycache__/hooks.cpython-310.pyc b/detectron2/engine/__pycache__/hooks.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4557314057a9b86822259223964f13285602aa49
Binary files /dev/null and b/detectron2/engine/__pycache__/hooks.cpython-310.pyc differ
diff --git a/detectron2/engine/__pycache__/hooks.cpython-39.pyc b/detectron2/engine/__pycache__/hooks.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cac44f48ab86cd7368b5e689de08011ab3c34340
Binary files /dev/null and b/detectron2/engine/__pycache__/hooks.cpython-39.pyc differ
diff --git a/detectron2/engine/__pycache__/launch.cpython-310.pyc b/detectron2/engine/__pycache__/launch.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8964ec0cdb0a87f8e67fe635d751db9f27b97e2
Binary files /dev/null and b/detectron2/engine/__pycache__/launch.cpython-310.pyc differ
diff --git a/detectron2/engine/__pycache__/launch.cpython-39.pyc b/detectron2/engine/__pycache__/launch.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3b67e68f22c5b877f4d5fcf7844de8b19abde4e
Binary files /dev/null and b/detectron2/engine/__pycache__/launch.cpython-39.pyc differ
diff --git a/detectron2/engine/__pycache__/train_loop.cpython-310.pyc b/detectron2/engine/__pycache__/train_loop.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c94db8d690a98ba1261ef95805f1afc2aed029c
Binary files /dev/null and b/detectron2/engine/__pycache__/train_loop.cpython-310.pyc differ
diff --git a/detectron2/engine/__pycache__/train_loop.cpython-39.pyc b/detectron2/engine/__pycache__/train_loop.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac0da27f6a613f0052af715734a2b7e3c83d4891
Binary files /dev/null and b/detectron2/engine/__pycache__/train_loop.cpython-39.pyc differ
diff --git a/detectron2/engine/defaults.py b/detectron2/engine/defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff5625ae86364c9c47ff4f63f5607b992855c6e3
--- /dev/null
+++ b/detectron2/engine/defaults.py
@@ -0,0 +1,717 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+This file contains components with some default boilerplate logic user may need
+in training / testing. They will not work for everyone, but many users may find them useful.
+
+The behavior of functions/classes in this file is subject to change,
+since they are meant to represent the "common default behavior" people need in their projects.
+"""
+
+import argparse
+import logging
+import os
+import sys
+import weakref
+from collections import OrderedDict
+from typing import Optional
+import torch
+from fvcore.nn.precise_bn import get_bn_modules
+from omegaconf import OmegaConf
+from torch.nn.parallel import DistributedDataParallel
+
+import detectron2.data.transforms as T
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import CfgNode, LazyConfig
+from detectron2.data import (
+    MetadataCatalog,
+    build_detection_test_loader,
+    build_detection_train_loader,
+)
+from detectron2.evaluation import (
+    DatasetEvaluator,
+    inference_on_dataset,
+    print_csv_format,
+    verify_results,
+)
+from detectron2.modeling import build_model
+from detectron2.solver import build_lr_scheduler, build_optimizer
+from detectron2.utils import comm
+from detectron2.utils.collect_env import collect_env_info
+from detectron2.utils.env import seed_all_rng
+from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import setup_logger
+
+from . import hooks
+from .train_loop import AMPTrainer, SimpleTrainer, TrainerBase
+
+__all__ = [
+    "create_ddp_model",
+    "default_argument_parser",
+    "default_setup",
+    "default_writers",
+    "DefaultPredictor",
+    "DefaultTrainer",
+]
+
+
+def create_ddp_model(model, *, fp16_compression=False, **kwargs):
+    """
+    Create a DistributedDataParallel model if there are >1 processes.
+
+    Args:
+        model: a torch.nn.Module
+        fp16_compression: add fp16 compression hooks to the ddp object.
+            See more at https://pytorch.org/docs/stable/ddp_comm_hooks.html#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook
+        kwargs: other arguments of :module:`torch.nn.parallel.DistributedDataParallel`.
+    """  # noqa
+    if comm.get_world_size() == 1:
+        return model
+    if "device_ids" not in kwargs:
+        kwargs["device_ids"] = [comm.get_local_rank()]
+    ddp = DistributedDataParallel(model, **kwargs)
+    if fp16_compression:
+        from torch.distributed.algorithms.ddp_comm_hooks import default as comm_hooks
+
+        ddp.register_comm_hook(state=None, hook=comm_hooks.fp16_compress_hook)
+    return ddp
+
+
+def default_argument_parser(epilog=None):
+    """
+    Create a parser with some common arguments used by detectron2 users.
+
+    Args:
+        epilog (str): epilog passed to ArgumentParser describing the usage.
+
+    Returns:
+        argparse.ArgumentParser:
+    """
+    parser = argparse.ArgumentParser(
+        epilog=epilog
+        or f"""
+Examples:
+
+Run on single machine:
+    $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml
+
+Change some config options:
+    $ {sys.argv[0]} --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth SOLVER.BASE_LR 0.001
+
+Run on multiple machines:
+    (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url <URL> [--other-flags]
+    (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url <URL> [--other-flags]
+""",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
+    parser.add_argument(
+        "--resume",
+        action="store_true",
+        help="Whether to attempt to resume from the checkpoint directory. "
+        "See documentation of `DefaultTrainer.resume_or_load()` for what it means.",
+    )
+    parser.add_argument("--eval-only", action="store_true", help="perform evaluation only")
+    parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*")
+    parser.add_argument("--num-machines", type=int, default=1, help="total number of machines")
+    parser.add_argument(
+        "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)"
+    )
+
+    # PyTorch still may leave orphan processes in multi-gpu training.
+    # Therefore we use a deterministic way to obtain port,
+    # so that users are aware of orphan processes by seeing the port occupied.
+    port = 2**15 + 2**14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2**14
+    parser.add_argument(
+        "--dist-url",
+        default="tcp://127.0.0.1:{}".format(port),
+        help="initialization URL for pytorch distributed backend. See "
+        "https://pytorch.org/docs/stable/distributed.html for details.",
+    )
+    parser.add_argument(
+        "opts",
+        help="""
+Modify config options at the end of the command. For Yacs configs, use
+space-separated "PATH.KEY VALUE" pairs.
+For python-based LazyConfig, use "path.key=value".
+        """.strip(),
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+def _try_get_key(cfg, *keys, default=None):
+    """
+    Try select keys from cfg until the first key that exists. Otherwise return default.
+    """
+    if isinstance(cfg, CfgNode):
+        cfg = OmegaConf.create(cfg.dump())
+    for k in keys:
+        none = object()
+        p = OmegaConf.select(cfg, k, default=none)
+        if p is not none:
+            return p
+    return default
+
+
+def _highlight(code, filename):
+    try:
+        import pygments
+    except ImportError:
+        return code
+
+    from pygments.lexers import Python3Lexer, YamlLexer
+    from pygments.formatters import Terminal256Formatter
+
+    lexer = Python3Lexer() if filename.endswith(".py") else YamlLexer()
+    code = pygments.highlight(code, lexer, Terminal256Formatter(style="monokai"))
+    return code
+
+
+def default_setup(cfg, args):
+    """
+    Perform some basic common setups at the beginning of a job, including:
+
+    1. Set up the detectron2 logger
+    2. Log basic information about environment, cmdline arguments, and config
+    3. Backup the config to the output directory
+
+    Args:
+        cfg (CfgNode or omegaconf.DictConfig): the full config to be used
+        args (argparse.NameSpace): the command line arguments to be logged
+    """
+    output_dir = _try_get_key(cfg, "OUTPUT_DIR", "output_dir", "train.output_dir")
+    if comm.is_main_process() and output_dir:
+        PathManager.mkdirs(output_dir)
+
+    rank = comm.get_rank()
+    setup_logger(output_dir, distributed_rank=rank, name="fvcore")
+    logger = setup_logger(output_dir, distributed_rank=rank)
+
+    logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
+    logger.info("Environment info:\n" + collect_env_info())
+
+    logger.info("Command line arguments: " + str(args))
+    if hasattr(args, "config_file") and args.config_file != "":
+        logger.info(
+            "Contents of args.config_file={}:\n{}".format(
+                args.config_file,
+                _highlight(PathManager.open(args.config_file, "r").read(), args.config_file),
+            )
+        )
+
+    if comm.is_main_process() and output_dir:
+        # Note: some of our scripts may expect the existence of
+        # config.yaml in output directory
+        path = os.path.join(output_dir, "config.yaml")
+        if isinstance(cfg, CfgNode):
+            logger.info("Running with full config:\n{}".format(_highlight(cfg.dump(), ".yaml")))
+            with PathManager.open(path, "w") as f:
+                f.write(cfg.dump())
+        else:
+            LazyConfig.save(cfg, path)
+        logger.info("Full config saved to {}".format(path))
+
+    # make sure each worker has a different, yet deterministic seed if specified
+    seed = _try_get_key(cfg, "SEED", "train.seed", default=-1)
+    seed_all_rng(None if seed < 0 else seed + rank)
+
+    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
+    # typical validation set.
+    if not (hasattr(args, "eval_only") and args.eval_only):
+        torch.backends.cudnn.benchmark = _try_get_key(
+            cfg, "CUDNN_BENCHMARK", "train.cudnn_benchmark", default=False
+        )
+
+
+def default_writers(output_dir: str, max_iter: Optional[int] = None):
+    """
+    Build a list of :class:`EventWriter` to be used.
+    It now consists of a :class:`CommonMetricPrinter`,
+    :class:`TensorboardXWriter` and :class:`JSONWriter`.
+
+    Args:
+        output_dir: directory to store JSON metrics and tensorboard events
+        max_iter: the total number of iterations
+
+    Returns:
+        list[EventWriter]: a list of :class:`EventWriter` objects.
+    """
+    PathManager.mkdirs(output_dir)
+    return [
+        # It may not always print what you want to see, since it prints "common" metrics only.
+        CommonMetricPrinter(max_iter),
+        JSONWriter(os.path.join(output_dir, "metrics.json")),
+        TensorboardXWriter(output_dir),
+    ]
+
+
+class DefaultPredictor:
+    """
+    Create a simple end-to-end predictor with the given config that runs on
+    single device for a single input image.
+
+    Compared to using the model directly, this class does the following additions:
+
+    1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
+    2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`.
+    3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
+    4. Take one input image and produce a single output, instead of a batch.
+
+    This is meant for simple demo purposes, so it does the above steps automatically.
+    This is not meant for benchmarks or running complicated inference logic.
+    If you'd like to do anything more complicated, please refer to its source code as
+    examples to build and use the model manually.
+
+    Attributes:
+        metadata (Metadata): the metadata of the underlying dataset, obtained from
+            cfg.DATASETS.TEST.
+
+    Examples:
+    ::
+        pred = DefaultPredictor(cfg)
+        inputs = cv2.imread("input.jpg")
+        outputs = pred(inputs)
+    """
+
+    def __init__(self, cfg):
+        self.cfg = cfg.clone()  # cfg can be modified by model
+        self.model = build_model(self.cfg)
+        self.model.eval()
+        if len(cfg.DATASETS.TEST):
+            self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
+
+        checkpointer = DetectionCheckpointer(self.model)
+        checkpointer.load(cfg.MODEL.WEIGHTS)
+
+        self.aug = T.ResizeShortestEdge(
+            [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
+        )
+
+        self.input_format = cfg.INPUT.FORMAT
+        assert self.input_format in ["RGB", "BGR"], self.input_format
+
+    def __call__(self, original_image):
+        """
+        Args:
+            original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+
+        Returns:
+            predictions (dict):
+                the output of the model for one image only.
+                See :doc:`/tutorials/models` for details about the format.
+        """
+        with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
+            # Apply pre-processing to image.
+            if self.input_format == "RGB":
+                # whether the model expects BGR inputs or RGB
+                original_image = original_image[:, :, ::-1]
+            height, width = original_image.shape[:2]
+            image = self.aug.get_transform(original_image).apply_image(original_image)
+            image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+            image.to(self.cfg.MODEL.DEVICE)
+
+            inputs = {"image": image, "height": height, "width": width}
+
+            predictions = self.model([inputs])[0]
+            return predictions
+
+
+class DefaultTrainer(TrainerBase):
+    """
+    A trainer with default training logic. It does the following:
+
+    1. Create a :class:`SimpleTrainer` using model, optimizer, dataloader
+       defined by the given config. Create a LR scheduler defined by the config.
+    2. Load the last checkpoint or `cfg.MODEL.WEIGHTS`, if exists, when
+       `resume_or_load` is called.
+    3. Register a few common hooks defined by the config.
+
+    It is created to simplify the **standard model training workflow** and reduce code boilerplate
+    for users who only need the standard training workflow, with standard features.
+    It means this class makes *many assumptions* about your training logic that
+    may easily become invalid in a new research. In fact, any assumptions beyond those made in the
+    :class:`SimpleTrainer` are too much for research.
+
+    The code of this class has been annotated about restrictive assumptions it makes.
+    When they do not work for you, you're encouraged to:
+
+    1. Overwrite methods of this class, OR:
+    2. Use :class:`SimpleTrainer`, which only does minimal SGD training and
+       nothing else. You can then add your own hooks if needed. OR:
+    3. Write your own training loop similar to `tools/plain_train_net.py`.
+
+    See the :doc:`/tutorials/training` tutorials for more details.
+
+    Note that the behavior of this class, like other functions/classes in
+    this file, is not stable, since it is meant to represent the "common default behavior".
+    It is only guaranteed to work well with the standard models and training workflow in detectron2.
+    To obtain more stable behavior, write your own training logic with other public APIs.
+
+    Examples:
+    ::
+        trainer = DefaultTrainer(cfg)
+        trainer.resume_or_load()  # load last checkpoint or MODEL.WEIGHTS
+        trainer.train()
+
+    Attributes:
+        scheduler:
+        checkpointer (DetectionCheckpointer):
+        cfg (CfgNode):
+    """
+
+    def __init__(self, cfg):
+        """
+        Args:
+            cfg (CfgNode):
+        """
+        super().__init__()
+        logger = logging.getLogger("detectron2")
+        if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
+            setup_logger()
+        cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())
+
+        # Assume these objects must be constructed in this order.
+        model = self.build_model(cfg)
+        optimizer = self.build_optimizer(cfg, model)
+        data_loader = self.build_train_loader(cfg)
+
+        model = create_ddp_model(model, broadcast_buffers=False)
+        self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
+            model, data_loader, optimizer
+        )
+
+        self.scheduler = self.build_lr_scheduler(cfg, optimizer)
+        self.checkpointer = DetectionCheckpointer(
+            # Assume you want to save checkpoints together with logs/statistics
+            model,
+            cfg.OUTPUT_DIR,
+            trainer=weakref.proxy(self),
+        )
+        self.start_iter = 0
+        self.max_iter = cfg.SOLVER.MAX_ITER
+        self.cfg = cfg
+
+        self.register_hooks(self.build_hooks())
+
+    def resume_or_load(self, resume=True):
+        """
+        If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by
+        a `last_checkpoint` file), resume from the file. Resuming means loading all
+        available states (eg. optimizer and scheduler) and update iteration counter
+        from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used.
+
+        Otherwise, this is considered as an independent training. The method will load model
+        weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start
+        from iteration 0.
+
+        Args:
+            resume (bool): whether to do resume or not
+        """
+        self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume)
+        if resume and self.checkpointer.has_checkpoint():
+            # The checkpoint stores the training iteration that just finished, thus we start
+            # at the next iteration
+            self.start_iter = self.iter + 1
+
+    def build_hooks(self):
+        """
+        Build a list of default hooks, including timing, evaluation,
+        checkpointing, lr scheduling, precise BN, writing events.
+
+        Returns:
+            list[HookBase]:
+        """
+        cfg = self.cfg.clone()
+        cfg.defrost()
+        cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN
+
+        ret = [
+            hooks.IterationTimer(),
+            hooks.LRScheduler(),
+            hooks.PreciseBN(
+                # Run at the same freq as (but before) evaluation.
+                cfg.TEST.EVAL_PERIOD,
+                self.model,
+                # Build a new data loader to not affect training
+                self.build_train_loader(cfg),
+                cfg.TEST.PRECISE_BN.NUM_ITER,
+            )
+            if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
+            else None,
+        ]
+
+        # Do PreciseBN before checkpointer, because it updates the model and need to
+        # be saved by checkpointer.
+        # This is not always the best: if checkpointing has a different frequency,
+        # some checkpoints may have more precise statistics than others.
+        if comm.is_main_process():
+            ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD))
+
+        def test_and_save_results():
+            self._last_eval_results = self.test(self.cfg, self.model)
+            return self._last_eval_results
+
+        # Do evaluation after checkpointer, because then if it fails,
+        # we can use the saved checkpoint to debug.
+        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
+
+        if comm.is_main_process():
+            # Here the default print/log frequency of each writer is used.
+            # run writers in the end, so that evaluation metrics are written
+            ret.append(hooks.PeriodicWriter(self.build_writers(), period=20))
+        return ret
+
+    def build_writers(self):
+        """
+        Build a list of writers to be used using :func:`default_writers()`.
+        If you'd like a different list of writers, you can overwrite it in
+        your trainer.
+
+        Returns:
+            list[EventWriter]: a list of :class:`EventWriter` objects.
+        """
+        return default_writers(self.cfg.OUTPUT_DIR, self.max_iter)
+
+    def train(self):
+        """
+        Run training.
+
+        Returns:
+            OrderedDict of results, if evaluation is enabled. Otherwise None.
+        """
+        super().train(self.start_iter, self.max_iter)
+        if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
+            assert hasattr(
+                self, "_last_eval_results"
+            ), "No evaluation results obtained during training!"
+            verify_results(self.cfg, self._last_eval_results)
+            return self._last_eval_results
+
+    def run_step(self):
+        self._trainer.iter = self.iter
+        self._trainer.run_step()
+
+    def state_dict(self):
+        ret = super().state_dict()
+        ret["_trainer"] = self._trainer.state_dict()
+        return ret
+
+    def load_state_dict(self, state_dict):
+        super().load_state_dict(state_dict)
+        self._trainer.load_state_dict(state_dict["_trainer"])
+
+    @classmethod
+    def build_model(cls, cfg):
+        """
+        Returns:
+            torch.nn.Module:
+
+        It now calls :func:`detectron2.modeling.build_model`.
+        Overwrite it if you'd like a different model.
+        """
+        model = build_model(cfg)
+        logger = logging.getLogger(__name__)
+        logger.info("Model:\n{}".format(model))
+        return model
+
+    @classmethod
+    def build_optimizer(cls, cfg, model):
+        """
+        Returns:
+            torch.optim.Optimizer:
+
+        It now calls :func:`detectron2.solver.build_optimizer`.
+        Overwrite it if you'd like a different optimizer.
+        """
+        return build_optimizer(cfg, model)
+
+    @classmethod
+    def build_lr_scheduler(cls, cfg, optimizer):
+        """
+        It now calls :func:`detectron2.solver.build_lr_scheduler`.
+        Overwrite it if you'd like a different scheduler.
+        """
+        return build_lr_scheduler(cfg, optimizer)
+
+    @classmethod
+    def build_train_loader(cls, cfg):
+        """
+        Returns:
+            iterable
+
+        It now calls :func:`detectron2.data.build_detection_train_loader`.
+        Overwrite it if you'd like a different data loader.
+        """
+        return build_detection_train_loader(cfg)
+
+    @classmethod
+    def build_test_loader(cls, cfg, dataset_name):
+        """
+        Returns:
+            iterable
+
+        It now calls :func:`detectron2.data.build_detection_test_loader`.
+        Overwrite it if you'd like a different data loader.
+        """
+        return build_detection_test_loader(cfg, dataset_name)
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name):
+        """
+        Returns:
+            DatasetEvaluator or None
+
+        It is not implemented by default.
+        """
+        raise NotImplementedError(
+            """
+If you want DefaultTrainer to automatically run evaluation,
+please implement `build_evaluator()` in subclasses (see train_net.py for example).
+Alternatively, you can call evaluation functions yourself (see Colab balloon tutorial for example).
+"""
+        )
+
+    @classmethod
+    def test(cls, cfg, model, evaluators=None):
+        """
+        Evaluate the given model. The given model is expected to already contain
+        weights to evaluate.
+
+        Args:
+            cfg (CfgNode):
+            model (nn.Module):
+            evaluators (list[DatasetEvaluator] or None): if None, will call
+                :meth:`build_evaluator`. Otherwise, must have the same length as
+                ``cfg.DATASETS.TEST``.
+
+        Returns:
+            dict: a dict of result metrics
+        """
+        logger = logging.getLogger(__name__)
+        if isinstance(evaluators, DatasetEvaluator):
+            evaluators = [evaluators]
+        if evaluators is not None:
+            assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
+                len(cfg.DATASETS.TEST), len(evaluators)
+            )
+
+        results = OrderedDict()
+        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
+            data_loader = cls.build_test_loader(cfg, dataset_name)
+            # When evaluators are passed in as arguments,
+            # implicitly assume that evaluators can be created before data_loader.
+            if evaluators is not None:
+                evaluator = evaluators[idx]
+            else:
+                try:
+                    evaluator = cls.build_evaluator(cfg, dataset_name)
+                except NotImplementedError:
+                    logger.warn(
+                        "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
+                        "or implement its `build_evaluator` method."
+                    )
+                    results[dataset_name] = {}
+                    continue
+            results_i = inference_on_dataset(model, data_loader, evaluator)
+            results[dataset_name] = results_i
+            if comm.is_main_process():
+                assert isinstance(
+                    results_i, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results_i
+                )
+                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+                print_csv_format(results_i)
+
+        if len(results) == 1:
+            results = list(results.values())[0]
+        return results
+
+    @staticmethod
+    def auto_scale_workers(cfg, num_workers: int):
+        """
+        When the config is defined for certain number of workers (according to
+        ``cfg.SOLVER.REFERENCE_WORLD_SIZE``) that's different from the number of
+        workers currently in use, returns a new cfg where the total batch size
+        is scaled so that the per-GPU batch size stays the same as the
+        original ``IMS_PER_BATCH // REFERENCE_WORLD_SIZE``.
+
+        Other config options are also scaled accordingly:
+        * training steps and warmup steps are scaled inverse proportionally.
+        * learning rate are scaled proportionally, following :paper:`ImageNet in 1h`.
+
+        For example, with the original config like the following:
+
+        .. code-block:: yaml
+
+            IMS_PER_BATCH: 16
+            BASE_LR: 0.1
+            REFERENCE_WORLD_SIZE: 8
+            MAX_ITER: 5000
+            STEPS: (4000,)
+            CHECKPOINT_PERIOD: 1000
+
+        When this config is used on 16 GPUs instead of the reference number 8,
+        calling this method will return a new config with:
+
+        .. code-block:: yaml
+
+            IMS_PER_BATCH: 32
+            BASE_LR: 0.2
+            REFERENCE_WORLD_SIZE: 16
+            MAX_ITER: 2500
+            STEPS: (2000,)
+            CHECKPOINT_PERIOD: 500
+
+        Note that both the original config and this new config can be trained on 16 GPUs.
+        It's up to user whether to enable this feature (by setting ``REFERENCE_WORLD_SIZE``).
+
+        Returns:
+            CfgNode: a new config. Same as original if ``cfg.SOLVER.REFERENCE_WORLD_SIZE==0``.
+        """
+        old_world_size = cfg.SOLVER.REFERENCE_WORLD_SIZE
+        if old_world_size == 0 or old_world_size == num_workers:
+            return cfg
+        cfg = cfg.clone()
+        frozen = cfg.is_frozen()
+        cfg.defrost()
+
+        assert (
+            cfg.SOLVER.IMS_PER_BATCH % old_world_size == 0
+        ), "Invalid REFERENCE_WORLD_SIZE in config!"
+        scale = num_workers / old_world_size
+        bs = cfg.SOLVER.IMS_PER_BATCH = int(round(cfg.SOLVER.IMS_PER_BATCH * scale))
+        lr = cfg.SOLVER.BASE_LR = cfg.SOLVER.BASE_LR * scale
+        max_iter = cfg.SOLVER.MAX_ITER = int(round(cfg.SOLVER.MAX_ITER / scale))
+        warmup_iter = cfg.SOLVER.WARMUP_ITERS = int(round(cfg.SOLVER.WARMUP_ITERS / scale))
+        cfg.SOLVER.STEPS = tuple(int(round(s / scale)) for s in cfg.SOLVER.STEPS)
+        cfg.TEST.EVAL_PERIOD = int(round(cfg.TEST.EVAL_PERIOD / scale))
+        cfg.SOLVER.CHECKPOINT_PERIOD = int(round(cfg.SOLVER.CHECKPOINT_PERIOD / scale))
+        cfg.SOLVER.REFERENCE_WORLD_SIZE = num_workers  # maintain invariant
+        logger = logging.getLogger(__name__)
+        logger.info(
+            f"Auto-scaling the config to batch_size={bs}, learning_rate={lr}, "
+            f"max_iter={max_iter}, warmup={warmup_iter}."
+        )
+
+        if frozen:
+            cfg.freeze()
+        return cfg
+
+
+# Access basic attributes from the underlying trainer
+for _attr in ["model", "data_loader", "optimizer"]:
+    setattr(
+        DefaultTrainer,
+        _attr,
+        property(
+            # getter
+            lambda self, x=_attr: getattr(self._trainer, x),
+            # setter
+            lambda self, value, x=_attr: setattr(self._trainer, x, value),
+        ),
+    )
diff --git a/detectron2/engine/hooks.py b/detectron2/engine/hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc37af0fd3a276eb389f7667be113b41ca53f012
--- /dev/null
+++ b/detectron2/engine/hooks.py
@@ -0,0 +1,690 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import datetime
+import itertools
+import logging
+import math
+import operator
+import os
+import tempfile
+import time
+import warnings
+from collections import Counter
+import torch
+from fvcore.common.checkpoint import Checkpointer
+from fvcore.common.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer
+from fvcore.common.param_scheduler import ParamScheduler
+from fvcore.common.timer import Timer
+from fvcore.nn.precise_bn import get_bn_modules, update_bn_stats
+
+import detectron2.utils.comm as comm
+from detectron2.evaluation.testing import flatten_results_dict
+from detectron2.solver import LRMultiplier
+from detectron2.solver import LRScheduler as _LRScheduler
+from detectron2.utils.events import EventStorage, EventWriter
+from detectron2.utils.file_io import PathManager
+
+from .train_loop import HookBase
+
+__all__ = [
+    "CallbackHook",
+    "IterationTimer",
+    "PeriodicWriter",
+    "PeriodicCheckpointer",
+    "BestCheckpointer",
+    "LRScheduler",
+    "AutogradProfiler",
+    "EvalHook",
+    "PreciseBN",
+    "TorchProfiler",
+    "TorchMemoryStats",
+]
+
+
+"""
+Implement some common hooks.
+"""
+
+
+class CallbackHook(HookBase):
+    """
+    Create a hook using callback functions provided by the user.
+    """
+
+    def __init__(self, *, before_train=None, after_train=None, before_step=None, after_step=None):
+        """
+        Each argument is a function that takes one argument: the trainer.
+        """
+        self._before_train = before_train
+        self._before_step = before_step
+        self._after_step = after_step
+        self._after_train = after_train
+
+    def before_train(self):
+        if self._before_train:
+            self._before_train(self.trainer)
+
+    def after_train(self):
+        if self._after_train:
+            self._after_train(self.trainer)
+        # The functions may be closures that hold reference to the trainer
+        # Therefore, delete them to avoid circular reference.
+        del self._before_train, self._after_train
+        del self._before_step, self._after_step
+
+    def before_step(self):
+        if self._before_step:
+            self._before_step(self.trainer)
+
+    def after_step(self):
+        if self._after_step:
+            self._after_step(self.trainer)
+
+
+class IterationTimer(HookBase):
+    """
+    Track the time spent for each iteration (each run_step call in the trainer).
+    Print a summary in the end of training.
+
+    This hook uses the time between the call to its :meth:`before_step`
+    and :meth:`after_step` methods.
+    Under the convention that :meth:`before_step` of all hooks should only
+    take negligible amount of time, the :class:`IterationTimer` hook should be
+    placed at the beginning of the list of hooks to obtain accurate timing.
+    """
+
+    def __init__(self, warmup_iter=3):
+        """
+        Args:
+            warmup_iter (int): the number of iterations at the beginning to exclude
+                from timing.
+        """
+        self._warmup_iter = warmup_iter
+        self._step_timer = Timer()
+        self._start_time = time.perf_counter()
+        self._total_timer = Timer()
+
+    def before_train(self):
+        self._start_time = time.perf_counter()
+        self._total_timer.reset()
+        self._total_timer.pause()
+
+    def after_train(self):
+        logger = logging.getLogger(__name__)
+        total_time = time.perf_counter() - self._start_time
+        total_time_minus_hooks = self._total_timer.seconds()
+        hook_time = total_time - total_time_minus_hooks
+
+        num_iter = self.trainer.storage.iter + 1 - self.trainer.start_iter - self._warmup_iter
+
+        if num_iter > 0 and total_time_minus_hooks > 0:
+            # Speed is meaningful only after warmup
+            # NOTE this format is parsed by grep in some scripts
+            logger.info(
+                "Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
+                    num_iter,
+                    str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
+                    total_time_minus_hooks / num_iter,
+                )
+            )
+
+        logger.info(
+            "Total training time: {} ({} on hooks)".format(
+                str(datetime.timedelta(seconds=int(total_time))),
+                str(datetime.timedelta(seconds=int(hook_time))),
+            )
+        )
+
+    def before_step(self):
+        self._step_timer.reset()
+        self._total_timer.resume()
+
+    def after_step(self):
+        # +1 because we're in after_step, the current step is done
+        # but not yet counted
+        iter_done = self.trainer.storage.iter - self.trainer.start_iter + 1
+        if iter_done >= self._warmup_iter:
+            sec = self._step_timer.seconds()
+            self.trainer.storage.put_scalars(time=sec)
+        else:
+            self._start_time = time.perf_counter()
+            self._total_timer.reset()
+
+        self._total_timer.pause()
+
+
+class PeriodicWriter(HookBase):
+    """
+    Write events to EventStorage (by calling ``writer.write()``) periodically.
+
+    It is executed every ``period`` iterations and after the last iteration.
+    Note that ``period`` does not affect how data is smoothed by each writer.
+    """
+
+    def __init__(self, writers, period=20):
+        """
+        Args:
+            writers (list[EventWriter]): a list of EventWriter objects
+            period (int):
+        """
+        self._writers = writers
+        for w in writers:
+            assert isinstance(w, EventWriter), w
+        self._period = period
+
+    def after_step(self):
+        if (self.trainer.iter + 1) % self._period == 0 or (
+            self.trainer.iter == self.trainer.max_iter - 1
+        ):
+            for writer in self._writers:
+                writer.write()
+
+    def after_train(self):
+        for writer in self._writers:
+            # If any new data is found (e.g. produced by other after_train),
+            # write them before closing
+            writer.write()
+            writer.close()
+
+
+class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase):
+    """
+    Same as :class:`detectron2.checkpoint.PeriodicCheckpointer`, but as a hook.
+
+    Note that when used as a hook,
+    it is unable to save additional data other than what's defined
+    by the given `checkpointer`.
+
+    It is executed every ``period`` iterations and after the last iteration.
+    """
+
+    def before_train(self):
+        self.max_iter = self.trainer.max_iter
+
+    def after_step(self):
+        # No way to use **kwargs
+        self.step(self.trainer.iter)
+
+
+class BestCheckpointer(HookBase):
+    """
+    Checkpoints best weights based off given metric.
+
+    This hook should be used in conjunction to and executed after the hook
+    that produces the metric, e.g. `EvalHook`.
+    """
+
+    def __init__(
+        self,
+        eval_period: int,
+        checkpointer: Checkpointer,
+        val_metric: str,
+        mode: str = "max",
+        file_prefix: str = "model_best",
+    ) -> None:
+        """
+        Args:
+            eval_period (int): the period `EvalHook` is set to run.
+            checkpointer: the checkpointer object used to save checkpoints.
+            val_metric (str): validation metric to track for best checkpoint, e.g. "bbox/AP50"
+            mode (str): one of {'max', 'min'}. controls whether the chosen val metric should be
+                maximized or minimized, e.g. for "bbox/AP50" it should be "max"
+            file_prefix (str): the prefix of checkpoint's filename, defaults to "model_best"
+        """
+        self._logger = logging.getLogger(__name__)
+        self._period = eval_period
+        self._val_metric = val_metric
+        assert mode in [
+            "max",
+            "min",
+        ], f'Mode "{mode}" to `BestCheckpointer` is unknown. It should be one of {"max", "min"}.'
+        if mode == "max":
+            self._compare = operator.gt
+        else:
+            self._compare = operator.lt
+        self._checkpointer = checkpointer
+        self._file_prefix = file_prefix
+        self.best_metric = None
+        self.best_iter = None
+
+    def _update_best(self, val, iteration):
+        if math.isnan(val) or math.isinf(val):
+            return False
+        self.best_metric = val
+        self.best_iter = iteration
+        return True
+
+    def _best_checking(self):
+        metric_tuple = self.trainer.storage.latest().get(self._val_metric)
+        if metric_tuple is None:
+            self._logger.warning(
+                f"Given val metric {self._val_metric} does not seem to be computed/stored."
+                "Will not be checkpointing based on it."
+            )
+            return
+        else:
+            latest_metric, metric_iter = metric_tuple
+
+        if self.best_metric is None:
+            if self._update_best(latest_metric, metric_iter):
+                additional_state = {"iteration": metric_iter}
+                self._checkpointer.save(f"{self._file_prefix}", **additional_state)
+                self._logger.info(
+                    f"Saved first model at {self.best_metric:0.5f} @ {self.best_iter} steps"
+                )
+        elif self._compare(latest_metric, self.best_metric):
+            additional_state = {"iteration": metric_iter}
+            self._checkpointer.save(f"{self._file_prefix}", **additional_state)
+            self._logger.info(
+                f"Saved best model as latest eval score for {self._val_metric} is "
+                f"{latest_metric:0.5f}, better than last best score "
+                f"{self.best_metric:0.5f} @ iteration {self.best_iter}."
+            )
+            self._update_best(latest_metric, metric_iter)
+        else:
+            self._logger.info(
+                f"Not saving as latest eval score for {self._val_metric} is {latest_metric:0.5f}, "
+                f"not better than best score {self.best_metric:0.5f} @ iteration {self.best_iter}."
+            )
+
+    def after_step(self):
+        # same conditions as `EvalHook`
+        next_iter = self.trainer.iter + 1
+        if (
+            self._period > 0
+            and next_iter % self._period == 0
+            and next_iter != self.trainer.max_iter
+        ):
+            self._best_checking()
+
+    def after_train(self):
+        # same conditions as `EvalHook`
+        if self.trainer.iter + 1 >= self.trainer.max_iter:
+            self._best_checking()
+
+
+class LRScheduler(HookBase):
+    """
+    A hook which executes a torch builtin LR scheduler and summarizes the LR.
+    It is executed after every iteration.
+    """
+
+    def __init__(self, optimizer=None, scheduler=None):
+        """
+        Args:
+            optimizer (torch.optim.Optimizer):
+            scheduler (torch.optim.LRScheduler or fvcore.common.param_scheduler.ParamScheduler):
+                if a :class:`ParamScheduler` object, it defines the multiplier over the base LR
+                in the optimizer.
+
+        If any argument is not given, will try to obtain it from the trainer.
+        """
+        self._optimizer = optimizer
+        self._scheduler = scheduler
+
+    def before_train(self):
+        self._optimizer = self._optimizer or self.trainer.optimizer
+        if isinstance(self.scheduler, ParamScheduler):
+            self._scheduler = LRMultiplier(
+                self._optimizer,
+                self.scheduler,
+                self.trainer.max_iter,
+                last_iter=self.trainer.iter - 1,
+            )
+        self._best_param_group_id = LRScheduler.get_best_param_group_id(self._optimizer)
+
+    @staticmethod
+    def get_best_param_group_id(optimizer):
+        # NOTE: some heuristics on what LR to summarize
+        # summarize the param group with most parameters
+        largest_group = max(len(g["params"]) for g in optimizer.param_groups)
+
+        if largest_group == 1:
+            # If all groups have one parameter,
+            # then find the most common initial LR, and use it for summary
+            lr_count = Counter([g["lr"] for g in optimizer.param_groups])
+            lr = lr_count.most_common()[0][0]
+            for i, g in enumerate(optimizer.param_groups):
+                if g["lr"] == lr:
+                    return i
+        else:
+            for i, g in enumerate(optimizer.param_groups):
+                if len(g["params"]) == largest_group:
+                    return i
+
+    def after_step(self):
+        lr = self._optimizer.param_groups[self._best_param_group_id]["lr"]
+        self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False)
+        self.scheduler.step()
+
+    @property
+    def scheduler(self):
+        return self._scheduler or self.trainer.scheduler
+
+    def state_dict(self):
+        if isinstance(self.scheduler, _LRScheduler):
+            return self.scheduler.state_dict()
+        return {}
+
+    def load_state_dict(self, state_dict):
+        if isinstance(self.scheduler, _LRScheduler):
+            logger = logging.getLogger(__name__)
+            logger.info("Loading scheduler from state_dict ...")
+            self.scheduler.load_state_dict(state_dict)
+
+
+class TorchProfiler(HookBase):
+    """
+    A hook which runs `torch.profiler.profile`.
+
+    Examples:
+    ::
+        hooks.TorchProfiler(
+             lambda trainer: 10 < trainer.iter < 20, self.cfg.OUTPUT_DIR
+        )
+
+    The above example will run the profiler for iteration 10~20 and dump
+    results to ``OUTPUT_DIR``. We did not profile the first few iterations
+    because they are typically slower than the rest.
+    The result files can be loaded in the ``chrome://tracing`` page in chrome browser,
+    and the tensorboard visualizations can be visualized using
+    ``tensorboard --logdir OUTPUT_DIR/log``
+    """
+
+    def __init__(self, enable_predicate, output_dir, *, activities=None, save_tensorboard=True):
+        """
+        Args:
+            enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
+                and returns whether to enable the profiler.
+                It will be called once every step, and can be used to select which steps to profile.
+            output_dir (str): the output directory to dump tracing files.
+            activities (iterable): same as in `torch.profiler.profile`.
+            save_tensorboard (bool): whether to save tensorboard visualizations at (output_dir)/log/
+        """
+        self._enable_predicate = enable_predicate
+        self._activities = activities
+        self._output_dir = output_dir
+        self._save_tensorboard = save_tensorboard
+
+    def before_step(self):
+        if self._enable_predicate(self.trainer):
+            if self._save_tensorboard:
+                on_trace_ready = torch.profiler.tensorboard_trace_handler(
+                    os.path.join(
+                        self._output_dir,
+                        "log",
+                        "profiler-tensorboard-iter{}".format(self.trainer.iter),
+                    ),
+                    f"worker{comm.get_rank()}",
+                )
+            else:
+                on_trace_ready = None
+            self._profiler = torch.profiler.profile(
+                activities=self._activities,
+                on_trace_ready=on_trace_ready,
+                record_shapes=True,
+                profile_memory=True,
+                with_stack=True,
+                with_flops=True,
+            )
+            self._profiler.__enter__()
+        else:
+            self._profiler = None
+
+    def after_step(self):
+        if self._profiler is None:
+            return
+        self._profiler.__exit__(None, None, None)
+        if not self._save_tensorboard:
+            PathManager.mkdirs(self._output_dir)
+            out_file = os.path.join(
+                self._output_dir, "profiler-trace-iter{}.json".format(self.trainer.iter)
+            )
+            if "://" not in out_file:
+                self._profiler.export_chrome_trace(out_file)
+            else:
+                # Support non-posix filesystems
+                with tempfile.TemporaryDirectory(prefix="detectron2_profiler") as d:
+                    tmp_file = os.path.join(d, "tmp.json")
+                    self._profiler.export_chrome_trace(tmp_file)
+                    with open(tmp_file) as f:
+                        content = f.read()
+                with PathManager.open(out_file, "w") as f:
+                    f.write(content)
+
+
+class AutogradProfiler(TorchProfiler):
+    """
+    A hook which runs `torch.autograd.profiler.profile`.
+
+    Examples:
+    ::
+        hooks.AutogradProfiler(
+             lambda trainer: 10 < trainer.iter < 20, self.cfg.OUTPUT_DIR
+        )
+
+    The above example will run the profiler for iteration 10~20 and dump
+    results to ``OUTPUT_DIR``. We did not profile the first few iterations
+    because they are typically slower than the rest.
+    The result files can be loaded in the ``chrome://tracing`` page in chrome browser.
+
+    Note:
+        When used together with NCCL on older version of GPUs,
+        autograd profiler may cause deadlock because it unnecessarily allocates
+        memory on every device it sees. The memory management calls, if
+        interleaved with NCCL calls, lead to deadlock on GPUs that do not
+        support ``cudaLaunchCooperativeKernelMultiDevice``.
+    """
+
+    def __init__(self, enable_predicate, output_dir, *, use_cuda=True):
+        """
+        Args:
+            enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
+                and returns whether to enable the profiler.
+                It will be called once every step, and can be used to select which steps to profile.
+            output_dir (str): the output directory to dump tracing files.
+            use_cuda (bool): same as in `torch.autograd.profiler.profile`.
+        """
+        warnings.warn("AutogradProfiler has been deprecated in favor of TorchProfiler.")
+        self._enable_predicate = enable_predicate
+        self._use_cuda = use_cuda
+        self._output_dir = output_dir
+
+    def before_step(self):
+        if self._enable_predicate(self.trainer):
+            self._profiler = torch.autograd.profiler.profile(use_cuda=self._use_cuda)
+            self._profiler.__enter__()
+        else:
+            self._profiler = None
+
+
+class EvalHook(HookBase):
+    """
+    Run an evaluation function periodically, and at the end of training.
+
+    It is executed every ``eval_period`` iterations and after the last iteration.
+    """
+
+    def __init__(self, eval_period, eval_function, eval_after_train=True):
+        """
+        Args:
+            eval_period (int): the period to run `eval_function`. Set to 0 to
+                not evaluate periodically (but still evaluate after the last iteration
+                if `eval_after_train` is True).
+            eval_function (callable): a function which takes no arguments, and
+                returns a nested dict of evaluation metrics.
+            eval_after_train (bool): whether to evaluate after the last iteration
+
+        Note:
+            This hook must be enabled in all or none workers.
+            If you would like only certain workers to perform evaluation,
+            give other workers a no-op function (`eval_function=lambda: None`).
+        """
+        self._period = eval_period
+        self._func = eval_function
+        self._eval_after_train = eval_after_train
+
+    def _do_eval(self):
+        results = self._func()
+
+        if results:
+            assert isinstance(
+                results, dict
+            ), "Eval function must return a dict. Got {} instead.".format(results)
+
+            flattened_results = flatten_results_dict(results)
+            for k, v in flattened_results.items():
+                try:
+                    v = float(v)
+                except Exception as e:
+                    raise ValueError(
+                        "[EvalHook] eval_function should return a nested dict of float. "
+                        "Got '{}: {}' instead.".format(k, v)
+                    ) from e
+            self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
+
+        # Evaluation may take different time among workers.
+        # A barrier make them start the next iteration together.
+        comm.synchronize()
+
+    def after_step(self):
+        next_iter = self.trainer.iter + 1
+        if self._period > 0 and next_iter % self._period == 0:
+            # do the last eval in after_train
+            if next_iter != self.trainer.max_iter:
+                self._do_eval()
+
+    def after_train(self):
+        # This condition is to prevent the eval from running after a failed training
+        if self._eval_after_train and self.trainer.iter + 1 >= self.trainer.max_iter:
+            self._do_eval()
+        # func is likely a closure that holds reference to the trainer
+        # therefore we clean it to avoid circular reference in the end
+        del self._func
+
+
+class PreciseBN(HookBase):
+    """
+    The standard implementation of BatchNorm uses EMA in inference, which is
+    sometimes suboptimal.
+    This class computes the true average of statistics rather than the moving average,
+    and put true averages to every BN layer in the given model.
+
+    It is executed every ``period`` iterations and after the last iteration.
+    """
+
+    def __init__(self, period, model, data_loader, num_iter):
+        """
+        Args:
+            period (int): the period this hook is run, or 0 to not run during training.
+                The hook will always run in the end of training.
+            model (nn.Module): a module whose all BN layers in training mode will be
+                updated by precise BN.
+                Note that user is responsible for ensuring the BN layers to be
+                updated are in training mode when this hook is triggered.
+            data_loader (iterable): it will produce data to be run by `model(data)`.
+            num_iter (int): number of iterations used to compute the precise
+                statistics.
+        """
+        self._logger = logging.getLogger(__name__)
+        if len(get_bn_modules(model)) == 0:
+            self._logger.info(
+                "PreciseBN is disabled because model does not contain BN layers in training mode."
+            )
+            self._disabled = True
+            return
+
+        self._model = model
+        self._data_loader = data_loader
+        self._num_iter = num_iter
+        self._period = period
+        self._disabled = False
+
+        self._data_iter = None
+
+    def after_step(self):
+        next_iter = self.trainer.iter + 1
+        is_final = next_iter == self.trainer.max_iter
+        if is_final or (self._period > 0 and next_iter % self._period == 0):
+            self.update_stats()
+
+    def update_stats(self):
+        """
+        Update the model with precise statistics. Users can manually call this method.
+        """
+        if self._disabled:
+            return
+
+        if self._data_iter is None:
+            self._data_iter = iter(self._data_loader)
+
+        def data_loader():
+            for num_iter in itertools.count(1):
+                if num_iter % 100 == 0:
+                    self._logger.info(
+                        "Running precise-BN ... {}/{} iterations.".format(num_iter, self._num_iter)
+                    )
+                # This way we can reuse the same iterator
+                yield next(self._data_iter)
+
+        with EventStorage():  # capture events in a new storage to discard them
+            self._logger.info(
+                "Running precise-BN for {} iterations...  ".format(self._num_iter)
+                + "Note that this could produce different statistics every time."
+            )
+            update_bn_stats(self._model, data_loader(), self._num_iter)
+
+
+class TorchMemoryStats(HookBase):
+    """
+    Writes pytorch's cuda memory statistics periodically.
+    """
+
+    def __init__(self, period=20, max_runs=10):
+        """
+        Args:
+            period (int): Output stats each 'period' iterations
+            max_runs (int): Stop the logging after 'max_runs'
+        """
+
+        self._logger = logging.getLogger(__name__)
+        self._period = period
+        self._max_runs = max_runs
+        self._runs = 0
+
+    def after_step(self):
+        if self._runs > self._max_runs:
+            return
+
+        if (self.trainer.iter + 1) % self._period == 0 or (
+            self.trainer.iter == self.trainer.max_iter - 1
+        ):
+            if torch.cuda.is_available():
+                max_reserved_mb = torch.cuda.max_memory_reserved() / 1024.0 / 1024.0
+                reserved_mb = torch.cuda.memory_reserved() / 1024.0 / 1024.0
+                max_allocated_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
+                allocated_mb = torch.cuda.memory_allocated() / 1024.0 / 1024.0
+
+                self._logger.info(
+                    (
+                        " iter: {} "
+                        " max_reserved_mem: {:.0f}MB "
+                        " reserved_mem: {:.0f}MB "
+                        " max_allocated_mem: {:.0f}MB "
+                        " allocated_mem: {:.0f}MB "
+                    ).format(
+                        self.trainer.iter,
+                        max_reserved_mb,
+                        reserved_mb,
+                        max_allocated_mb,
+                        allocated_mb,
+                    )
+                )
+
+                self._runs += 1
+                if self._runs == self._max_runs:
+                    mem_summary = torch.cuda.memory_summary()
+                    self._logger.info("\n" + mem_summary)
+
+                torch.cuda.reset_peak_memory_stats()
diff --git a/detectron2/engine/launch.py b/detectron2/engine/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..7052c5040e4d9e6553a1b371518cb53fb056524e
--- /dev/null
+++ b/detectron2/engine/launch.py
@@ -0,0 +1,123 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from datetime import timedelta
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+from detectron2.utils import comm
+
+__all__ = ["DEFAULT_TIMEOUT", "launch"]
+
+DEFAULT_TIMEOUT = timedelta(minutes=30)
+
+
+def _find_free_port():
+    import socket
+
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    # Binding to port 0 will cause the OS to find an available port for us
+    sock.bind(("", 0))
+    port = sock.getsockname()[1]
+    sock.close()
+    # NOTE: there is still a chance the port could be taken by other processes.
+    return port
+
+
+def launch(
+    main_func,
+    # Should be num_processes_per_machine, but kept for compatibility.
+    num_gpus_per_machine,
+    num_machines=1,
+    machine_rank=0,
+    dist_url=None,
+    args=(),
+    timeout=DEFAULT_TIMEOUT,
+):
+    """
+    Launch multi-process or distributed training.
+    This function must be called on all machines involved in the training.
+    It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine.
+
+    Args:
+        main_func: a function that will be called by `main_func(*args)`
+        num_gpus_per_machine (int): number of processes per machine. When
+            using GPUs, this should be the number of GPUs.
+        num_machines (int): the total number of machines
+        machine_rank (int): the rank of this machine
+        dist_url (str): url to connect to for distributed jobs, including protocol
+                       e.g. "tcp://127.0.0.1:8686".
+                       Can be set to "auto" to automatically select a free port on localhost
+        timeout (timedelta): timeout of the distributed workers
+        args (tuple): arguments passed to main_func
+    """
+    world_size = num_machines * num_gpus_per_machine
+    if world_size > 1:
+        # https://github.com/pytorch/pytorch/pull/14391
+        # TODO prctl in spawned processes
+
+        if dist_url == "auto":
+            assert num_machines == 1, "dist_url=auto not supported in multi-machine jobs."
+            port = _find_free_port()
+            dist_url = f"tcp://127.0.0.1:{port}"
+        if num_machines > 1 and dist_url.startswith("file://"):
+            logger = logging.getLogger(__name__)
+            logger.warning(
+                "file:// is not a reliable init_method in multi-machine jobs. Prefer tcp://"
+            )
+
+        mp.start_processes(
+            _distributed_worker,
+            nprocs=num_gpus_per_machine,
+            args=(
+                main_func,
+                world_size,
+                num_gpus_per_machine,
+                machine_rank,
+                dist_url,
+                args,
+                timeout,
+            ),
+            daemon=False,
+        )
+    else:
+        main_func(*args)
+
+
+def _distributed_worker(
+    local_rank,
+    main_func,
+    world_size,
+    num_gpus_per_machine,
+    machine_rank,
+    dist_url,
+    args,
+    timeout=DEFAULT_TIMEOUT,
+):
+    has_gpu = torch.cuda.is_available()
+    if has_gpu:
+        assert num_gpus_per_machine <= torch.cuda.device_count()
+    global_rank = machine_rank * num_gpus_per_machine + local_rank
+    try:
+        dist.init_process_group(
+            backend="NCCL" if has_gpu else "GLOO",
+            init_method=dist_url,
+            world_size=world_size,
+            rank=global_rank,
+            timeout=timeout,
+        )
+    except Exception as e:
+        logger = logging.getLogger(__name__)
+        logger.error("Process group URL: {}".format(dist_url))
+        raise e
+
+    # Setup the local process group.
+    comm.create_local_process_group(num_gpus_per_machine)
+    if has_gpu:
+        torch.cuda.set_device(local_rank)
+
+    # synchronize is needed here to prevent a possible timeout after calling init_process_group
+    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
+    comm.synchronize()
+
+    main_func(*args)
diff --git a/detectron2/engine/train_loop.py b/detectron2/engine/train_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..738a69de946ae7741e2e16d322592076b3d1014d
--- /dev/null
+++ b/detectron2/engine/train_loop.py
@@ -0,0 +1,530 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+import concurrent.futures
+import logging
+import numpy as np
+import time
+import weakref
+from typing import List, Mapping, Optional
+import torch
+from torch.nn.parallel import DataParallel, DistributedDataParallel
+
+import detectron2.utils.comm as comm
+from detectron2.utils.events import EventStorage, get_event_storage
+from detectron2.utils.logger import _log_api_usage
+
+__all__ = ["HookBase", "TrainerBase", "SimpleTrainer", "AMPTrainer"]
+
+
+class HookBase:
+    """
+    Base class for hooks that can be registered with :class:`TrainerBase`.
+
+    Each hook can implement 4 methods. The way they are called is demonstrated
+    in the following snippet:
+    ::
+        hook.before_train()
+        for iter in range(start_iter, max_iter):
+            hook.before_step()
+            trainer.run_step()
+            hook.after_step()
+        iter += 1
+        hook.after_train()
+
+    Notes:
+        1. In the hook method, users can access ``self.trainer`` to access more
+           properties about the context (e.g., model, current iteration, or config
+           if using :class:`DefaultTrainer`).
+
+        2. A hook that does something in :meth:`before_step` can often be
+           implemented equivalently in :meth:`after_step`.
+           If the hook takes non-trivial time, it is strongly recommended to
+           implement the hook in :meth:`after_step` instead of :meth:`before_step`.
+           The convention is that :meth:`before_step` should only take negligible time.
+
+           Following this convention will allow hooks that do care about the difference
+           between :meth:`before_step` and :meth:`after_step` (e.g., timer) to
+           function properly.
+
+    """
+
+    trainer: "TrainerBase" = None
+    """
+    A weak reference to the trainer object. Set by the trainer when the hook is registered.
+    """
+
+    def before_train(self):
+        """
+        Called before the first iteration.
+        """
+        pass
+
+    def after_train(self):
+        """
+        Called after the last iteration.
+        """
+        pass
+
+    def before_step(self):
+        """
+        Called before each iteration.
+        """
+        pass
+
+    def after_backward(self):
+        """
+        Called after the backward pass of each iteration.
+        """
+        pass
+
+    def after_step(self):
+        """
+        Called after each iteration.
+        """
+        pass
+
+    def state_dict(self):
+        """
+        Hooks are stateless by default, but can be made checkpointable by
+        implementing `state_dict` and `load_state_dict`.
+        """
+        return {}
+
+
+class TrainerBase:
+    """
+    Base class for iterative trainer with hooks.
+
+    The only assumption we made here is: the training runs in a loop.
+    A subclass can implement what the loop is.
+    We made no assumptions about the existence of dataloader, optimizer, model, etc.
+
+    Attributes:
+        iter(int): the current iteration.
+
+        start_iter(int): The iteration to start with.
+            By convention the minimum possible value is 0.
+
+        max_iter(int): The iteration to end training.
+
+        storage(EventStorage): An EventStorage that's opened during the course of training.
+    """
+
+    def __init__(self) -> None:
+        self._hooks: List[HookBase] = []
+        self.iter: int = 0
+        self.start_iter: int = 0
+        self.max_iter: int
+        self.storage: EventStorage
+        _log_api_usage("trainer." + self.__class__.__name__)
+
+    def register_hooks(self, hooks: List[Optional[HookBase]]) -> None:
+        """
+        Register hooks to the trainer. The hooks are executed in the order
+        they are registered.
+
+        Args:
+            hooks (list[Optional[HookBase]]): list of hooks
+        """
+        hooks = [h for h in hooks if h is not None]
+        for h in hooks:
+            assert isinstance(h, HookBase)
+            # To avoid circular reference, hooks and trainer cannot own each other.
+            # This normally does not matter, but will cause memory leak if the
+            # involved objects contain __del__:
+            # See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
+            h.trainer = weakref.proxy(self)
+        self._hooks.extend(hooks)
+
+    def train(self, start_iter: int, max_iter: int):
+        """
+        Args:
+            start_iter, max_iter (int): See docs above
+        """
+        logger = logging.getLogger(__name__)
+        logger.info("Starting training from iteration {}".format(start_iter))
+
+        self.iter = self.start_iter = start_iter
+        self.max_iter = max_iter
+
+        with EventStorage(start_iter) as self.storage:
+            try:
+                self.before_train()
+                for self.iter in range(start_iter, max_iter):
+                    self.before_step()
+                    self.run_step()
+                    self.after_step()
+                # self.iter == max_iter can be used by `after_train` to
+                # tell whether the training successfully finished or failed
+                # due to exceptions.
+                self.iter += 1
+            except Exception:
+                logger.exception("Exception during training:")
+                raise
+            finally:
+                self.after_train()
+
+    def before_train(self):
+        for h in self._hooks:
+            h.before_train()
+
+    def after_train(self):
+        self.storage.iter = self.iter
+        for h in self._hooks:
+            h.after_train()
+
+    def before_step(self):
+        # Maintain the invariant that storage.iter == trainer.iter
+        # for the entire execution of each step
+        self.storage.iter = self.iter
+
+        for h in self._hooks:
+            h.before_step()
+
+    def after_backward(self):
+        for h in self._hooks:
+            h.after_backward()
+
+    def after_step(self):
+        for h in self._hooks:
+            h.after_step()
+
+    def run_step(self):
+        raise NotImplementedError
+
+    def state_dict(self):
+        ret = {"iteration": self.iter}
+        hooks_state = {}
+        for h in self._hooks:
+            sd = h.state_dict()
+            if sd:
+                name = type(h).__qualname__
+                if name in hooks_state:
+                    # TODO handle repetitive stateful hooks
+                    continue
+                hooks_state[name] = sd
+        if hooks_state:
+            ret["hooks"] = hooks_state
+        return ret
+
+    def load_state_dict(self, state_dict):
+        logger = logging.getLogger(__name__)
+        self.iter = state_dict["iteration"]
+        for key, value in state_dict.get("hooks", {}).items():
+            for h in self._hooks:
+                try:
+                    name = type(h).__qualname__
+                except AttributeError:
+                    continue
+                if name == key:
+                    h.load_state_dict(value)
+                    break
+            else:
+                logger.warning(f"Cannot find the hook '{key}', its state_dict is ignored.")
+
+
+class SimpleTrainer(TrainerBase):
+    """
+    A simple trainer for the most common type of task:
+    single-cost single-optimizer single-data-source iterative optimization,
+    optionally using data-parallelism.
+    It assumes that every step, you:
+
+    1. Compute the loss with a data from the data_loader.
+    2. Compute the gradients with the above loss.
+    3. Update the model with the optimizer.
+
+    All other tasks during training (checkpointing, logging, evaluation, LR schedule)
+    are maintained by hooks, which can be registered by :meth:`TrainerBase.register_hooks`.
+
+    If you want to do anything fancier than this,
+    either subclass TrainerBase and implement your own `run_step`,
+    or write your own training loop.
+    """
+
+    def __init__(
+        self,
+        model,
+        data_loader,
+        optimizer,
+        gather_metric_period=1,
+        zero_grad_before_forward=False,
+        async_write_metrics=False,
+    ):
+        """
+        Args:
+            model: a torch Module. Takes a data from data_loader and returns a
+                dict of losses.
+            data_loader: an iterable. Contains data to be used to call model.
+            optimizer: a torch optimizer.
+            gather_metric_period: an int. Every gather_metric_period iterations
+                the metrics are gathered from all the ranks to rank 0 and logged.
+            zero_grad_before_forward: whether to zero the gradients before the forward.
+            async_write_metrics: bool. If True, then write metrics asynchronously to improve
+                training speed
+        """
+        super().__init__()
+
+        """
+        We set the model to training mode in the trainer.
+        However it's valid to train a model that's in eval mode.
+        If you want your model (or a submodule of it) to behave
+        like evaluation during training, you can overwrite its train() method.
+        """
+        model.train()
+
+        self.model = model
+        self.data_loader = data_loader
+        # to access the data loader iterator, call `self._data_loader_iter`
+        self._data_loader_iter_obj = None
+        self.optimizer = optimizer
+        self.gather_metric_period = gather_metric_period
+        self.zero_grad_before_forward = zero_grad_before_forward
+        self.async_write_metrics = async_write_metrics
+        # create a thread pool that can execute non critical logic in run_step asynchronically
+        # use only 1 worker so tasks will be executred in order of submitting.
+        self.concurrent_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+
+    def run_step(self):
+        """
+        Implement the standard training logic described above.
+        """
+        assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
+        start = time.perf_counter()
+        """
+        If you want to do something with the data, you can wrap the dataloader.
+        """
+        data = next(self._data_loader_iter)
+        data_time = time.perf_counter() - start
+
+        if self.zero_grad_before_forward:
+            """
+            If you need to accumulate gradients or do something similar, you can
+            wrap the optimizer with your custom `zero_grad()` method.
+            """
+            self.optimizer.zero_grad()
+
+        """
+        If you want to do something with the losses, you can wrap the model.
+        """
+        loss_dict = self.model(data)
+        if isinstance(loss_dict, torch.Tensor):
+            losses = loss_dict
+            loss_dict = {"total_loss": loss_dict}
+        else:
+            losses = sum(loss_dict.values())
+        if not self.zero_grad_before_forward:
+            """
+            If you need to accumulate gradients or do something similar, you can
+            wrap the optimizer with your custom `zero_grad()` method.
+            """
+            self.optimizer.zero_grad()
+        losses.backward()
+
+        self.after_backward()
+
+        if self.async_write_metrics:
+            # write metrics asynchronically
+            self.concurrent_executor.submit(
+                self._write_metrics, loss_dict, data_time, iter=self.iter
+            )
+        else:
+            self._write_metrics(loss_dict, data_time)
+
+        """
+        If you need gradient clipping/scaling or other processing, you can
+        wrap the optimizer with your custom `step()` method. But it is
+        suboptimal as explained in https://arxiv.org/abs/2006.15704 Sec 3.2.4
+        """
+        self.optimizer.step()
+
+    @property
+    def _data_loader_iter(self):
+        # only create the data loader iterator when it is used
+        if self._data_loader_iter_obj is None:
+            self._data_loader_iter_obj = iter(self.data_loader)
+        return self._data_loader_iter_obj
+
+    def reset_data_loader(self, data_loader_builder):
+        """
+        Delete and replace the current data loader with a new one, which will be created
+        by calling `data_loader_builder` (without argument).
+        """
+        del self.data_loader
+        data_loader = data_loader_builder()
+        self.data_loader = data_loader
+        self._data_loader_iter_obj = None
+
+    def _write_metrics(
+        self,
+        loss_dict: Mapping[str, torch.Tensor],
+        data_time: float,
+        prefix: str = "",
+        iter: Optional[int] = None,
+    ) -> None:
+        logger = logging.getLogger(__name__)
+
+        iter = self.iter if iter is None else iter
+        if (iter + 1) % self.gather_metric_period == 0:
+            try:
+                SimpleTrainer.write_metrics(loss_dict, data_time, iter, prefix)
+            except Exception:
+                logger.exception("Exception in writing metrics: ")
+                raise
+
+    @staticmethod
+    def write_metrics(
+        loss_dict: Mapping[str, torch.Tensor],
+        data_time: float,
+        cur_iter: int,
+        prefix: str = "",
+    ) -> None:
+        """
+        Args:
+            loss_dict (dict): dict of scalar losses
+            data_time (float): time taken by the dataloader iteration
+            prefix (str): prefix for logging keys
+        """
+        metrics_dict = {k: v.detach().cpu().item() for k, v in loss_dict.items()}
+        metrics_dict["data_time"] = data_time
+
+        storage = get_event_storage()
+        # Keep track of data time per rank
+        storage.put_scalar("rank_data_time", data_time, cur_iter=cur_iter)
+
+        # Gather metrics among all workers for logging
+        # This assumes we do DDP-style training, which is currently the only
+        # supported method in detectron2.
+        all_metrics_dict = comm.gather(metrics_dict)
+
+        if comm.is_main_process():
+            # data_time among workers can have high variance. The actual latency
+            # caused by data_time is the maximum among workers.
+            data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
+            storage.put_scalar("data_time", data_time, cur_iter=cur_iter)
+
+            # average the rest metrics
+            metrics_dict = {
+                k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys()
+            }
+            total_losses_reduced = sum(metrics_dict.values())
+            if not np.isfinite(total_losses_reduced):
+                raise FloatingPointError(
+                    f"Loss became infinite or NaN at iteration={cur_iter}!\n"
+                    f"loss_dict = {metrics_dict}"
+                )
+
+            storage.put_scalar(
+                "{}total_loss".format(prefix), total_losses_reduced, cur_iter=cur_iter
+            )
+            if len(metrics_dict) > 1:
+                storage.put_scalars(cur_iter=cur_iter, **metrics_dict)
+
+    def state_dict(self):
+        ret = super().state_dict()
+        ret["optimizer"] = self.optimizer.state_dict()
+        return ret
+
+    def load_state_dict(self, state_dict):
+        super().load_state_dict(state_dict)
+        self.optimizer.load_state_dict(state_dict["optimizer"])
+
+    def after_train(self):
+        super().after_train()
+        self.concurrent_executor.shutdown(wait=True)
+
+
+class AMPTrainer(SimpleTrainer):
+    """
+    Like :class:`SimpleTrainer`, but uses PyTorch's native automatic mixed precision
+    in the training loop.
+    """
+
+    def __init__(
+        self,
+        model,
+        data_loader,
+        optimizer,
+        gather_metric_period=1,
+        zero_grad_before_forward=False,
+        grad_scaler=None,
+        precision: torch.dtype = torch.float16,
+        log_grad_scaler: bool = False,
+        async_write_metrics=False,
+    ):
+        """
+        Args:
+            model, data_loader, optimizer, gather_metric_period, zero_grad_before_forward,
+                async_write_metrics: same as in :class:`SimpleTrainer`.
+            grad_scaler: torch GradScaler to automatically scale gradients.
+            precision: torch.dtype as the target precision to cast to in computations
+        """
+        unsupported = "AMPTrainer does not support single-process multi-device training!"
+        if isinstance(model, DistributedDataParallel):
+            assert not (model.device_ids and len(model.device_ids) > 1), unsupported
+        assert not isinstance(model, DataParallel), unsupported
+
+        super().__init__(
+            model, data_loader, optimizer, gather_metric_period, zero_grad_before_forward
+        )
+
+        if grad_scaler is None:
+            from torch.cuda.amp import GradScaler
+
+            grad_scaler = GradScaler()
+        self.grad_scaler = grad_scaler
+        self.precision = precision
+        self.log_grad_scaler = log_grad_scaler
+
+    def run_step(self):
+        """
+        Implement the AMP training logic.
+        """
+        assert self.model.training, "[AMPTrainer] model was changed to eval mode!"
+        assert torch.cuda.is_available(), "[AMPTrainer] CUDA is required for AMP training!"
+        from torch.cuda.amp import autocast
+
+        start = time.perf_counter()
+        data = next(self._data_loader_iter)
+        data_time = time.perf_counter() - start
+
+        if self.zero_grad_before_forward:
+            self.optimizer.zero_grad()
+        with autocast(dtype=self.precision):
+            loss_dict = self.model(data)
+            if isinstance(loss_dict, torch.Tensor):
+                losses = loss_dict
+                loss_dict = {"total_loss": loss_dict}
+            else:
+                losses = sum(loss_dict.values())
+
+        if not self.zero_grad_before_forward:
+            self.optimizer.zero_grad()
+
+        self.grad_scaler.scale(losses).backward()
+
+        if self.log_grad_scaler:
+            storage = get_event_storage()
+            storage.put_scalar("[metric]grad_scaler", self.grad_scaler.get_scale())
+
+        self.after_backward()
+
+        if self.async_write_metrics:
+            # write metrics asynchronically
+            self.concurrent_executor.submit(
+                self._write_metrics, loss_dict, data_time, iter=self.iter
+            )
+        else:
+            self._write_metrics(loss_dict, data_time)
+
+        self.grad_scaler.step(self.optimizer)
+        self.grad_scaler.update()
+
+    def state_dict(self):
+        ret = super().state_dict()
+        ret["grad_scaler"] = self.grad_scaler.state_dict()
+        return ret
+
+    def load_state_dict(self, state_dict):
+        super().load_state_dict(state_dict)
+        self.grad_scaler.load_state_dict(state_dict["grad_scaler"])
diff --git a/detectron2/evaluation/__init__.py b/detectron2/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d96609e8f2261a6800fe85fcf3e1eaeaa44455c6
--- /dev/null
+++ b/detectron2/evaluation/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .cityscapes_evaluation import CityscapesInstanceEvaluator, CityscapesSemSegEvaluator
+from .coco_evaluation import COCOEvaluator
+from .rotated_coco_evaluation import RotatedCOCOEvaluator
+from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset
+from .lvis_evaluation import LVISEvaluator
+from .panoptic_evaluation import COCOPanopticEvaluator
+from .pascal_voc_evaluation import PascalVOCDetectionEvaluator
+from .sem_seg_evaluation import SemSegEvaluator
+from .testing import print_csv_format, verify_results
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/evaluation/__pycache__/__init__.cpython-310.pyc b/detectron2/evaluation/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9310725795ac41ae4ec1e20df334c1b0f6b3289
Binary files /dev/null and b/detectron2/evaluation/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/__init__.cpython-39.pyc b/detectron2/evaluation/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2526102bb4ddb9d2a256ce2cb5ff04e6cd13c701
Binary files /dev/null and b/detectron2/evaluation/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/cityscapes_evaluation.cpython-310.pyc b/detectron2/evaluation/__pycache__/cityscapes_evaluation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6234bec62864ff651fba1f4b7bd6ee9c8d4b907
Binary files /dev/null and b/detectron2/evaluation/__pycache__/cityscapes_evaluation.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/cityscapes_evaluation.cpython-39.pyc b/detectron2/evaluation/__pycache__/cityscapes_evaluation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9176e063fb64a02ceea01f18446550585e0743ed
Binary files /dev/null and b/detectron2/evaluation/__pycache__/cityscapes_evaluation.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/coco_evaluation.cpython-310.pyc b/detectron2/evaluation/__pycache__/coco_evaluation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9613355c47eb55a7a7e653eb383b22ac71abcf65
Binary files /dev/null and b/detectron2/evaluation/__pycache__/coco_evaluation.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/coco_evaluation.cpython-39.pyc b/detectron2/evaluation/__pycache__/coco_evaluation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..705ef3230983b9178d5b7efe87ba0c192df00ae5
Binary files /dev/null and b/detectron2/evaluation/__pycache__/coco_evaluation.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/evaluator.cpython-310.pyc b/detectron2/evaluation/__pycache__/evaluator.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..986d1433a2a8ebff166ee21eaebeb9450765679f
Binary files /dev/null and b/detectron2/evaluation/__pycache__/evaluator.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/evaluator.cpython-39.pyc b/detectron2/evaluation/__pycache__/evaluator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..daaf9b06ea18341e95446c4ccd3b2c7b83aa8964
Binary files /dev/null and b/detectron2/evaluation/__pycache__/evaluator.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/fast_eval_api.cpython-310.pyc b/detectron2/evaluation/__pycache__/fast_eval_api.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad3593909225b9c8a9b4df081e232409bb38094a
Binary files /dev/null and b/detectron2/evaluation/__pycache__/fast_eval_api.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/fast_eval_api.cpython-39.pyc b/detectron2/evaluation/__pycache__/fast_eval_api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32cd122ed855d6ee3a71759799f2ff452d51aa50
Binary files /dev/null and b/detectron2/evaluation/__pycache__/fast_eval_api.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/lvis_evaluation.cpython-310.pyc b/detectron2/evaluation/__pycache__/lvis_evaluation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c18f9cba9c686d18ec62745d8e4c07231bb5e92f
Binary files /dev/null and b/detectron2/evaluation/__pycache__/lvis_evaluation.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/lvis_evaluation.cpython-39.pyc b/detectron2/evaluation/__pycache__/lvis_evaluation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86ea4e8829ce207a7ae0ff895579a56829014323
Binary files /dev/null and b/detectron2/evaluation/__pycache__/lvis_evaluation.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/panoptic_evaluation.cpython-310.pyc b/detectron2/evaluation/__pycache__/panoptic_evaluation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ce9da4b93ff9c2147f65e950fd9c8944674669e
Binary files /dev/null and b/detectron2/evaluation/__pycache__/panoptic_evaluation.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/panoptic_evaluation.cpython-39.pyc b/detectron2/evaluation/__pycache__/panoptic_evaluation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c700f37f837effe44fabb7342102ac19ba303ff1
Binary files /dev/null and b/detectron2/evaluation/__pycache__/panoptic_evaluation.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/pascal_voc_evaluation.cpython-310.pyc b/detectron2/evaluation/__pycache__/pascal_voc_evaluation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23716ad0fc2225b7d293017b3f93fb3cfdec02dc
Binary files /dev/null and b/detectron2/evaluation/__pycache__/pascal_voc_evaluation.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/pascal_voc_evaluation.cpython-39.pyc b/detectron2/evaluation/__pycache__/pascal_voc_evaluation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..410e15054436a2a6f7e21b43821f0a77c4e4bd15
Binary files /dev/null and b/detectron2/evaluation/__pycache__/pascal_voc_evaluation.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/rotated_coco_evaluation.cpython-310.pyc b/detectron2/evaluation/__pycache__/rotated_coco_evaluation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f4130eec8149ad5f8916c761999b072e15b1277
Binary files /dev/null and b/detectron2/evaluation/__pycache__/rotated_coco_evaluation.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/rotated_coco_evaluation.cpython-39.pyc b/detectron2/evaluation/__pycache__/rotated_coco_evaluation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68b48102dc300ca4755e99252ffdcc3810ead80f
Binary files /dev/null and b/detectron2/evaluation/__pycache__/rotated_coco_evaluation.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/sem_seg_evaluation.cpython-310.pyc b/detectron2/evaluation/__pycache__/sem_seg_evaluation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79d51b22945e4341cc6189714c03fbf878d81e1e
Binary files /dev/null and b/detectron2/evaluation/__pycache__/sem_seg_evaluation.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/sem_seg_evaluation.cpython-39.pyc b/detectron2/evaluation/__pycache__/sem_seg_evaluation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..18a9081e3150e2bb38cca2ee97792e56383ab180
Binary files /dev/null and b/detectron2/evaluation/__pycache__/sem_seg_evaluation.cpython-39.pyc differ
diff --git a/detectron2/evaluation/__pycache__/testing.cpython-310.pyc b/detectron2/evaluation/__pycache__/testing.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d5c89ceccf1d7c18952ca0e8ea5c80e39b7216b
Binary files /dev/null and b/detectron2/evaluation/__pycache__/testing.cpython-310.pyc differ
diff --git a/detectron2/evaluation/__pycache__/testing.cpython-39.pyc b/detectron2/evaluation/__pycache__/testing.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..559e9dadb1e2cc9621a67ac713515b9b9ddc62af
Binary files /dev/null and b/detectron2/evaluation/__pycache__/testing.cpython-39.pyc differ
diff --git a/detectron2/evaluation/cityscapes_evaluation.py b/detectron2/evaluation/cityscapes_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cc7888f0f88ed9b44eae942353a9f4dd4b8782a
--- /dev/null
+++ b/detectron2/evaluation/cityscapes_evaluation.py
@@ -0,0 +1,197 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import glob
+import logging
+import numpy as np
+import os
+import tempfile
+from collections import OrderedDict
+import torch
+from PIL import Image
+
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+from detectron2.utils.file_io import PathManager
+
+from .evaluator import DatasetEvaluator
+
+
+class CityscapesEvaluator(DatasetEvaluator):
+    """
+    Base class for evaluation using cityscapes API.
+    """
+
+    def __init__(self, dataset_name):
+        """
+        Args:
+            dataset_name (str): the name of the dataset.
+                It must have the following metadata associated with it:
+                "thing_classes", "gt_dir".
+        """
+        self._metadata = MetadataCatalog.get(dataset_name)
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+    def reset(self):
+        self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_")
+        self._temp_dir = self._working_dir.name
+        # All workers will write to the same results directory
+        # TODO this does not work in distributed training
+        assert (
+            comm.get_local_size() == comm.get_world_size()
+        ), "CityscapesEvaluator currently do not work with multiple machines."
+        self._temp_dir = comm.all_gather(self._temp_dir)[0]
+        if self._temp_dir != self._working_dir.name:
+            self._working_dir.cleanup()
+        self._logger.info(
+            "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir)
+        )
+
+
+class CityscapesInstanceEvaluator(CityscapesEvaluator):
+    """
+    Evaluate instance segmentation results on cityscapes dataset using cityscapes API.
+
+    Note:
+        * It does not work in multi-machine distributed training.
+        * It contains a synchronization, therefore has to be used on all ranks.
+        * Only the main process runs evaluation.
+    """
+
+    def process(self, inputs, outputs):
+        from cityscapesscripts.helpers.labels import name2label
+
+        for input, output in zip(inputs, outputs):
+            file_name = input["file_name"]
+            basename = os.path.splitext(os.path.basename(file_name))[0]
+            pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt")
+
+            if "instances" in output:
+                output = output["instances"].to(self._cpu_device)
+                num_instances = len(output)
+                with open(pred_txt, "w") as fout:
+                    for i in range(num_instances):
+                        pred_class = output.pred_classes[i]
+                        classes = self._metadata.thing_classes[pred_class]
+                        class_id = name2label[classes].id
+                        score = output.scores[i]
+                        mask = output.pred_masks[i].numpy().astype("uint8")
+                        png_filename = os.path.join(
+                            self._temp_dir, basename + "_{}_{}.png".format(i, classes)
+                        )
+
+                        Image.fromarray(mask * 255).save(png_filename)
+                        fout.write(
+                            "{} {} {}\n".format(os.path.basename(png_filename), class_id, score)
+                        )
+            else:
+                # Cityscapes requires a prediction file for every ground truth image.
+                with open(pred_txt, "w") as fout:
+                    pass
+
+    def evaluate(self):
+        """
+        Returns:
+            dict: has a key "segm", whose value is a dict of "AP" and "AP50".
+        """
+        comm.synchronize()
+        if comm.get_rank() > 0:
+            return
+        import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval
+
+        self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
+
+        # set some global states in cityscapes evaluation API, before evaluating
+        cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
+        cityscapes_eval.args.predictionWalk = None
+        cityscapes_eval.args.JSONOutput = False
+        cityscapes_eval.args.colorized = False
+        cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")
+
+        # These lines are adopted from
+        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
+        gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
+        groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png"))
+        assert len(
+            groundTruthImgList
+        ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
+            cityscapes_eval.args.groundTruthSearch
+        )
+        predictionImgList = []
+        for gt in groundTruthImgList:
+            predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args))
+        results = cityscapes_eval.evaluateImgLists(
+            predictionImgList, groundTruthImgList, cityscapes_eval.args
+        )["averages"]
+
+        ret = OrderedDict()
+        ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
+        self._working_dir.cleanup()
+        return ret
+
+
+class CityscapesSemSegEvaluator(CityscapesEvaluator):
+    """
+    Evaluate semantic segmentation results on cityscapes dataset using cityscapes API.
+
+    Note:
+        * It does not work in multi-machine distributed training.
+        * It contains a synchronization, therefore has to be used on all ranks.
+        * Only the main process runs evaluation.
+    """
+
+    def process(self, inputs, outputs):
+        from cityscapesscripts.helpers.labels import trainId2label
+
+        for input, output in zip(inputs, outputs):
+            file_name = input["file_name"]
+            basename = os.path.splitext(os.path.basename(file_name))[0]
+            pred_filename = os.path.join(self._temp_dir, basename + "_pred.png")
+
+            output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy()
+            pred = 255 * np.ones(output.shape, dtype=np.uint8)
+            for train_id, label in trainId2label.items():
+                if label.ignoreInEval:
+                    continue
+                pred[output == train_id] = label.id
+            Image.fromarray(pred).save(pred_filename)
+
+    def evaluate(self):
+        comm.synchronize()
+        if comm.get_rank() > 0:
+            return
+        # Load the Cityscapes eval script *after* setting the required env var,
+        # since the script reads CITYSCAPES_DATASET into global variables at load time.
+        import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval
+
+        self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
+
+        # set some global states in cityscapes evaluation API, before evaluating
+        cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
+        cityscapes_eval.args.predictionWalk = None
+        cityscapes_eval.args.JSONOutput = False
+        cityscapes_eval.args.colorized = False
+
+        # These lines are adopted from
+        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa
+        gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
+        groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png"))
+        assert len(
+            groundTruthImgList
+        ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
+            cityscapes_eval.args.groundTruthSearch
+        )
+        predictionImgList = []
+        for gt in groundTruthImgList:
+            predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt))
+        results = cityscapes_eval.evaluateImgLists(
+            predictionImgList, groundTruthImgList, cityscapes_eval.args
+        )
+        ret = OrderedDict()
+        ret["sem_seg"] = {
+            "IoU": 100.0 * results["averageScoreClasses"],
+            "iIoU": 100.0 * results["averageScoreInstClasses"],
+            "IoU_sup": 100.0 * results["averageScoreCategories"],
+            "iIoU_sup": 100.0 * results["averageScoreInstCategories"],
+        }
+        self._working_dir.cleanup()
+        return ret
diff --git a/detectron2/evaluation/coco_evaluation.py b/detectron2/evaluation/coco_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe8142cda29613ce1cf78523e422bf598128f590
--- /dev/null
+++ b/detectron2/evaluation/coco_evaluation.py
@@ -0,0 +1,722 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import contextlib
+import copy
+import io
+import itertools
+import json
+import logging
+import numpy as np
+import os
+import pickle
+from collections import OrderedDict
+import pycocotools.mask as mask_util
+import torch
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from tabulate import tabulate
+
+import detectron2.utils.comm as comm
+from detectron2.config import CfgNode
+from detectron2.data import MetadataCatalog
+from detectron2.data.datasets.coco import convert_to_coco_json
+from detectron2.structures import Boxes, BoxMode, pairwise_iou
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import create_small_table
+
+from .evaluator import DatasetEvaluator
+
+try:
+    from detectron2.evaluation.fast_eval_api import COCOeval_opt
+except ImportError:
+    COCOeval_opt = COCOeval
+
+
+class COCOEvaluator(DatasetEvaluator):
+    """
+    Evaluate AR for object proposals, AP for instance detection/segmentation, AP
+    for keypoint detection outputs using COCO's metrics.
+    See http://cocodataset.org/#detection-eval and
+    http://cocodataset.org/#keypoints-eval to understand its metrics.
+    The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
+    the metric cannot be computed (e.g. due to no predictions made).
+
+    In addition to COCO, this evaluator is able to support any bounding box detection,
+    instance segmentation, or keypoint detection dataset.
+    """
+
+    def __init__(
+        self,
+        dataset_name,
+        tasks=None,
+        distributed=True,
+        output_dir=None,
+        *,
+        max_dets_per_image=None,
+        use_fast_impl=True,
+        kpt_oks_sigmas=(),
+        allow_cached_coco=True,
+    ):
+        """
+        Args:
+            dataset_name (str): name of the dataset to be evaluated.
+                It must have either the following corresponding metadata:
+
+                    "json_file": the path to the COCO format annotation
+
+                Or it must be in detectron2's standard dataset format
+                so it can be converted to COCO format automatically.
+            tasks (tuple[str]): tasks that can be evaluated under the given
+                configuration. A task is one of "bbox", "segm", "keypoints".
+                By default, will infer this automatically from predictions.
+            distributed (True): if True, will collect results from all ranks and run evaluation
+                in the main process.
+                Otherwise, will only evaluate the results in the current process.
+            output_dir (str): optional, an output directory to dump all
+                results predicted on the dataset. The dump contains two files:
+
+                1. "instances_predictions.pth" a file that can be loaded with `torch.load` and
+                   contains all the results in the format they are produced by the model.
+                2. "coco_instances_results.json" a json file in COCO's result format.
+            max_dets_per_image (int): limit on the maximum number of detections per image.
+                By default in COCO, this limit is to 100, but this can be customized
+                to be greater, as is needed in evaluation metrics AP fixed and AP pool
+                (see https://arxiv.org/pdf/2102.01066.pdf)
+                This doesn't affect keypoint evaluation.
+            use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP.
+                Although the results should be very close to the official implementation in COCO
+                API, it is still recommended to compute results with the official API for use in
+                papers. The faster implementation also uses more RAM.
+            kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS.
+                See http://cocodataset.org/#keypoints-eval
+                When empty, it will use the defaults in COCO.
+                Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
+            allow_cached_coco (bool): Whether to use cached coco json from previous validation
+                runs. You should set this to False if you need to use different validation data.
+                Defaults to True.
+        """
+        self._logger = logging.getLogger(__name__)
+        self._distributed = distributed
+        self._output_dir = output_dir
+
+        if use_fast_impl and (COCOeval_opt is COCOeval):
+            self._logger.info("Fast COCO eval is not built. Falling back to official COCO eval.")
+            use_fast_impl = False
+        self._use_fast_impl = use_fast_impl
+
+        # COCOeval requires the limit on the number of detections per image (maxDets) to be a list
+        # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the
+        # 3rd element (100) is used as the limit on the number of detections per image when
+        # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval,
+        # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults.
+        if max_dets_per_image is None:
+            max_dets_per_image = [1, 10, 100]
+        else:
+            max_dets_per_image = [1, 10, max_dets_per_image]
+        self._max_dets_per_image = max_dets_per_image
+
+        if tasks is not None and isinstance(tasks, CfgNode):
+            kpt_oks_sigmas = (
+                tasks.TEST.KEYPOINT_OKS_SIGMAS if not kpt_oks_sigmas else kpt_oks_sigmas
+            )
+            self._logger.warn(
+                "COCO Evaluator instantiated using config, this is deprecated behavior."
+                " Please pass in explicit arguments instead."
+            )
+            self._tasks = None  # Infering it from predictions should be better
+        else:
+            self._tasks = tasks
+
+        self._cpu_device = torch.device("cpu")
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        if not hasattr(self._metadata, "json_file"):
+            if output_dir is None:
+                raise ValueError(
+                    "output_dir must be provided to COCOEvaluator "
+                    "for datasets not in COCO format."
+                )
+            self._logger.info(f"Trying to convert '{dataset_name}' to COCO format ...")
+
+            cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json")
+            self._metadata.json_file = cache_path
+            convert_to_coco_json(dataset_name, cache_path, allow_cached=allow_cached_coco)
+
+        json_file = PathManager.get_local_path(self._metadata.json_file)
+        with contextlib.redirect_stdout(io.StringIO()):
+            self._coco_api = COCO(json_file)
+
+        # Test set json files do not contain annotations (evaluation must be
+        # performed using the COCO evaluation server).
+        self._do_evaluation = "annotations" in self._coco_api.dataset
+        if self._do_evaluation:
+            self._kpt_oks_sigmas = kpt_oks_sigmas
+
+    def reset(self):
+        self._predictions = []
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a COCO model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+        """
+        for input, output in zip(inputs, outputs):
+            prediction = {"image_id": input["image_id"]}
+
+            if "instances" in output:
+                instances = output["instances"].to(self._cpu_device)
+                prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
+            if "proposals" in output:
+                prediction["proposals"] = output["proposals"].to(self._cpu_device)
+            if len(prediction) > 1:
+                self._predictions.append(prediction)
+
+    def evaluate(self, img_ids=None):
+        """
+        Args:
+            img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
+        """
+        if self._distributed:
+            comm.synchronize()
+            predictions = comm.gather(self._predictions, dst=0)
+            predictions = list(itertools.chain(*predictions))
+
+            if not comm.is_main_process():
+                return {}
+        else:
+            predictions = self._predictions
+
+        if len(predictions) == 0:
+            self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
+            return {}
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(predictions, f)
+
+        self._results = OrderedDict()
+        if "proposals" in predictions[0]:
+            self._eval_box_proposals(predictions)
+        if "instances" in predictions[0]:
+            self._eval_predictions(predictions, img_ids=img_ids)
+        # Copy so the caller can do whatever with results
+        return copy.deepcopy(self._results)
+
+    def _tasks_from_predictions(self, predictions):
+        """
+        Get COCO API "tasks" (i.e. iou_type) from COCO-format predictions.
+        """
+        tasks = {"bbox"}
+        for pred in predictions:
+            if "segmentation" in pred:
+                tasks.add("segm")
+            if "keypoints" in pred:
+                tasks.add("keypoints")
+        return sorted(tasks)
+
+    def _eval_predictions(self, predictions, img_ids=None):
+        """
+        Evaluate predictions. Fill self._results with the metrics of the tasks.
+        """
+        self._logger.info("Preparing results for COCO format ...")
+        coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
+        tasks = self._tasks or self._tasks_from_predictions(coco_results)
+
+        # unmap the category ids for COCO
+        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+            dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
+            all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
+            num_classes = len(all_contiguous_ids)
+            assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
+
+            reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
+            for result in coco_results:
+                category_id = result["category_id"]
+                assert category_id < num_classes, (
+                    f"A prediction has class={category_id}, "
+                    f"but the dataset only has {num_classes} classes and "
+                    f"predicted class id should be in [0, {num_classes - 1}]."
+                )
+                result["category_id"] = reverse_id_mapping[category_id]
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
+            self._logger.info("Saving results to {}".format(file_path))
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(coco_results))
+                f.flush()
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info(
+            "Evaluating predictions with {} COCO API...".format(
+                "unofficial" if self._use_fast_impl else "official"
+            )
+        )
+        for task in sorted(tasks):
+            assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
+            coco_eval = (
+                _evaluate_predictions_on_coco(
+                    self._coco_api,
+                    coco_results,
+                    task,
+                    kpt_oks_sigmas=self._kpt_oks_sigmas,
+                    cocoeval_fn=COCOeval_opt if self._use_fast_impl else COCOeval,
+                    img_ids=img_ids,
+                    max_dets_per_image=self._max_dets_per_image,
+                )
+                if len(coco_results) > 0
+                else None  # cocoapi does not handle empty results very well
+            )
+
+            res = self._derive_coco_results(
+                coco_eval, task, class_names=self._metadata.get("thing_classes")
+            )
+            self._results[task] = res
+
+    def _eval_box_proposals(self, predictions):
+        """
+        Evaluate the box proposals in predictions.
+        Fill self._results with the metrics for "box_proposals" task.
+        """
+        if self._output_dir:
+            # Saving generated box proposals to file.
+            # Predicted box_proposals are in XYXY_ABS mode.
+            bbox_mode = BoxMode.XYXY_ABS.value
+            ids, boxes, objectness_logits = [], [], []
+            for prediction in predictions:
+                ids.append(prediction["image_id"])
+                boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
+                objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
+
+            proposal_data = {
+                "boxes": boxes,
+                "objectness_logits": objectness_logits,
+                "ids": ids,
+                "bbox_mode": bbox_mode,
+            }
+            with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
+                pickle.dump(proposal_data, f)
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating bbox proposals ...")
+        res = {}
+        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
+        for limit in [100, 1000]:
+            for area, suffix in areas.items():
+                stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit)
+                key = "AR{}@{:d}".format(suffix, limit)
+                res[key] = float(stats["ar"].item() * 100)
+        self._logger.info("Proposal metrics: \n" + create_small_table(res))
+        self._results["box_proposals"] = res
+
+    def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
+        """
+        Derive the desired score numbers from summarized COCOeval.
+
+        Args:
+            coco_eval (None or COCOEval): None represents no predictions from model.
+            iou_type (str):
+            class_names (None or list[str]): if provided, will use it to predict
+                per-category AP.
+
+        Returns:
+            a dict of {metric name: score}
+        """
+
+        metrics = {
+            "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
+            "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
+            "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
+        }[iou_type]
+
+        if coco_eval is None:
+            self._logger.warn("No predictions from the model!")
+            return {metric: float("nan") for metric in metrics}
+
+        # the standard metrics
+        results = {
+            metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
+            for idx, metric in enumerate(metrics)
+        }
+        self._logger.info(
+            "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
+        )
+        if not np.isfinite(sum(results.values())):
+            self._logger.info("Some metrics cannot be computed and is shown as NaN.")
+
+        if class_names is None or len(class_names) <= 1:
+            return results
+        # Compute per-category AP
+        # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
+        precisions = coco_eval.eval["precision"]
+        # precision has dims (iou, recall, cls, area range, max dets)
+        assert len(class_names) == precisions.shape[2]
+
+        results_per_category = []
+        for idx, name in enumerate(class_names):
+            # area range index 0: all area ranges
+            # max dets index -1: typically 100 per image
+            precision = precisions[:, :, idx, 0, -1]
+            precision = precision[precision > -1]
+            ap = np.mean(precision) if precision.size else float("nan")
+            results_per_category.append(("{}".format(name), float(ap * 100)))
+
+        # tabulate it
+        N_COLS = min(6, len(results_per_category) * 2)
+        results_flatten = list(itertools.chain(*results_per_category))
+        results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
+        table = tabulate(
+            results_2d,
+            tablefmt="pipe",
+            floatfmt=".3f",
+            headers=["category", "AP"] * (N_COLS // 2),
+            numalign="left",
+        )
+        self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
+
+        results.update({"AP-" + name: ap for name, ap in results_per_category})
+        return results
+
+
+def instances_to_coco_json(instances, img_id):
+    """
+    Dump an "Instances" object to a COCO-format json that's used for evaluation.
+
+    Args:
+        instances (Instances):
+        img_id (int): the image id
+
+    Returns:
+        list[dict]: list of json annotations in COCO format.
+    """
+    num_instance = len(instances)
+    if num_instance == 0:
+        return []
+
+    boxes = instances.pred_boxes.tensor.numpy()
+    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    boxes = boxes.tolist()
+    scores = instances.scores.tolist()
+    classes = instances.pred_classes.tolist()
+
+    has_mask = instances.has("pred_masks")
+    if has_mask:
+        # use RLE to encode the masks, because they are too large and takes memory
+        # since this evaluator stores outputs of the entire dataset
+        rles = [
+            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
+            for mask in instances.pred_masks
+        ]
+        for rle in rles:
+            # "counts" is an array encoded by mask_util as a byte-stream. Python3's
+            # json writer which always produces strings cannot serialize a bytestream
+            # unless you decode it. Thankfully, utf-8 works out (which is also what
+            # the pycocotools/_mask.pyx does).
+            rle["counts"] = rle["counts"].decode("utf-8")
+
+    has_keypoints = instances.has("pred_keypoints")
+    if has_keypoints:
+        keypoints = instances.pred_keypoints
+
+    results = []
+    for k in range(num_instance):
+        result = {
+            "image_id": img_id,
+            "category_id": classes[k],
+            "bbox": boxes[k],
+            "score": scores[k],
+        }
+        if has_mask:
+            result["segmentation"] = rles[k]
+        if has_keypoints:
+            # In COCO annotations,
+            # keypoints coordinates are pixel indices.
+            # However our predictions are floating point coordinates.
+            # Therefore we subtract 0.5 to be consistent with the annotation format.
+            # This is the inverse of data loading logic in `datasets/coco.py`.
+            keypoints[k][:, :2] -= 0.5
+            result["keypoints"] = keypoints[k].flatten().tolist()
+        results.append(result)
+    return results
+
+
+# inspired from Detectron:
+# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
+def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None):
+    """
+    Evaluate detection proposal recall metrics. This function is a much
+    faster alternative to the official COCO API recall evaluation code. However,
+    it produces slightly different results.
+    """
+    # Record max overlap value for each gt box
+    # Return vector of overlap values
+    areas = {
+        "all": 0,
+        "small": 1,
+        "medium": 2,
+        "large": 3,
+        "96-128": 4,
+        "128-256": 5,
+        "256-512": 6,
+        "512-inf": 7,
+    }
+    area_ranges = [
+        [0**2, 1e5**2],  # all
+        [0**2, 32**2],  # small
+        [32**2, 96**2],  # medium
+        [96**2, 1e5**2],  # large
+        [96**2, 128**2],  # 96-128
+        [128**2, 256**2],  # 128-256
+        [256**2, 512**2],  # 256-512
+        [512**2, 1e5**2],
+    ]  # 512-inf
+    assert area in areas, "Unknown area range: {}".format(area)
+    area_range = area_ranges[areas[area]]
+    gt_overlaps = []
+    num_pos = 0
+
+    for prediction_dict in dataset_predictions:
+        predictions = prediction_dict["proposals"]
+
+        # sort predictions in descending order
+        # TODO maybe remove this and make it explicit in the documentation
+        inds = predictions.objectness_logits.sort(descending=True)[1]
+        predictions = predictions[inds]
+
+        ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
+        anno = coco_api.loadAnns(ann_ids)
+        gt_boxes = [
+            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+            for obj in anno
+            if obj["iscrowd"] == 0
+        ]
+        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
+        gt_boxes = Boxes(gt_boxes)
+        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])
+
+        if len(gt_boxes) == 0 or len(predictions) == 0:
+            continue
+
+        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
+        gt_boxes = gt_boxes[valid_gt_inds]
+
+        num_pos += len(gt_boxes)
+
+        if len(gt_boxes) == 0:
+            continue
+
+        if limit is not None and len(predictions) > limit:
+            predictions = predictions[:limit]
+
+        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
+
+        _gt_overlaps = torch.zeros(len(gt_boxes))
+        for j in range(min(len(predictions), len(gt_boxes))):
+            # find which proposal box maximally covers each gt box
+            # and get the iou amount of coverage for each gt box
+            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+            # find which gt box is 'best' covered (i.e. 'best' = most iou)
+            gt_ovr, gt_ind = max_overlaps.max(dim=0)
+            assert gt_ovr >= 0
+            # find the proposal box that covers the best covered gt box
+            box_ind = argmax_overlaps[gt_ind]
+            # record the iou coverage of this gt box
+            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+            assert _gt_overlaps[j] == gt_ovr
+            # mark the proposal box and the gt box as used
+            overlaps[box_ind, :] = -1
+            overlaps[:, gt_ind] = -1
+
+        # append recorded iou coverage level
+        gt_overlaps.append(_gt_overlaps)
+    gt_overlaps = (
+        torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
+    )
+    gt_overlaps, _ = torch.sort(gt_overlaps)
+
+    if thresholds is None:
+        step = 0.05
+        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
+    recalls = torch.zeros_like(thresholds)
+    # compute recall for each iou threshold
+    for i, t in enumerate(thresholds):
+        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
+    # ar = 2 * np.trapz(recalls, thresholds)
+    ar = recalls.mean()
+    return {
+        "ar": ar,
+        "recalls": recalls,
+        "thresholds": thresholds,
+        "gt_overlaps": gt_overlaps,
+        "num_pos": num_pos,
+    }
+
+
+def _evaluate_predictions_on_coco(
+    coco_gt,
+    coco_results,
+    iou_type,
+    kpt_oks_sigmas=None,
+    cocoeval_fn=COCOeval_opt,
+    img_ids=None,
+    max_dets_per_image=None,
+):
+    """
+    Evaluate the coco results using COCOEval API.
+    """
+    assert len(coco_results) > 0
+
+    if iou_type == "segm":
+        coco_results = copy.deepcopy(coco_results)
+        # When evaluating mask AP, if the results contain bbox, cocoapi will
+        # use the box area as the area of the instance, instead of the mask area.
+        # This leads to a different definition of small/medium/large.
+        # We remove the bbox field to let mask AP use mask area.
+        for c in coco_results:
+            c.pop("bbox", None)
+
+    coco_dt = coco_gt.loadRes(coco_results)
+    coco_eval = cocoeval_fn(coco_gt, coco_dt, iou_type)
+    # For COCO, the default max_dets_per_image is [1, 10, 100].
+    if max_dets_per_image is None:
+        max_dets_per_image = [1, 10, 100]  # Default from COCOEval
+    else:
+        assert (
+            len(max_dets_per_image) >= 3
+        ), "COCOeval requires maxDets (and max_dets_per_image) to have length at least 3"
+        # In the case that user supplies a custom input for max_dets_per_image,
+        # apply COCOevalMaxDets to evaluate AP with the custom input.
+        if max_dets_per_image[2] != 100:
+            coco_eval = COCOevalMaxDets(coco_gt, coco_dt, iou_type)
+    if iou_type != "keypoints":
+        coco_eval.params.maxDets = max_dets_per_image
+
+    if img_ids is not None:
+        coco_eval.params.imgIds = img_ids
+
+    if iou_type == "keypoints":
+        # Use the COCO default keypoint OKS sigmas unless overrides are specified
+        if kpt_oks_sigmas:
+            assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "pycocotools is too old!"
+            coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas)
+        # COCOAPI requires every detection and every gt to have keypoints, so
+        # we just take the first entry from both
+        num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3
+        num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3
+        num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas)
+        assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, (
+            f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. "
+            f"Ground truth contains {num_keypoints_gt} keypoints. "
+            f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. "
+            "They have to agree with each other. For meaning of OKS, please refer to "
+            "http://cocodataset.org/#keypoints-eval."
+        )
+
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+
+    return coco_eval
+
+
+class COCOevalMaxDets(COCOeval):
+    """
+    Modified version of COCOeval for evaluating AP with a custom
+    maxDets (by default for COCO, maxDets is 100)
+    """
+
+    def summarize(self):
+        """
+        Compute and display summary metrics for evaluation results given
+        a custom value for  max_dets_per_image
+        """
+
+        def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
+            p = self.params
+            iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
+            titleStr = "Average Precision" if ap == 1 else "Average Recall"
+            typeStr = "(AP)" if ap == 1 else "(AR)"
+            iouStr = (
+                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
+                if iouThr is None
+                else "{:0.2f}".format(iouThr)
+            )
+
+            aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
+            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+            if ap == 1:
+                # dimension of precision: [TxRxKxAxM]
+                s = self.eval["precision"]
+                # IoU
+                if iouThr is not None:
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                s = s[:, :, :, aind, mind]
+            else:
+                # dimension of recall: [TxKxAxM]
+                s = self.eval["recall"]
+                if iouThr is not None:
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                s = s[:, :, aind, mind]
+            if len(s[s > -1]) == 0:
+                mean_s = -1
+            else:
+                mean_s = np.mean(s[s > -1])
+            print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
+            return mean_s
+
+        def _summarizeDets():
+            stats = np.zeros((12,))
+            # Evaluate AP using the custom limit on maximum detections per image
+            stats[0] = _summarize(1, maxDets=self.params.maxDets[2])
+            stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
+            stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
+            stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2])
+            stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2])
+            stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2])
+            stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
+            stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
+            stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2])
+            stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2])
+            stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2])
+            return stats
+
+        def _summarizeKps():
+            stats = np.zeros((10,))
+            stats[0] = _summarize(1, maxDets=20)
+            stats[1] = _summarize(1, maxDets=20, iouThr=0.5)
+            stats[2] = _summarize(1, maxDets=20, iouThr=0.75)
+            stats[3] = _summarize(1, maxDets=20, areaRng="medium")
+            stats[4] = _summarize(1, maxDets=20, areaRng="large")
+            stats[5] = _summarize(0, maxDets=20)
+            stats[6] = _summarize(0, maxDets=20, iouThr=0.5)
+            stats[7] = _summarize(0, maxDets=20, iouThr=0.75)
+            stats[8] = _summarize(0, maxDets=20, areaRng="medium")
+            stats[9] = _summarize(0, maxDets=20, areaRng="large")
+            return stats
+
+        if not self.eval:
+            raise Exception("Please run accumulate() first")
+        iouType = self.params.iouType
+        if iouType == "segm" or iouType == "bbox":
+            summarize = _summarizeDets
+        elif iouType == "keypoints":
+            summarize = _summarizeKps
+        self.stats = summarize()
+
+    def __str__(self):
+        self.summarize()
diff --git a/detectron2/evaluation/evaluator.py b/detectron2/evaluation/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c0e33e0269dd90fcff82a636f476791868e8dd7
--- /dev/null
+++ b/detectron2/evaluation/evaluator.py
@@ -0,0 +1,233 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import datetime
+import logging
+import time
+from collections import OrderedDict, abc
+from contextlib import ExitStack, contextmanager
+from typing import List, Union
+import torch
+from torch import nn
+
+from detectron2.utils.comm import get_world_size, is_main_process
+from detectron2.utils.logger import log_every_n_seconds
+
+
+class DatasetEvaluator:
+    """
+    Base class for a dataset evaluator.
+
+    The function :func:`inference_on_dataset` runs the model over
+    all samples in the dataset, and have a DatasetEvaluator to process the inputs/outputs.
+
+    This class will accumulate information of the inputs/outputs (by :meth:`process`),
+    and produce evaluation results in the end (by :meth:`evaluate`).
+    """
+
+    def reset(self):
+        """
+        Preparation for a new round of evaluation.
+        Should be called before starting a round of evaluation.
+        """
+        pass
+
+    def process(self, inputs, outputs):
+        """
+        Process the pair of inputs and outputs.
+        If they contain batches, the pairs can be consumed one-by-one using `zip`:
+
+        .. code-block:: python
+
+            for input_, output in zip(inputs, outputs):
+                # do evaluation on single input/output pair
+                ...
+
+        Args:
+            inputs (list): the inputs that's used to call the model.
+            outputs (list): the return value of `model(inputs)`
+        """
+        pass
+
+    def evaluate(self):
+        """
+        Evaluate/summarize the performance, after processing all input/output pairs.
+
+        Returns:
+            dict:
+                A new evaluator class can return a dict of arbitrary format
+                as long as the user can process the results.
+                In our train_net.py, we expect the following format:
+
+                * key: the name of the task (e.g., bbox)
+                * value: a dict of {metric name: score}, e.g.: {"AP50": 80}
+        """
+        pass
+
+
+class DatasetEvaluators(DatasetEvaluator):
+    """
+    Wrapper class to combine multiple :class:`DatasetEvaluator` instances.
+
+    This class dispatches every evaluation call to
+    all of its :class:`DatasetEvaluator`.
+    """
+
+    def __init__(self, evaluators):
+        """
+        Args:
+            evaluators (list): the evaluators to combine.
+        """
+        super().__init__()
+        self._evaluators = evaluators
+
+    def reset(self):
+        for evaluator in self._evaluators:
+            evaluator.reset()
+
+    def process(self, inputs, outputs):
+        for evaluator in self._evaluators:
+            evaluator.process(inputs, outputs)
+
+    def evaluate(self):
+        results = OrderedDict()
+        for evaluator in self._evaluators:
+            result = evaluator.evaluate()
+            if is_main_process() and result is not None:
+                for k, v in result.items():
+                    assert (
+                        k not in results
+                    ), "Different evaluators produce results with the same key {}".format(k)
+                    results[k] = v
+        return results
+
+
+def inference_on_dataset(
+    model,
+    data_loader,
+    evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None],
+    callbacks=None,
+):
+    """
+    Run model on the data_loader and evaluate the metrics with evaluator.
+    Also benchmark the inference speed of `model.__call__` accurately.
+    The model will be used in eval mode.
+
+    Args:
+        model (callable): a callable which takes an object from
+            `data_loader` and returns some outputs.
+
+            If it's an nn.Module, it will be temporarily set to `eval` mode.
+            If you wish to evaluate a model in `training` mode instead, you can
+            wrap the given model and override its behavior of `.eval()` and `.train()`.
+        data_loader: an iterable object with a length.
+            The elements it generates will be the inputs to the model.
+        evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark,
+            but don't want to do any evaluation.
+        callbacks (dict of callables): a dictionary of callback functions which can be
+            called at each stage of inference.
+
+    Returns:
+        The return value of `evaluator.evaluate()`
+    """
+    num_devices = get_world_size()
+    logger = logging.getLogger(__name__)
+    logger.info("Start inference on {} batches".format(len(data_loader)))
+
+    total = len(data_loader)  # inference data loader must have a fixed length
+    if evaluator is None:
+        # create a no-op evaluator
+        evaluator = DatasetEvaluators([])
+    if isinstance(evaluator, abc.MutableSequence):
+        evaluator = DatasetEvaluators(evaluator)
+    evaluator.reset()
+
+    num_warmup = min(5, total - 1)
+    start_time = time.perf_counter()
+    total_data_time = 0
+    total_compute_time = 0
+    total_eval_time = 0
+    with ExitStack() as stack:
+        if isinstance(model, nn.Module):
+            stack.enter_context(inference_context(model))
+        stack.enter_context(torch.no_grad())
+
+        start_data_time = time.perf_counter()
+        dict.get(callbacks or {}, "on_start", lambda: None)()
+        for idx, inputs in enumerate(data_loader):
+            total_data_time += time.perf_counter() - start_data_time
+            if idx == num_warmup:
+                start_time = time.perf_counter()
+                total_data_time = 0
+                total_compute_time = 0
+                total_eval_time = 0
+
+            start_compute_time = time.perf_counter()
+            dict.get(callbacks or {}, "before_inference", lambda: None)()
+            outputs = model(inputs)
+            dict.get(callbacks or {}, "after_inference", lambda: None)()
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            total_compute_time += time.perf_counter() - start_compute_time
+
+            start_eval_time = time.perf_counter()
+            evaluator.process(inputs, outputs)
+            total_eval_time += time.perf_counter() - start_eval_time
+
+            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
+            data_seconds_per_iter = total_data_time / iters_after_start
+            compute_seconds_per_iter = total_compute_time / iters_after_start
+            eval_seconds_per_iter = total_eval_time / iters_after_start
+            total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
+            if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
+                eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1)))
+                log_every_n_seconds(
+                    logging.INFO,
+                    (
+                        f"Inference done {idx + 1}/{total}. "
+                        f"Dataloading: {data_seconds_per_iter:.4f} s/iter. "
+                        f"Inference: {compute_seconds_per_iter:.4f} s/iter. "
+                        f"Eval: {eval_seconds_per_iter:.4f} s/iter. "
+                        f"Total: {total_seconds_per_iter:.4f} s/iter. "
+                        f"ETA={eta}"
+                    ),
+                    n=5,
+                )
+            start_data_time = time.perf_counter()
+        dict.get(callbacks or {}, "on_end", lambda: None)()
+
+    # Measure the time only for this worker (before the synchronization barrier)
+    total_time = time.perf_counter() - start_time
+    total_time_str = str(datetime.timedelta(seconds=total_time))
+    # NOTE this format is parsed by grep
+    logger.info(
+        "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
+            total_time_str, total_time / (total - num_warmup), num_devices
+        )
+    )
+    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
+    logger.info(
+        "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
+            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
+        )
+    )
+
+    results = evaluator.evaluate()
+    # An evaluator may return None when not in main process.
+    # Replace it by an empty dict instead to make it easier for downstream code to handle
+    if results is None:
+        results = {}
+    return results
+
+
+@contextmanager
+def inference_context(model):
+    """
+    A context where the model is temporarily changed to eval mode,
+    and restored to previous mode afterwards.
+
+    Args:
+        model: a torch Module
+    """
+    training_mode = model.training
+    model.eval()
+    yield
+    model.train(training_mode)
diff --git a/detectron2/evaluation/fast_eval_api.py b/detectron2/evaluation/fast_eval_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb202bd5efa3ec3d366027b1debffc269ae8b17
--- /dev/null
+++ b/detectron2/evaluation/fast_eval_api.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import numpy as np
+import time
+from pycocotools.cocoeval import COCOeval
+
+from detectron2 import _C
+
+logger = logging.getLogger(__name__)
+
+
+class COCOeval_opt(COCOeval):
+    """
+    This is a slightly modified version of the original COCO API, where the functions evaluateImg()
+    and accumulate() are implemented in C++ to speedup evaluation
+    """
+
+    def evaluate(self):
+        """
+        Run per image evaluation on given images and store results in self.evalImgs_cpp, a
+        datastructure that isn't readable from Python but is used by a c++ implementation of
+        accumulate().  Unlike the original COCO PythonAPI, we don't populate the datastructure
+        self.evalImgs because this datastructure is a computational bottleneck.
+        :return: None
+        """
+        tic = time.time()
+
+        p = self.params
+        # add backward compatibility if useSegm is specified in params
+        if p.useSegm is not None:
+            p.iouType = "segm" if p.useSegm == 1 else "bbox"
+        logger.info("Evaluate annotation type *{}*".format(p.iouType))
+        p.imgIds = list(np.unique(p.imgIds))
+        if p.useCats:
+            p.catIds = list(np.unique(p.catIds))
+        p.maxDets = sorted(p.maxDets)
+        self.params = p
+
+        self._prepare()  # bottleneck
+
+        # loop through images, area range, max detection number
+        catIds = p.catIds if p.useCats else [-1]
+
+        if p.iouType == "segm" or p.iouType == "bbox":
+            computeIoU = self.computeIoU
+        elif p.iouType == "keypoints":
+            computeIoU = self.computeOks
+        self.ious = {
+            (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds
+        }  # bottleneck
+
+        maxDet = p.maxDets[-1]
+
+        # <<<< Beginning of code differences with original COCO API
+        def convert_instances_to_cpp(instances, is_det=False):
+            # Convert annotations for a list of instances in an image to a format that's fast
+            # to access in C++
+            instances_cpp = []
+            for instance in instances:
+                instance_cpp = _C.InstanceAnnotation(
+                    int(instance["id"]),
+                    instance["score"] if is_det else instance.get("score", 0.0),
+                    instance["area"],
+                    bool(instance.get("iscrowd", 0)),
+                    bool(instance.get("ignore", 0)),
+                )
+                instances_cpp.append(instance_cpp)
+            return instances_cpp
+
+        # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
+        ground_truth_instances = [
+            [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
+            for imgId in p.imgIds
+        ]
+        detected_instances = [
+            [convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) for catId in p.catIds]
+            for imgId in p.imgIds
+        ]
+        ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
+
+        if not p.useCats:
+            # For each image, flatten per-category lists into a single list
+            ground_truth_instances = [[[o for c in i for o in c]] for i in ground_truth_instances]
+            detected_instances = [[[o for c in i for o in c]] for i in detected_instances]
+
+        # Call C++ implementation of self.evaluateImgs()
+        self._evalImgs_cpp = _C.COCOevalEvaluateImages(
+            p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances
+        )
+        self._evalImgs = None
+
+        self._paramsEval = copy.deepcopy(self.params)
+        toc = time.time()
+        logger.info("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic))
+        # >>>> End of code differences with original COCO API
+
+    def accumulate(self):
+        """
+        Accumulate per image evaluation results and store the result in self.eval.  Does not
+        support changing parameter settings from those used by self.evaluate()
+        """
+        logger.info("Accumulating evaluation results...")
+        tic = time.time()
+        assert hasattr(
+            self, "_evalImgs_cpp"
+        ), "evaluate() must be called before accmulate() is called."
+
+        self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)
+
+        # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
+        self.eval["recall"] = np.array(self.eval["recall"]).reshape(
+            self.eval["counts"][:1] + self.eval["counts"][2:]
+        )
+
+        # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
+        # num_area_ranges X num_max_detections
+        self.eval["precision"] = np.array(self.eval["precision"]).reshape(self.eval["counts"])
+        self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
+        toc = time.time()
+        logger.info("COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic))
diff --git a/detectron2/evaluation/lvis_evaluation.py b/detectron2/evaluation/lvis_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cc854a157dc469be99a9be1bb7d570068adc891
--- /dev/null
+++ b/detectron2/evaluation/lvis_evaluation.py
@@ -0,0 +1,380 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import itertools
+import json
+import logging
+import os
+import pickle
+from collections import OrderedDict
+import torch
+
+import detectron2.utils.comm as comm
+from detectron2.config import CfgNode
+from detectron2.data import MetadataCatalog
+from detectron2.structures import Boxes, BoxMode, pairwise_iou
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import create_small_table
+
+from .coco_evaluation import instances_to_coco_json
+from .evaluator import DatasetEvaluator
+
+
+class LVISEvaluator(DatasetEvaluator):
+    """
+    Evaluate object proposal and instance detection/segmentation outputs using
+    LVIS's metrics and evaluation API.
+    """
+
+    def __init__(
+        self,
+        dataset_name,
+        tasks=None,
+        distributed=True,
+        output_dir=None,
+        *,
+        max_dets_per_image=None,
+    ):
+        """
+        Args:
+            dataset_name (str): name of the dataset to be evaluated.
+                It must have the following corresponding metadata:
+                "json_file": the path to the LVIS format annotation
+            tasks (tuple[str]): tasks that can be evaluated under the given
+                configuration. A task is one of "bbox", "segm".
+                By default, will infer this automatically from predictions.
+            distributed (True): if True, will collect results from all ranks for evaluation.
+                Otherwise, will evaluate the results in the current process.
+            output_dir (str): optional, an output directory to dump results.
+            max_dets_per_image (None or int): limit on maximum detections per image in evaluating AP
+                This limit, by default of the LVIS dataset, is 300.
+        """
+        from lvis import LVIS
+
+        self._logger = logging.getLogger(__name__)
+
+        if tasks is not None and isinstance(tasks, CfgNode):
+            self._logger.warn(
+                "COCO Evaluator instantiated using config, this is deprecated behavior."
+                " Please pass in explicit arguments instead."
+            )
+            self._tasks = None  # Infering it from predictions should be better
+        else:
+            self._tasks = tasks
+
+        self._distributed = distributed
+        self._output_dir = output_dir
+        self._max_dets_per_image = max_dets_per_image
+
+        self._cpu_device = torch.device("cpu")
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        json_file = PathManager.get_local_path(self._metadata.json_file)
+        self._lvis_api = LVIS(json_file)
+        # Test set json files do not contain annotations (evaluation must be
+        # performed using the LVIS evaluation server).
+        self._do_evaluation = len(self._lvis_api.get_ann_ids()) > 0
+
+    def reset(self):
+        self._predictions = []
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a LVIS model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a LVIS model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+        """
+        for input, output in zip(inputs, outputs):
+            prediction = {"image_id": input["image_id"]}
+
+            if "instances" in output:
+                instances = output["instances"].to(self._cpu_device)
+                prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
+            if "proposals" in output:
+                prediction["proposals"] = output["proposals"].to(self._cpu_device)
+            self._predictions.append(prediction)
+
+    def evaluate(self):
+        if self._distributed:
+            comm.synchronize()
+            predictions = comm.gather(self._predictions, dst=0)
+            predictions = list(itertools.chain(*predictions))
+
+            if not comm.is_main_process():
+                return
+        else:
+            predictions = self._predictions
+
+        if len(predictions) == 0:
+            self._logger.warning("[LVISEvaluator] Did not receive valid predictions.")
+            return {}
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(predictions, f)
+
+        self._results = OrderedDict()
+        if "proposals" in predictions[0]:
+            self._eval_box_proposals(predictions)
+        if "instances" in predictions[0]:
+            self._eval_predictions(predictions)
+        # Copy so the caller can do whatever with results
+        return copy.deepcopy(self._results)
+
+    def _tasks_from_predictions(self, predictions):
+        for pred in predictions:
+            if "segmentation" in pred:
+                return ("bbox", "segm")
+        return ("bbox",)
+
+    def _eval_predictions(self, predictions):
+        """
+        Evaluate predictions. Fill self._results with the metrics of the tasks.
+
+        Args:
+            predictions (list[dict]): list of outputs from the model
+        """
+        self._logger.info("Preparing results in the LVIS format ...")
+        lvis_results = list(itertools.chain(*[x["instances"] for x in predictions]))
+        tasks = self._tasks or self._tasks_from_predictions(lvis_results)
+
+        # LVIS evaluator can be used to evaluate results for COCO dataset categories.
+        # In this case `_metadata` variable will have a field with COCO-specific category mapping.
+        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+            reverse_id_mapping = {
+                v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+            }
+            for result in lvis_results:
+                result["category_id"] = reverse_id_mapping[result["category_id"]]
+        else:
+            # unmap the category ids for LVIS (from 0-indexed to 1-indexed)
+            for result in lvis_results:
+                result["category_id"] += 1
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "lvis_instances_results.json")
+            self._logger.info("Saving results to {}".format(file_path))
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(lvis_results))
+                f.flush()
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating predictions ...")
+        for task in sorted(tasks):
+            res = _evaluate_predictions_on_lvis(
+                self._lvis_api,
+                lvis_results,
+                task,
+                max_dets_per_image=self._max_dets_per_image,
+                class_names=self._metadata.get("thing_classes"),
+            )
+            self._results[task] = res
+
+    def _eval_box_proposals(self, predictions):
+        """
+        Evaluate the box proposals in predictions.
+        Fill self._results with the metrics for "box_proposals" task.
+        """
+        if self._output_dir:
+            # Saving generated box proposals to file.
+            # Predicted box_proposals are in XYXY_ABS mode.
+            bbox_mode = BoxMode.XYXY_ABS.value
+            ids, boxes, objectness_logits = [], [], []
+            for prediction in predictions:
+                ids.append(prediction["image_id"])
+                boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
+                objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
+
+            proposal_data = {
+                "boxes": boxes,
+                "objectness_logits": objectness_logits,
+                "ids": ids,
+                "bbox_mode": bbox_mode,
+            }
+            with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
+                pickle.dump(proposal_data, f)
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating bbox proposals ...")
+        res = {}
+        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
+        for limit in [100, 1000]:
+            for area, suffix in areas.items():
+                stats = _evaluate_box_proposals(predictions, self._lvis_api, area=area, limit=limit)
+                key = "AR{}@{:d}".format(suffix, limit)
+                res[key] = float(stats["ar"].item() * 100)
+        self._logger.info("Proposal metrics: \n" + create_small_table(res))
+        self._results["box_proposals"] = res
+
+
+# inspired from Detectron:
+# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
+def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None, area="all", limit=None):
+    """
+    Evaluate detection proposal recall metrics. This function is a much
+    faster alternative to the official LVIS API recall evaluation code. However,
+    it produces slightly different results.
+    """
+    # Record max overlap value for each gt box
+    # Return vector of overlap values
+    areas = {
+        "all": 0,
+        "small": 1,
+        "medium": 2,
+        "large": 3,
+        "96-128": 4,
+        "128-256": 5,
+        "256-512": 6,
+        "512-inf": 7,
+    }
+    area_ranges = [
+        [0**2, 1e5**2],  # all
+        [0**2, 32**2],  # small
+        [32**2, 96**2],  # medium
+        [96**2, 1e5**2],  # large
+        [96**2, 128**2],  # 96-128
+        [128**2, 256**2],  # 128-256
+        [256**2, 512**2],  # 256-512
+        [512**2, 1e5**2],
+    ]  # 512-inf
+    assert area in areas, "Unknown area range: {}".format(area)
+    area_range = area_ranges[areas[area]]
+    gt_overlaps = []
+    num_pos = 0
+
+    for prediction_dict in dataset_predictions:
+        predictions = prediction_dict["proposals"]
+
+        # sort predictions in descending order
+        # TODO maybe remove this and make it explicit in the documentation
+        inds = predictions.objectness_logits.sort(descending=True)[1]
+        predictions = predictions[inds]
+
+        ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]])
+        anno = lvis_api.load_anns(ann_ids)
+        gt_boxes = [
+            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno
+        ]
+        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
+        gt_boxes = Boxes(gt_boxes)
+        gt_areas = torch.as_tensor([obj["area"] for obj in anno])
+
+        if len(gt_boxes) == 0 or len(predictions) == 0:
+            continue
+
+        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
+        gt_boxes = gt_boxes[valid_gt_inds]
+
+        num_pos += len(gt_boxes)
+
+        if len(gt_boxes) == 0:
+            continue
+
+        if limit is not None and len(predictions) > limit:
+            predictions = predictions[:limit]
+
+        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
+
+        _gt_overlaps = torch.zeros(len(gt_boxes))
+        for j in range(min(len(predictions), len(gt_boxes))):
+            # find which proposal box maximally covers each gt box
+            # and get the iou amount of coverage for each gt box
+            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+            # find which gt box is 'best' covered (i.e. 'best' = most iou)
+            gt_ovr, gt_ind = max_overlaps.max(dim=0)
+            assert gt_ovr >= 0
+            # find the proposal box that covers the best covered gt box
+            box_ind = argmax_overlaps[gt_ind]
+            # record the iou coverage of this gt box
+            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+            assert _gt_overlaps[j] == gt_ovr
+            # mark the proposal box and the gt box as used
+            overlaps[box_ind, :] = -1
+            overlaps[:, gt_ind] = -1
+
+        # append recorded iou coverage level
+        gt_overlaps.append(_gt_overlaps)
+    gt_overlaps = (
+        torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
+    )
+    gt_overlaps, _ = torch.sort(gt_overlaps)
+
+    if thresholds is None:
+        step = 0.05
+        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
+    recalls = torch.zeros_like(thresholds)
+    # compute recall for each iou threshold
+    for i, t in enumerate(thresholds):
+        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
+    # ar = 2 * np.trapz(recalls, thresholds)
+    ar = recalls.mean()
+    return {
+        "ar": ar,
+        "recalls": recalls,
+        "thresholds": thresholds,
+        "gt_overlaps": gt_overlaps,
+        "num_pos": num_pos,
+    }
+
+
+def _evaluate_predictions_on_lvis(
+    lvis_gt, lvis_results, iou_type, max_dets_per_image=None, class_names=None
+):
+    """
+    Args:
+        iou_type (str):
+        max_dets_per_image (None or int): limit on maximum detections per image in evaluating AP
+            This limit, by default of the LVIS dataset, is 300.
+        class_names (None or list[str]): if provided, will use it to predict
+            per-category AP.
+
+    Returns:
+        a dict of {metric name: score}
+    """
+    metrics = {
+        "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
+        "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
+    }[iou_type]
+
+    logger = logging.getLogger(__name__)
+
+    if len(lvis_results) == 0:  # TODO: check if needed
+        logger.warn("No predictions from the model!")
+        return {metric: float("nan") for metric in metrics}
+
+    if iou_type == "segm":
+        lvis_results = copy.deepcopy(lvis_results)
+        # When evaluating mask AP, if the results contain bbox, LVIS API will
+        # use the box area as the area of the instance, instead of the mask area.
+        # This leads to a different definition of small/medium/large.
+        # We remove the bbox field to let mask AP use mask area.
+        for c in lvis_results:
+            c.pop("bbox", None)
+
+    if max_dets_per_image is None:
+        max_dets_per_image = 300  # Default for LVIS dataset
+
+    from lvis import LVISEval, LVISResults
+
+    logger.info(f"Evaluating with max detections per image = {max_dets_per_image}")
+    lvis_results = LVISResults(lvis_gt, lvis_results, max_dets=max_dets_per_image)
+    lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type)
+    lvis_eval.run()
+    lvis_eval.print_results()
+
+    # Pull the standard metrics from the LVIS results
+    results = lvis_eval.get_results()
+    results = {metric: float(results[metric] * 100) for metric in metrics}
+    logger.info("Evaluation results for {}: \n".format(iou_type) + create_small_table(results))
+    return results
diff --git a/detectron2/evaluation/panoptic_evaluation.py b/detectron2/evaluation/panoptic_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fb3462b7f9abf6feaa499976bfed526ebd17e31
--- /dev/null
+++ b/detectron2/evaluation/panoptic_evaluation.py
@@ -0,0 +1,199 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import contextlib
+import io
+import itertools
+import json
+import logging
+import numpy as np
+import os
+import tempfile
+from collections import OrderedDict
+from typing import Optional
+from PIL import Image
+from tabulate import tabulate
+
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+from detectron2.utils.file_io import PathManager
+
+from .evaluator import DatasetEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class COCOPanopticEvaluator(DatasetEvaluator):
+    """
+    Evaluate Panoptic Quality metrics on COCO using PanopticAPI.
+    It saves panoptic segmentation prediction in `output_dir`
+
+    It contains a synchronize call and has to be called from all workers.
+    """
+
+    def __init__(self, dataset_name: str, output_dir: Optional[str] = None):
+        """
+        Args:
+            dataset_name: name of the dataset
+            output_dir: output directory to save results for evaluation.
+        """
+        self._metadata = MetadataCatalog.get(dataset_name)
+        self._thing_contiguous_id_to_dataset_id = {
+            v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+        }
+        self._stuff_contiguous_id_to_dataset_id = {
+            v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items()
+        }
+
+        self._output_dir = output_dir
+        if self._output_dir is not None:
+            PathManager.mkdirs(self._output_dir)
+
+    def reset(self):
+        self._predictions = []
+
+    def _convert_category_id(self, segment_info):
+        isthing = segment_info.pop("isthing", None)
+        if isthing is None:
+            # the model produces panoptic category id directly. No more conversion needed
+            return segment_info
+        if isthing is True:
+            segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[
+                segment_info["category_id"]
+            ]
+        else:
+            segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[
+                segment_info["category_id"]
+            ]
+        return segment_info
+
+    def process(self, inputs, outputs):
+        from panopticapi.utils import id2rgb
+
+        for input, output in zip(inputs, outputs):
+            panoptic_img, segments_info = output["panoptic_seg"]
+            panoptic_img = panoptic_img.cpu().numpy()
+            if segments_info is None:
+                # If "segments_info" is None, we assume "panoptic_img" is a
+                # H*W int32 image storing the panoptic_id in the format of
+                # category_id * label_divisor + instance_id. We reserve -1 for
+                # VOID label, and add 1 to panoptic_img since the official
+                # evaluation script uses 0 for VOID label.
+                label_divisor = self._metadata.label_divisor
+                segments_info = []
+                for panoptic_label in np.unique(panoptic_img):
+                    if panoptic_label == -1:
+                        # VOID region.
+                        continue
+                    pred_class = panoptic_label // label_divisor
+                    isthing = (
+                        pred_class in self._metadata.thing_dataset_id_to_contiguous_id.values()
+                    )
+                    segments_info.append(
+                        {
+                            "id": int(panoptic_label) + 1,
+                            "category_id": int(pred_class),
+                            "isthing": bool(isthing),
+                        }
+                    )
+                # Official evaluation script uses 0 for VOID label.
+                panoptic_img += 1
+
+            file_name = os.path.basename(input["file_name"])
+            file_name_png = os.path.splitext(file_name)[0] + ".png"
+            with io.BytesIO() as out:
+                Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG")
+                segments_info = [self._convert_category_id(x) for x in segments_info]
+                self._predictions.append(
+                    {
+                        "image_id": input["image_id"],
+                        "file_name": file_name_png,
+                        "png_string": out.getvalue(),
+                        "segments_info": segments_info,
+                    }
+                )
+
+    def evaluate(self):
+        comm.synchronize()
+
+        self._predictions = comm.gather(self._predictions)
+        self._predictions = list(itertools.chain(*self._predictions))
+        if not comm.is_main_process():
+            return
+
+        # PanopticApi requires local files
+        gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
+        gt_folder = PathManager.get_local_path(self._metadata.panoptic_root)
+
+        with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
+            logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
+            for p in self._predictions:
+                with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
+                    f.write(p.pop("png_string"))
+
+            with open(gt_json, "r") as f:
+                json_data = json.load(f)
+            json_data["annotations"] = self._predictions
+
+            output_dir = self._output_dir or pred_dir
+            predictions_json = os.path.join(output_dir, "predictions.json")
+            with PathManager.open(predictions_json, "w") as f:
+                f.write(json.dumps(json_data))
+
+            from panopticapi.evaluation import pq_compute
+
+            with contextlib.redirect_stdout(io.StringIO()):
+                pq_res = pq_compute(
+                    gt_json,
+                    PathManager.get_local_path(predictions_json),
+                    gt_folder=gt_folder,
+                    pred_folder=pred_dir,
+                )
+
+        res = {}
+        res["PQ"] = 100 * pq_res["All"]["pq"]
+        res["SQ"] = 100 * pq_res["All"]["sq"]
+        res["RQ"] = 100 * pq_res["All"]["rq"]
+        res["PQ_th"] = 100 * pq_res["Things"]["pq"]
+        res["SQ_th"] = 100 * pq_res["Things"]["sq"]
+        res["RQ_th"] = 100 * pq_res["Things"]["rq"]
+        res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
+        res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
+        res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]
+
+        results = OrderedDict({"panoptic_seg": res})
+        _print_panoptic_results(pq_res)
+
+        return results
+
+
+def _print_panoptic_results(pq_res):
+    headers = ["", "PQ", "SQ", "RQ", "#categories"]
+    data = []
+    for name in ["All", "Things", "Stuff"]:
+        row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]]
+        data.append(row)
+    table = tabulate(
+        data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center"
+    )
+    logger.info("Panoptic Evaluation Results:\n" + table)
+
+
+if __name__ == "__main__":
+    from detectron2.utils.logger import setup_logger
+
+    logger = setup_logger()
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--gt-json")
+    parser.add_argument("--gt-dir")
+    parser.add_argument("--pred-json")
+    parser.add_argument("--pred-dir")
+    args = parser.parse_args()
+
+    from panopticapi.evaluation import pq_compute
+
+    with contextlib.redirect_stdout(io.StringIO()):
+        pq_res = pq_compute(
+            args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir
+        )
+        _print_panoptic_results(pq_res)
diff --git a/detectron2/evaluation/pascal_voc_evaluation.py b/detectron2/evaluation/pascal_voc_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..88bb42e6f75f5f0faa4b774ddf16938477a37d2b
--- /dev/null
+++ b/detectron2/evaluation/pascal_voc_evaluation.py
@@ -0,0 +1,300 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import logging
+import numpy as np
+import os
+import tempfile
+import xml.etree.ElementTree as ET
+from collections import OrderedDict, defaultdict
+from functools import lru_cache
+import torch
+
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+from detectron2.utils.file_io import PathManager
+
+from .evaluator import DatasetEvaluator
+
+
+class PascalVOCDetectionEvaluator(DatasetEvaluator):
+    """
+    Evaluate Pascal VOC style AP for Pascal VOC dataset.
+    It contains a synchronization, therefore has to be called from all ranks.
+
+    Note that the concept of AP can be implemented in different ways and may not
+    produce identical results. This class mimics the implementation of the official
+    Pascal VOC Matlab API, and should produce similar but not identical results to the
+    official API.
+    """
+
+    def __init__(self, dataset_name):
+        """
+        Args:
+            dataset_name (str): name of the dataset, e.g., "voc_2007_test"
+        """
+        self._dataset_name = dataset_name
+        meta = MetadataCatalog.get(dataset_name)
+
+        # Too many tiny files, download all to local for speed.
+        annotation_dir_local = PathManager.get_local_path(
+            os.path.join(meta.dirname, "Annotations/")
+        )
+        self._anno_file_template = os.path.join(annotation_dir_local, "{}.xml")
+        self._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt")
+        self._class_names = meta.thing_classes
+        assert meta.year in [2007, 2012], meta.year
+        self._is_2007 = meta.year == 2007
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+    def reset(self):
+        self._predictions = defaultdict(list)  # class name -> list of prediction strings
+
+    def process(self, inputs, outputs):
+        for input, output in zip(inputs, outputs):
+            image_id = input["image_id"]
+            instances = output["instances"].to(self._cpu_device)
+            boxes = instances.pred_boxes.tensor.numpy()
+            scores = instances.scores.tolist()
+            classes = instances.pred_classes.tolist()
+            for box, score, cls in zip(boxes, scores, classes):
+                xmin, ymin, xmax, ymax = box
+                # The inverse of data loading logic in `datasets/pascal_voc.py`
+                xmin += 1
+                ymin += 1
+                self._predictions[cls].append(
+                    f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}"
+                )
+
+    def evaluate(self):
+        """
+        Returns:
+            dict: has a key "segm", whose value is a dict of "AP", "AP50", and "AP75".
+        """
+        all_predictions = comm.gather(self._predictions, dst=0)
+        if not comm.is_main_process():
+            return
+        predictions = defaultdict(list)
+        for predictions_per_rank in all_predictions:
+            for clsid, lines in predictions_per_rank.items():
+                predictions[clsid].extend(lines)
+        del all_predictions
+
+        self._logger.info(
+            "Evaluating {} using {} metric. "
+            "Note that results do not use the official Matlab API.".format(
+                self._dataset_name, 2007 if self._is_2007 else 2012
+            )
+        )
+
+        with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
+            res_file_template = os.path.join(dirname, "{}.txt")
+
+            aps = defaultdict(list)  # iou -> ap per class
+            for cls_id, cls_name in enumerate(self._class_names):
+                lines = predictions.get(cls_id, [""])
+
+                with open(res_file_template.format(cls_name), "w") as f:
+                    f.write("\n".join(lines))
+
+                for thresh in range(50, 100, 5):
+                    rec, prec, ap = voc_eval(
+                        res_file_template,
+                        self._anno_file_template,
+                        self._image_set_path,
+                        cls_name,
+                        ovthresh=thresh / 100.0,
+                        use_07_metric=self._is_2007,
+                    )
+                    aps[thresh].append(ap * 100)
+
+        ret = OrderedDict()
+        mAP = {iou: np.mean(x) for iou, x in aps.items()}
+        ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]}
+        return ret
+
+
+##############################################################################
+#
+# Below code is modified from
+# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+
+"""Python implementation of the PASCAL VOC devkit's AP evaluation code."""
+
+
+@lru_cache(maxsize=None)
+def parse_rec(filename):
+    """Parse a PASCAL VOC xml file."""
+    with PathManager.open(filename) as f:
+        tree = ET.parse(f)
+    objects = []
+    for obj in tree.findall("object"):
+        obj_struct = {}
+        obj_struct["name"] = obj.find("name").text
+        obj_struct["pose"] = obj.find("pose").text
+        obj_struct["truncated"] = int(obj.find("truncated").text)
+        obj_struct["difficult"] = int(obj.find("difficult").text)
+        bbox = obj.find("bndbox")
+        obj_struct["bbox"] = [
+            int(bbox.find("xmin").text),
+            int(bbox.find("ymin").text),
+            int(bbox.find("xmax").text),
+            int(bbox.find("ymax").text),
+        ]
+        objects.append(obj_struct)
+
+    return objects
+
+
+def voc_ap(rec, prec, use_07_metric=False):
+    """Compute VOC AP given precision and recall. If use_07_metric is true, uses
+    the VOC 07 11-point method (default:False).
+    """
+    if use_07_metric:
+        # 11 point metric
+        ap = 0.0
+        for t in np.arange(0.0, 1.1, 0.1):
+            if np.sum(rec >= t) == 0:
+                p = 0
+            else:
+                p = np.max(prec[rec >= t])
+            ap = ap + p / 11.0
+    else:
+        # correct AP calculation
+        # first append sentinel values at the end
+        mrec = np.concatenate(([0.0], rec, [1.0]))
+        mpre = np.concatenate(([0.0], prec, [0.0]))
+
+        # compute the precision envelope
+        for i in range(mpre.size - 1, 0, -1):
+            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
+
+        # to calculate area under PR curve, look for points
+        # where X axis (recall) changes value
+        i = np.where(mrec[1:] != mrec[:-1])[0]
+
+        # and sum (\Delta recall) * prec
+        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
+    return ap
+
+
+def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False):
+    """rec, prec, ap = voc_eval(detpath,
+                                annopath,
+                                imagesetfile,
+                                classname,
+                                [ovthresh],
+                                [use_07_metric])
+
+    Top level function that does the PASCAL VOC evaluation.
+
+    detpath: Path to detections
+        detpath.format(classname) should produce the detection results file.
+    annopath: Path to annotations
+        annopath.format(imagename) should be the xml annotations file.
+    imagesetfile: Text file containing the list of images, one image per line.
+    classname: Category name (duh)
+    [ovthresh]: Overlap threshold (default = 0.5)
+    [use_07_metric]: Whether to use VOC07's 11 point AP computation
+        (default False)
+    """
+    # assumes detections are in detpath.format(classname)
+    # assumes annotations are in annopath.format(imagename)
+    # assumes imagesetfile is a text file with each line an image name
+
+    # first load gt
+    # read list of images
+    with PathManager.open(imagesetfile, "r") as f:
+        lines = f.readlines()
+    imagenames = [x.strip() for x in lines]
+
+    # load annots
+    recs = {}
+    for imagename in imagenames:
+        recs[imagename] = parse_rec(annopath.format(imagename))
+
+    # extract gt objects for this class
+    class_recs = {}
+    npos = 0
+    for imagename in imagenames:
+        R = [obj for obj in recs[imagename] if obj["name"] == classname]
+        bbox = np.array([x["bbox"] for x in R])
+        difficult = np.array([x["difficult"] for x in R]).astype(bool)
+        # difficult = np.array([False for x in R]).astype(bool)  # treat all "difficult" as GT
+        det = [False] * len(R)
+        npos = npos + sum(~difficult)
+        class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det}
+
+    # read dets
+    detfile = detpath.format(classname)
+    with open(detfile, "r") as f:
+        lines = f.readlines()
+
+    splitlines = [x.strip().split(" ") for x in lines]
+    image_ids = [x[0] for x in splitlines]
+    confidence = np.array([float(x[1]) for x in splitlines])
+    BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4)
+
+    # sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+
+    # go down dets and mark TPs and FPs
+    nd = len(image_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+    for d in range(nd):
+        R = class_recs[image_ids[d]]
+        bb = BB[d, :].astype(float)
+        ovmax = -np.inf
+        BBGT = R["bbox"].astype(float)
+
+        if BBGT.size > 0:
+            # compute overlaps
+            # intersection
+            ixmin = np.maximum(BBGT[:, 0], bb[0])
+            iymin = np.maximum(BBGT[:, 1], bb[1])
+            ixmax = np.minimum(BBGT[:, 2], bb[2])
+            iymax = np.minimum(BBGT[:, 3], bb[3])
+            iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
+            ih = np.maximum(iymax - iymin + 1.0, 0.0)
+            inters = iw * ih
+
+            # union
+            uni = (
+                (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
+                + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
+                - inters
+            )
+
+            overlaps = inters / uni
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+
+        if ovmax > ovthresh:
+            if not R["difficult"][jmax]:
+                if not R["det"][jmax]:
+                    tp[d] = 1.0
+                    R["det"][jmax] = 1
+                else:
+                    fp[d] = 1.0
+        else:
+            fp[d] = 1.0
+
+    # compute precision recall
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    rec = tp / float(npos)
+    # avoid divide by zero in case the first detection matches a difficult
+    # ground truth
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+
+    return rec, prec, ap
diff --git a/detectron2/evaluation/rotated_coco_evaluation.py b/detectron2/evaluation/rotated_coco_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9782558881c9ad651accf8ac57ae158f3e46a96
--- /dev/null
+++ b/detectron2/evaluation/rotated_coco_evaluation.py
@@ -0,0 +1,209 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import json
+import numpy as np
+import os
+import torch
+from pycocotools.cocoeval import COCOeval, maskUtils
+
+from detectron2.structures import BoxMode, RotatedBoxes, pairwise_iou_rotated
+from detectron2.utils.file_io import PathManager
+
+from .coco_evaluation import COCOEvaluator
+
+
+class RotatedCOCOeval(COCOeval):
+    @staticmethod
+    def is_rotated(box_list):
+        if type(box_list) == np.ndarray:
+            return box_list.shape[1] == 5
+        elif type(box_list) == list:
+            if box_list == []:  # cannot decide the box_dim
+                return False
+            return np.all(
+                np.array(
+                    [
+                        (len(obj) == 5) and ((type(obj) == list) or (type(obj) == np.ndarray))
+                        for obj in box_list
+                    ]
+                )
+            )
+        return False
+
+    @staticmethod
+    def boxlist_to_tensor(boxlist, output_box_dim):
+        if type(boxlist) == np.ndarray:
+            box_tensor = torch.from_numpy(boxlist)
+        elif type(boxlist) == list:
+            if boxlist == []:
+                return torch.zeros((0, output_box_dim), dtype=torch.float32)
+            else:
+                box_tensor = torch.FloatTensor(boxlist)
+        else:
+            raise Exception("Unrecognized boxlist type")
+
+        input_box_dim = box_tensor.shape[1]
+        if input_box_dim != output_box_dim:
+            if input_box_dim == 4 and output_box_dim == 5:
+                box_tensor = BoxMode.convert(box_tensor, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS)
+            else:
+                raise Exception(
+                    "Unable to convert from {}-dim box to {}-dim box".format(
+                        input_box_dim, output_box_dim
+                    )
+                )
+        return box_tensor
+
+    def compute_iou_dt_gt(self, dt, gt, is_crowd):
+        if self.is_rotated(dt) or self.is_rotated(gt):
+            # TODO: take is_crowd into consideration
+            assert all(c == 0 for c in is_crowd)
+            dt = RotatedBoxes(self.boxlist_to_tensor(dt, output_box_dim=5))
+            gt = RotatedBoxes(self.boxlist_to_tensor(gt, output_box_dim=5))
+            return pairwise_iou_rotated(dt, gt)
+        else:
+            # This is the same as the classical COCO evaluation
+            return maskUtils.iou(dt, gt, is_crowd)
+
+    def computeIoU(self, imgId: int, catId: int):
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+
+        if len(gt) == 0 or len(dt) == 0:
+            return []
+
+        inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
+        dt = [dt[i] for i in inds]
+        if len(dt) > p.maxDets[-1]:
+            dt = dt[0 : p.maxDets[-1]]
+
+        assert p.iouType == "bbox", "unsupported iouType for iou computation"
+
+        g = [g["bbox"] for g in gt]
+        d = [d["bbox"] for d in dt]
+
+        # compute iou between each dt and gt region
+        iscrowd = [int(o["iscrowd"]) for o in gt]
+
+        # Note: this function is copied from cocoeval.py in cocoapi
+        # and the major difference is here.
+        ious = self.compute_iou_dt_gt(d, g, iscrowd)
+        return ious
+
+
+class RotatedCOCOEvaluator(COCOEvaluator):
+    """
+    Evaluate object proposal/instance detection outputs using COCO-like metrics and APIs,
+    with rotated boxes support.
+    Note: this uses IOU only and does not consider angle differences.
+    """
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a COCO model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+        """
+        for input, output in zip(inputs, outputs):
+            prediction = {"image_id": input["image_id"]}
+
+            if "instances" in output:
+                instances = output["instances"].to(self._cpu_device)
+
+                prediction["instances"] = self.instances_to_json(instances, input["image_id"])
+            if "proposals" in output:
+                prediction["proposals"] = output["proposals"].to(self._cpu_device)
+            self._predictions.append(prediction)
+
+    def instances_to_json(self, instances, img_id):
+        num_instance = len(instances)
+        if num_instance == 0:
+            return []
+
+        boxes = instances.pred_boxes.tensor.numpy()
+        if boxes.shape[1] == 4:
+            boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+        boxes = boxes.tolist()
+        scores = instances.scores.tolist()
+        classes = instances.pred_classes.tolist()
+
+        results = []
+        for k in range(num_instance):
+            result = {
+                "image_id": img_id,
+                "category_id": classes[k],
+                "bbox": boxes[k],
+                "score": scores[k],
+            }
+
+            results.append(result)
+        return results
+
+    def _eval_predictions(self, predictions, img_ids=None):  # img_ids: unused
+        """
+        Evaluate predictions on the given tasks.
+        Fill self._results with the metrics of the tasks.
+        """
+        self._logger.info("Preparing results for COCO format ...")
+        coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
+
+        # unmap the category ids for COCO
+        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+            reverse_id_mapping = {
+                v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+            }
+            for result in coco_results:
+                result["category_id"] = reverse_id_mapping[result["category_id"]]
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
+            self._logger.info("Saving results to {}".format(file_path))
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(coco_results))
+                f.flush()
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating predictions ...")
+
+        assert self._tasks is None or set(self._tasks) == {
+            "bbox"
+        }, "[RotatedCOCOEvaluator] Only bbox evaluation is supported"
+        coco_eval = (
+            self._evaluate_predictions_on_coco(self._coco_api, coco_results)
+            if len(coco_results) > 0
+            else None  # cocoapi does not handle empty results very well
+        )
+
+        task = "bbox"
+        res = self._derive_coco_results(
+            coco_eval, task, class_names=self._metadata.get("thing_classes")
+        )
+        self._results[task] = res
+
+    def _evaluate_predictions_on_coco(self, coco_gt, coco_results):
+        """
+        Evaluate the coco results using COCOEval API.
+        """
+        assert len(coco_results) > 0
+
+        coco_dt = coco_gt.loadRes(coco_results)
+
+        # Only bbox is supported for now
+        coco_eval = RotatedCOCOeval(coco_gt, coco_dt, iouType="bbox")
+
+        coco_eval.evaluate()
+        coco_eval.accumulate()
+        coco_eval.summarize()
+
+        return coco_eval
diff --git a/detectron2/evaluation/sem_seg_evaluation.py b/detectron2/evaluation/sem_seg_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..f87043b02f08777c4bea801eb1a9bcb1da747774
--- /dev/null
+++ b/detectron2/evaluation/sem_seg_evaluation.py
@@ -0,0 +1,265 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import json
+import logging
+import numpy as np
+import os
+from collections import OrderedDict
+from typing import Optional, Union
+import pycocotools.mask as mask_util
+import torch
+from PIL import Image
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.utils.comm import all_gather, is_main_process, synchronize
+from detectron2.utils.file_io import PathManager
+
+from .evaluator import DatasetEvaluator
+
+_CV2_IMPORTED = True
+try:
+    import cv2  # noqa
+except ImportError:
+    # OpenCV is an optional dependency at the moment
+    _CV2_IMPORTED = False
+
+
+def load_image_into_numpy_array(
+    filename: str,
+    copy: bool = False,
+    dtype: Optional[Union[np.dtype, str]] = None,
+) -> np.ndarray:
+    with PathManager.open(filename, "rb") as f:
+        array = np.array(Image.open(f), copy=copy, dtype=dtype)
+    return array
+
+
+class SemSegEvaluator(DatasetEvaluator):
+    """
+    Evaluate semantic segmentation metrics.
+    """
+
+    def __init__(
+        self,
+        dataset_name,
+        distributed=True,
+        output_dir=None,
+        *,
+        sem_seg_loading_fn=load_image_into_numpy_array,
+        num_classes=None,
+        ignore_label=None,
+    ):
+        """
+        Args:
+            dataset_name (str): name of the dataset to be evaluated.
+            distributed (bool): if True, will collect results from all ranks for evaluation.
+                Otherwise, will evaluate the results in the current process.
+            output_dir (str): an output directory to dump results.
+            sem_seg_loading_fn: function to read sem seg file and load into numpy array.
+                Default provided, but projects can customize.
+            num_classes, ignore_label: deprecated argument
+        """
+        self._logger = logging.getLogger(__name__)
+        if num_classes is not None:
+            self._logger.warn(
+                "SemSegEvaluator(num_classes) is deprecated! It should be obtained from metadata."
+            )
+        if ignore_label is not None:
+            self._logger.warn(
+                "SemSegEvaluator(ignore_label) is deprecated! It should be obtained from metadata."
+            )
+        self._dataset_name = dataset_name
+        self._distributed = distributed
+        self._output_dir = output_dir
+
+        self._cpu_device = torch.device("cpu")
+
+        self.input_file_to_gt_file = {
+            dataset_record["file_name"]: dataset_record["sem_seg_file_name"]
+            for dataset_record in DatasetCatalog.get(dataset_name)
+        }
+
+        meta = MetadataCatalog.get(dataset_name)
+        # Dict that maps contiguous training ids to COCO category ids
+        try:
+            c2d = meta.stuff_dataset_id_to_contiguous_id
+            self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()}
+        except AttributeError:
+            self._contiguous_id_to_dataset_id = None
+        self._class_names = meta.stuff_classes
+        self.sem_seg_loading_fn = sem_seg_loading_fn
+        self._num_classes = len(meta.stuff_classes)
+        if num_classes is not None:
+            assert self._num_classes == num_classes, f"{self._num_classes} != {num_classes}"
+        self._ignore_label = ignore_label if ignore_label is not None else meta.ignore_label
+
+        # This is because cv2.erode did not work for int datatype. Only works for uint8.
+        self._compute_boundary_iou = True
+        if not _CV2_IMPORTED:
+            self._compute_boundary_iou = False
+            self._logger.warn(
+                """Boundary IoU calculation requires OpenCV. B-IoU metrics are
+                not going to be computed because OpenCV is not available to import."""
+            )
+        if self._num_classes >= np.iinfo(np.uint8).max:
+            self._compute_boundary_iou = False
+            self._logger.warn(
+                f"""SemSegEvaluator(num_classes) is more than supported value for Boundary IoU calculation!
+                B-IoU metrics are not going to be computed. Max allowed value (exclusive)
+                for num_classes for calculating Boundary IoU is {np.iinfo(np.uint8).max}.
+                The number of classes of dataset {self._dataset_name} is {self._num_classes}"""
+            )
+
+    def reset(self):
+        self._conf_matrix = np.zeros((self._num_classes + 1, self._num_classes + 1), dtype=np.int64)
+        self._b_conf_matrix = np.zeros(
+            (self._num_classes + 1, self._num_classes + 1), dtype=np.int64
+        )
+        self._predictions = []
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a model.
+                It is a list of dicts. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name".
+            outputs: the outputs of a model. It is either list of semantic segmentation predictions
+                (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
+                segmentation prediction in the same format.
+        """
+        for input, output in zip(inputs, outputs):
+            output = output["sem_seg"].argmax(dim=0).to(self._cpu_device)
+            pred = np.array(output, dtype=int)
+            gt_filename = self.input_file_to_gt_file[input["file_name"]]
+            gt = self.sem_seg_loading_fn(gt_filename, dtype=int)
+
+            gt[gt == self._ignore_label] = self._num_classes
+
+            self._conf_matrix += np.bincount(
+                (self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
+                minlength=self._conf_matrix.size,
+            ).reshape(self._conf_matrix.shape)
+
+            if self._compute_boundary_iou:
+                b_gt = self._mask_to_boundary(gt.astype(np.uint8))
+                b_pred = self._mask_to_boundary(pred.astype(np.uint8))
+
+                self._b_conf_matrix += np.bincount(
+                    (self._num_classes + 1) * b_pred.reshape(-1) + b_gt.reshape(-1),
+                    minlength=self._conf_matrix.size,
+                ).reshape(self._conf_matrix.shape)
+
+            self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))
+
+    def evaluate(self):
+        """
+        Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):
+
+        * Mean intersection-over-union averaged across classes (mIoU)
+        * Frequency Weighted IoU (fwIoU)
+        * Mean pixel accuracy averaged across classes (mACC)
+        * Pixel Accuracy (pACC)
+        """
+        if self._distributed:
+            synchronize()
+            conf_matrix_list = all_gather(self._conf_matrix)
+            b_conf_matrix_list = all_gather(self._b_conf_matrix)
+            self._predictions = all_gather(self._predictions)
+            self._predictions = list(itertools.chain(*self._predictions))
+            if not is_main_process():
+                return
+
+            self._conf_matrix = np.zeros_like(self._conf_matrix)
+            for conf_matrix in conf_matrix_list:
+                self._conf_matrix += conf_matrix
+
+            self._b_conf_matrix = np.zeros_like(self._b_conf_matrix)
+            for b_conf_matrix in b_conf_matrix_list:
+                self._b_conf_matrix += b_conf_matrix
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(self._predictions))
+
+        acc = np.full(self._num_classes, np.nan, dtype=float)
+        iou = np.full(self._num_classes, np.nan, dtype=float)
+        tp = self._conf_matrix.diagonal()[:-1].astype(float)
+        pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
+        class_weights = pos_gt / np.sum(pos_gt)
+        pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
+        acc_valid = pos_gt > 0
+        acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
+        union = pos_gt + pos_pred - tp
+        iou_valid = np.logical_and(acc_valid, union > 0)
+        iou[iou_valid] = tp[iou_valid] / union[iou_valid]
+        macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
+        miou = np.sum(iou[iou_valid]) / np.sum(iou_valid)
+        fiou = np.sum(iou[iou_valid] * class_weights[iou_valid])
+        pacc = np.sum(tp) / np.sum(pos_gt)
+
+        if self._compute_boundary_iou:
+            b_iou = np.full(self._num_classes, np.nan, dtype=float)
+            b_tp = self._b_conf_matrix.diagonal()[:-1].astype(float)
+            b_pos_gt = np.sum(self._b_conf_matrix[:-1, :-1], axis=0).astype(float)
+            b_pos_pred = np.sum(self._b_conf_matrix[:-1, :-1], axis=1).astype(float)
+            b_union = b_pos_gt + b_pos_pred - b_tp
+            b_iou_valid = b_union > 0
+            b_iou[b_iou_valid] = b_tp[b_iou_valid] / b_union[b_iou_valid]
+
+        res = {}
+        res["mIoU"] = 100 * miou
+        res["fwIoU"] = 100 * fiou
+        for i, name in enumerate(self._class_names):
+            res[f"IoU-{name}"] = 100 * iou[i]
+            if self._compute_boundary_iou:
+                res[f"BoundaryIoU-{name}"] = 100 * b_iou[i]
+                res[f"min(IoU, B-Iou)-{name}"] = 100 * min(iou[i], b_iou[i])
+        res["mACC"] = 100 * macc
+        res["pACC"] = 100 * pacc
+        for i, name in enumerate(self._class_names):
+            res[f"ACC-{name}"] = 100 * acc[i]
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(res, f)
+        results = OrderedDict({"sem_seg": res})
+        self._logger.info(results)
+        return results
+
+    def encode_json_sem_seg(self, sem_seg, input_file_name):
+        """
+        Convert semantic segmentation to COCO stuff format with segments encoded as RLEs.
+        See http://cocodataset.org/#format-results
+        """
+        json_list = []
+        for label in np.unique(sem_seg):
+            if self._contiguous_id_to_dataset_id is not None:
+                assert (
+                    label in self._contiguous_id_to_dataset_id
+                ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name)
+                dataset_id = self._contiguous_id_to_dataset_id[label]
+            else:
+                dataset_id = int(label)
+            mask = (sem_seg == label).astype(np.uint8)
+            mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0]
+            mask_rle["counts"] = mask_rle["counts"].decode("utf-8")
+            json_list.append(
+                {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle}
+            )
+        return json_list
+
+    def _mask_to_boundary(self, mask: np.ndarray, dilation_ratio=0.02):
+        assert mask.ndim == 2, "mask_to_boundary expects a 2-dimensional image"
+        h, w = mask.shape
+        diag_len = np.sqrt(h**2 + w**2)
+        dilation = max(1, int(round(dilation_ratio * diag_len)))
+        kernel = np.ones((3, 3), dtype=np.uint8)
+
+        padded_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0)
+        eroded_mask_with_padding = cv2.erode(padded_mask, kernel, iterations=dilation)
+        eroded_mask = eroded_mask_with_padding[1:-1, 1:-1]
+        boundary = mask - eroded_mask
+        return boundary
diff --git a/detectron2/evaluation/testing.py b/detectron2/evaluation/testing.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e5ae625bb0593fc20739dd3ea549157e4df4f3d
--- /dev/null
+++ b/detectron2/evaluation/testing.py
@@ -0,0 +1,85 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+import pprint
+import sys
+from collections.abc import Mapping
+
+
+def print_csv_format(results):
+    """
+    Print main metrics in a format similar to Detectron,
+    so that they are easy to copypaste into a spreadsheet.
+
+    Args:
+        results (OrderedDict[dict]): task_name -> {metric -> score}
+            unordered dict can also be printed, but in arbitrary order
+    """
+    assert isinstance(results, Mapping) or not len(results), results
+    logger = logging.getLogger(__name__)
+    for task, res in results.items():
+        if isinstance(res, Mapping):
+            # Don't print "AP-category" metrics since they are usually not tracked.
+            important_res = [(k, v) for k, v in res.items() if "-" not in k]
+            logger.info("copypaste: Task: {}".format(task))
+            logger.info("copypaste: " + ",".join([k[0] for k in important_res]))
+            logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res]))
+        else:
+            logger.info(f"copypaste: {task}={res}")
+
+
+def verify_results(cfg, results):
+    """
+    Args:
+        results (OrderedDict[dict]): task_name -> {metric -> score}
+
+    Returns:
+        bool: whether the verification succeeds or not
+    """
+    expected_results = cfg.TEST.EXPECTED_RESULTS
+    if not len(expected_results):
+        return True
+
+    ok = True
+    for task, metric, expected, tolerance in expected_results:
+        actual = results[task].get(metric, None)
+        if actual is None:
+            ok = False
+            continue
+        if not np.isfinite(actual):
+            ok = False
+            continue
+        diff = abs(actual - expected)
+        if diff > tolerance:
+            ok = False
+
+    logger = logging.getLogger(__name__)
+    if not ok:
+        logger.error("Result verification failed!")
+        logger.error("Expected Results: " + str(expected_results))
+        logger.error("Actual Results: " + pprint.pformat(results))
+
+        sys.exit(1)
+    else:
+        logger.info("Results verification passed.")
+    return ok
+
+
+def flatten_results_dict(results):
+    """
+    Expand a hierarchical dict of scalars into a flat dict of scalars.
+    If results[k1][k2][k3] = v, the returned dict will have the entry
+    {"k1/k2/k3": v}.
+
+    Args:
+        results (dict):
+    """
+    r = {}
+    for k, v in results.items():
+        if isinstance(v, Mapping):
+            v = flatten_results_dict(v)
+            for kk, vv in v.items():
+                r[k + "/" + kk] = vv
+        else:
+            r[k] = v
+    return r
diff --git a/detectron2/export/README.md b/detectron2/export/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c86ff62516f4e8e4b1a6c1f33f11192933cf3861
--- /dev/null
+++ b/detectron2/export/README.md
@@ -0,0 +1,15 @@
+
+This directory contains code to prepare a detectron2 model for deployment.
+Currently it supports exporting a detectron2 model to TorchScript, ONNX, or (deprecated) Caffe2 format.
+
+Please see [documentation](https://detectron2.readthedocs.io/tutorials/deployment.html) for its usage.
+
+
+### Acknowledgements
+
+Thanks to Mobile Vision team at Facebook for developing the Caffe2 conversion tools.
+
+Thanks to Computing Platform Department - PAI team at Alibaba Group (@bddpqq, @chenbohua3) who
+help export Detectron2 models to TorchScript.
+
+Thanks to ONNX Converter team at Microsoft who help export Detectron2 models to ONNX.
diff --git a/detectron2/export/__init__.py b/detectron2/export/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a58758f64aae6071fa688be4400622ce6036efa
--- /dev/null
+++ b/detectron2/export/__init__.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+
+import warnings
+
+from .flatten import TracingAdapter
+from .torchscript import dump_torchscript_IR, scripting_with_instances
+
+try:
+    from caffe2.proto import caffe2_pb2 as _tmp
+    from caffe2.python import core
+
+    # caffe2 is optional
+except ImportError:
+    pass
+else:
+    from .api import *
+
+
+# TODO: Update ONNX Opset version and run tests when a newer PyTorch is supported
+STABLE_ONNX_OPSET_VERSION = 11
+
+
+def add_export_config(cfg):
+    warnings.warn(
+        "add_export_config has been deprecated and behaves as no-op function.", DeprecationWarning
+    )
+    return cfg
+
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/export/api.py b/detectron2/export/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a272fed929217f18e04f731365f4bf7472110fc
--- /dev/null
+++ b/detectron2/export/api.py
@@ -0,0 +1,230 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import logging
+import os
+import torch
+from caffe2.proto import caffe2_pb2
+from torch import nn
+
+from detectron2.config import CfgNode
+from detectron2.utils.file_io import PathManager
+
+from .caffe2_inference import ProtobufDetectionModel
+from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
+from .shared import get_pb_arg_vali, get_pb_arg_vals, save_graph
+
+__all__ = [
+    "Caffe2Model",
+    "Caffe2Tracer",
+]
+
+
+class Caffe2Tracer:
+    """
+    Make a detectron2 model traceable with Caffe2 operators.
+    This class creates a traceable version of a detectron2 model which:
+
+    1. Rewrite parts of the model using ops in Caffe2. Note that some ops do
+       not have GPU implementation in Caffe2.
+    2. Remove post-processing and only produce raw layer outputs
+
+    After making a traceable model, the class provide methods to export such a
+    model to different deployment formats.
+    Exported graph produced by this class take two input tensors:
+
+    1. (1, C, H, W) float "data" which is an image (usually in [0, 255]).
+       (H, W) often has to be padded to multiple of 32 (depend on the model
+       architecture).
+    2. 1x3 float "im_info", each row of which is (height, width, 1.0).
+       Height and width are true image shapes before padding.
+
+    The class currently only supports models using builtin meta architectures.
+    Batch inference is not supported, and contributions are welcome.
+    """
+
+    def __init__(self, cfg: CfgNode, model: nn.Module, inputs):
+        """
+        Args:
+            cfg (CfgNode): a detectron2 config used to construct caffe2-compatible model.
+            model (nn.Module): An original pytorch model. Must be among a few official models
+                in detectron2 that can be converted to become caffe2-compatible automatically.
+                Weights have to be already loaded to this model.
+            inputs: sample inputs that the given model takes for inference.
+                Will be used to trace the model. For most models, random inputs with
+                no detected objects will not work as they lead to wrong traces.
+        """
+        assert isinstance(cfg, CfgNode), cfg
+        assert isinstance(model, torch.nn.Module), type(model)
+
+        # TODO make it support custom models, by passing in c2 model directly
+        C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE]
+        self.traceable_model = C2MetaArch(cfg, copy.deepcopy(model))
+        self.inputs = inputs
+        self.traceable_inputs = self.traceable_model.get_caffe2_inputs(inputs)
+
+    def export_caffe2(self):
+        """
+        Export the model to Caffe2's protobuf format.
+        The returned object can be saved with its :meth:`.save_protobuf()` method.
+        The result can be loaded and executed using Caffe2 runtime.
+
+        Returns:
+            :class:`Caffe2Model`
+        """
+        from .caffe2_export import export_caffe2_detection_model
+
+        predict_net, init_net = export_caffe2_detection_model(
+            self.traceable_model, self.traceable_inputs
+        )
+        return Caffe2Model(predict_net, init_net)
+
+    def export_onnx(self):
+        """
+        Export the model to ONNX format.
+        Note that the exported model contains custom ops only available in caffe2, therefore it
+        cannot be directly executed by other runtime (such as onnxruntime or TensorRT).
+        Post-processing or transformation passes may be applied on the model to accommodate
+        different runtimes, but we currently do not provide support for them.
+
+        Returns:
+            onnx.ModelProto: an onnx model.
+        """
+        from .caffe2_export import export_onnx_model as export_onnx_model_impl
+
+        return export_onnx_model_impl(self.traceable_model, (self.traceable_inputs,))
+
+    def export_torchscript(self):
+        """
+        Export the model to a ``torch.jit.TracedModule`` by tracing.
+        The returned object can be saved to a file by ``.save()``.
+
+        Returns:
+            torch.jit.TracedModule: a torch TracedModule
+        """
+        logger = logging.getLogger(__name__)
+        logger.info("Tracing the model with torch.jit.trace ...")
+        with torch.no_grad():
+            return torch.jit.trace(self.traceable_model, (self.traceable_inputs,))
+
+
+class Caffe2Model(nn.Module):
+    """
+    A wrapper around the traced model in Caffe2's protobuf format.
+    The exported graph has different inputs/outputs from the original Pytorch
+    model, as explained in :class:`Caffe2Tracer`. This class wraps around the
+    exported graph to simulate the same interface as the original Pytorch model.
+    It also provides functions to save/load models in Caffe2's format.'
+
+    Examples:
+    ::
+        c2_model = Caffe2Tracer(cfg, torch_model, inputs).export_caffe2()
+        inputs = [{"image": img_tensor_CHW}]
+        outputs = c2_model(inputs)
+        orig_outputs = torch_model(inputs)
+    """
+
+    def __init__(self, predict_net, init_net):
+        super().__init__()
+        self.eval()  # always in eval mode
+        self._predict_net = predict_net
+        self._init_net = init_net
+        self._predictor = None
+
+    __init__.__HIDE_SPHINX_DOC__ = True
+
+    @property
+    def predict_net(self):
+        """
+        caffe2.core.Net: the underlying caffe2 predict net
+        """
+        return self._predict_net
+
+    @property
+    def init_net(self):
+        """
+        caffe2.core.Net: the underlying caffe2 init net
+        """
+        return self._init_net
+
+    def save_protobuf(self, output_dir):
+        """
+        Save the model as caffe2's protobuf format.
+        It saves the following files:
+
+            * "model.pb": definition of the graph. Can be visualized with
+              tools like `netron <https://github.com/lutzroeder/netron>`_.
+            * "model_init.pb": model parameters
+            * "model.pbtxt": human-readable definition of the graph. Not
+              needed for deployment.
+
+        Args:
+            output_dir (str): the output directory to save protobuf files.
+        """
+        logger = logging.getLogger(__name__)
+        logger.info("Saving model to {} ...".format(output_dir))
+        if not PathManager.exists(output_dir):
+            PathManager.mkdirs(output_dir)
+
+        with PathManager.open(os.path.join(output_dir, "model.pb"), "wb") as f:
+            f.write(self._predict_net.SerializeToString())
+        with PathManager.open(os.path.join(output_dir, "model.pbtxt"), "w") as f:
+            f.write(str(self._predict_net))
+        with PathManager.open(os.path.join(output_dir, "model_init.pb"), "wb") as f:
+            f.write(self._init_net.SerializeToString())
+
+    def save_graph(self, output_file, inputs=None):
+        """
+        Save the graph as SVG format.
+
+        Args:
+            output_file (str): a SVG file
+            inputs: optional inputs given to the model.
+                If given, the inputs will be used to run the graph to record
+                shape of every tensor. The shape information will be
+                saved together with the graph.
+        """
+        from .caffe2_export import run_and_save_graph
+
+        if inputs is None:
+            save_graph(self._predict_net, output_file, op_only=False)
+        else:
+            size_divisibility = get_pb_arg_vali(self._predict_net, "size_divisibility", 0)
+            device = get_pb_arg_vals(self._predict_net, "device", b"cpu").decode("ascii")
+            inputs = convert_batched_inputs_to_c2_format(inputs, size_divisibility, device)
+            inputs = [x.cpu().numpy() for x in inputs]
+            run_and_save_graph(self._predict_net, self._init_net, inputs, output_file)
+
+    @staticmethod
+    def load_protobuf(dir):
+        """
+        Args:
+            dir (str): a directory used to save Caffe2Model with
+                :meth:`save_protobuf`.
+                The files "model.pb" and "model_init.pb" are needed.
+
+        Returns:
+            Caffe2Model: the caffe2 model loaded from this directory.
+        """
+        predict_net = caffe2_pb2.NetDef()
+        with PathManager.open(os.path.join(dir, "model.pb"), "rb") as f:
+            predict_net.ParseFromString(f.read())
+
+        init_net = caffe2_pb2.NetDef()
+        with PathManager.open(os.path.join(dir, "model_init.pb"), "rb") as f:
+            init_net.ParseFromString(f.read())
+
+        return Caffe2Model(predict_net, init_net)
+
+    def __call__(self, inputs):
+        """
+        An interface that wraps around a Caffe2 model and mimics detectron2's models'
+        input/output format. See details about the format at :doc:`/tutorials/models`.
+        This is used to compare the outputs of caffe2 model with its original torch model.
+
+        Due to the extra conversion between Pytorch/Caffe2, this method is not meant for
+        benchmark. Because of the conversion, this method also has dependency
+        on detectron2 in order to convert to detectron2's output format.
+        """
+        if self._predictor is None:
+            self._predictor = ProtobufDetectionModel(self._predict_net, self._init_net)
+        return self._predictor(inputs)
diff --git a/detectron2/export/c10.py b/detectron2/export/c10.py
new file mode 100644
index 0000000000000000000000000000000000000000..adbc62bea70b67f8ba6fef83f29826f165dc7c4d
--- /dev/null
+++ b/detectron2/export/c10.py
@@ -0,0 +1,571 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import math
+from typing import Dict
+import torch
+import torch.nn.functional as F
+
+from detectron2.layers import ShapeSpec, cat
+from detectron2.layers.roi_align_rotated import ROIAlignRotated
+from detectron2.modeling import poolers
+from detectron2.modeling.proposal_generator import rpn
+from detectron2.modeling.roi_heads.mask_head import mask_rcnn_inference
+from detectron2.structures import Boxes, ImageList, Instances, Keypoints, RotatedBoxes
+
+from .shared import alias, to_device
+
+
+"""
+This file contains caffe2-compatible implementation of several detectron2 components.
+"""
+
+
+class Caffe2Boxes(Boxes):
+    """
+    Representing a list of detectron2.structures.Boxes from minibatch, each box
+    is represented by a 5d vector (batch index + 4 coordinates), or a 6d vector
+    (batch index + 5 coordinates) for RotatedBoxes.
+    """
+
+    def __init__(self, tensor):
+        assert isinstance(tensor, torch.Tensor)
+        assert tensor.dim() == 2 and tensor.size(-1) in [4, 5, 6], tensor.size()
+        # TODO: make tensor immutable when dim is Nx5 for Boxes,
+        # and Nx6 for RotatedBoxes?
+        self.tensor = tensor
+
+
+# TODO clean up this class, maybe just extend Instances
+class InstancesList:
+    """
+    Tensor representation of a list of Instances object for a batch of images.
+
+    When dealing with a batch of images with Caffe2 ops, a list of bboxes
+    (instances) are usually represented by single Tensor with size
+    (sigma(Ni), 5) or (sigma(Ni), 4) plus a batch split Tensor. This class is
+    for providing common functions to convert between these two representations.
+    """
+
+    def __init__(self, im_info, indices, extra_fields=None):
+        # [N, 3] -> (H, W, Scale)
+        self.im_info = im_info
+        # [N,] -> indice of batch to which the instance belongs
+        self.indices = indices
+        # [N, ...]
+        self.batch_extra_fields = extra_fields or {}
+
+        self.image_size = self.im_info
+
+    def get_fields(self):
+        """like `get_fields` in the Instances object,
+        but return each field in tensor representations"""
+        ret = {}
+        for k, v in self.batch_extra_fields.items():
+            # if isinstance(v, torch.Tensor):
+            #     tensor_rep = v
+            # elif isinstance(v, (Boxes, Keypoints)):
+            #     tensor_rep = v.tensor
+            # else:
+            #     raise ValueError("Can't find tensor representation for: {}".format())
+            ret[k] = v
+        return ret
+
+    def has(self, name):
+        return name in self.batch_extra_fields
+
+    def set(self, name, value):
+        # len(tensor) is a bad practice that generates ONNX constants during tracing.
+        # Although not a problem for the `assert` statement below, torch ONNX exporter
+        # still raises a misleading warning as it does not this call comes from `assert`
+        if isinstance(value, Boxes):
+            data_len = value.tensor.shape[0]
+        elif isinstance(value, torch.Tensor):
+            data_len = value.shape[0]
+        else:
+            data_len = len(value)
+        if len(self.batch_extra_fields):
+            assert (
+                len(self) == data_len
+            ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self))
+        self.batch_extra_fields[name] = value
+
+    def __getattr__(self, name):
+        if name not in self.batch_extra_fields:
+            raise AttributeError("Cannot find field '{}' in the given Instances!".format(name))
+        return self.batch_extra_fields[name]
+
+    def __len__(self):
+        return len(self.indices)
+
+    def flatten(self):
+        ret = []
+        for _, v in self.batch_extra_fields.items():
+            if isinstance(v, (Boxes, Keypoints)):
+                ret.append(v.tensor)
+            else:
+                ret.append(v)
+        return ret
+
+    @staticmethod
+    def to_d2_instances_list(instances_list):
+        """
+        Convert InstancesList to List[Instances]. The input `instances_list` can
+        also be a List[Instances], in this case this method is a non-op.
+        """
+        if not isinstance(instances_list, InstancesList):
+            assert all(isinstance(x, Instances) for x in instances_list)
+            return instances_list
+
+        ret = []
+        for i, info in enumerate(instances_list.im_info):
+            instances = Instances(torch.Size([int(info[0].item()), int(info[1].item())]))
+
+            ids = instances_list.indices == i
+            for k, v in instances_list.batch_extra_fields.items():
+                if isinstance(v, torch.Tensor):
+                    instances.set(k, v[ids])
+                    continue
+                elif isinstance(v, Boxes):
+                    instances.set(k, v[ids, -4:])
+                    continue
+
+                target_type, tensor_source = v
+                assert isinstance(tensor_source, torch.Tensor)
+                assert tensor_source.shape[0] == instances_list.indices.shape[0]
+                tensor_source = tensor_source[ids]
+
+                if issubclass(target_type, Boxes):
+                    instances.set(k, Boxes(tensor_source[:, -4:]))
+                elif issubclass(target_type, Keypoints):
+                    instances.set(k, Keypoints(tensor_source))
+                elif issubclass(target_type, torch.Tensor):
+                    instances.set(k, tensor_source)
+                else:
+                    raise ValueError("Can't handle targe type: {}".format(target_type))
+
+            ret.append(instances)
+        return ret
+
+
+class Caffe2Compatible:
+    """
+    A model can inherit this class to indicate that it can be traced and deployed with caffe2.
+    """
+
+    def _get_tensor_mode(self):
+        return self._tensor_mode
+
+    def _set_tensor_mode(self, v):
+        self._tensor_mode = v
+
+    tensor_mode = property(_get_tensor_mode, _set_tensor_mode)
+    """
+    If true, the model expects C2-style tensor only inputs/outputs format.
+    """
+
+
+class Caffe2RPN(Caffe2Compatible, rpn.RPN):
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        ret = super(Caffe2Compatible, cls).from_config(cfg, input_shape)
+        assert tuple(cfg.MODEL.RPN.BBOX_REG_WEIGHTS) == (1.0, 1.0, 1.0, 1.0) or tuple(
+            cfg.MODEL.RPN.BBOX_REG_WEIGHTS
+        ) == (1.0, 1.0, 1.0, 1.0, 1.0)
+        return ret
+
+    def _generate_proposals(
+        self, images, objectness_logits_pred, anchor_deltas_pred, gt_instances=None
+    ):
+        assert isinstance(images, ImageList)
+        if self.tensor_mode:
+            im_info = images.image_sizes
+        else:
+            im_info = torch.tensor([[im_sz[0], im_sz[1], 1.0] for im_sz in images.image_sizes]).to(
+                images.tensor.device
+            )
+        assert isinstance(im_info, torch.Tensor)
+
+        rpn_rois_list = []
+        rpn_roi_probs_list = []
+        for scores, bbox_deltas, cell_anchors_tensor, feat_stride in zip(
+            objectness_logits_pred,
+            anchor_deltas_pred,
+            [b for (n, b) in self.anchor_generator.cell_anchors.named_buffers()],
+            self.anchor_generator.strides,
+        ):
+            scores = scores.detach()
+            bbox_deltas = bbox_deltas.detach()
+
+            rpn_rois, rpn_roi_probs = torch.ops._caffe2.GenerateProposals(
+                scores,
+                bbox_deltas,
+                im_info,
+                cell_anchors_tensor,
+                spatial_scale=1.0 / feat_stride,
+                pre_nms_topN=self.pre_nms_topk[self.training],
+                post_nms_topN=self.post_nms_topk[self.training],
+                nms_thresh=self.nms_thresh,
+                min_size=self.min_box_size,
+                # correct_transform_coords=True,  # deprecated argument
+                angle_bound_on=True,  # Default
+                angle_bound_lo=-180,
+                angle_bound_hi=180,
+                clip_angle_thresh=1.0,  # Default
+                legacy_plus_one=False,
+            )
+            rpn_rois_list.append(rpn_rois)
+            rpn_roi_probs_list.append(rpn_roi_probs)
+
+        # For FPN in D2, in RPN all proposals from different levels are concated
+        # together, ranked and picked by top post_nms_topk. Then in ROIPooler
+        # it calculates level_assignments and calls the RoIAlign from
+        # the corresponding level.
+
+        if len(objectness_logits_pred) == 1:
+            rpn_rois = rpn_rois_list[0]
+            rpn_roi_probs = rpn_roi_probs_list[0]
+        else:
+            assert len(rpn_rois_list) == len(rpn_roi_probs_list)
+            rpn_post_nms_topN = self.post_nms_topk[self.training]
+
+            device = rpn_rois_list[0].device
+            input_list = [to_device(x, "cpu") for x in (rpn_rois_list + rpn_roi_probs_list)]
+
+            # TODO remove this after confirming rpn_max_level/rpn_min_level
+            # is not needed in CollectRpnProposals.
+            feature_strides = list(self.anchor_generator.strides)
+            rpn_min_level = int(math.log2(feature_strides[0]))
+            rpn_max_level = int(math.log2(feature_strides[-1]))
+            assert (rpn_max_level - rpn_min_level + 1) == len(
+                rpn_rois_list
+            ), "CollectRpnProposals requires continuous levels"
+
+            rpn_rois = torch.ops._caffe2.CollectRpnProposals(
+                input_list,
+                # NOTE: in current implementation, rpn_max_level and rpn_min_level
+                # are not needed, only the subtraction of two matters and it
+                # can be infer from the number of inputs. Keep them now for
+                # consistency.
+                rpn_max_level=2 + len(rpn_rois_list) - 1,
+                rpn_min_level=2,
+                rpn_post_nms_topN=rpn_post_nms_topN,
+            )
+            rpn_rois = to_device(rpn_rois, device)
+            rpn_roi_probs = []
+
+        proposals = self.c2_postprocess(im_info, rpn_rois, rpn_roi_probs, self.tensor_mode)
+        return proposals, {}
+
+    def forward(self, images, features, gt_instances=None):
+        assert not self.training
+        features = [features[f] for f in self.in_features]
+        objectness_logits_pred, anchor_deltas_pred = self.rpn_head(features)
+        return self._generate_proposals(
+            images,
+            objectness_logits_pred,
+            anchor_deltas_pred,
+            gt_instances,
+        )
+
+    @staticmethod
+    def c2_postprocess(im_info, rpn_rois, rpn_roi_probs, tensor_mode):
+        proposals = InstancesList(
+            im_info=im_info,
+            indices=rpn_rois[:, 0],
+            extra_fields={
+                "proposal_boxes": Caffe2Boxes(rpn_rois),
+                "objectness_logits": (torch.Tensor, rpn_roi_probs),
+            },
+        )
+        if not tensor_mode:
+            proposals = InstancesList.to_d2_instances_list(proposals)
+        else:
+            proposals = [proposals]
+        return proposals
+
+
+class Caffe2ROIPooler(Caffe2Compatible, poolers.ROIPooler):
+    @staticmethod
+    def c2_preprocess(box_lists):
+        assert all(isinstance(x, Boxes) for x in box_lists)
+        if all(isinstance(x, Caffe2Boxes) for x in box_lists):
+            # input is pure-tensor based
+            assert len(box_lists) == 1
+            pooler_fmt_boxes = box_lists[0].tensor
+        else:
+            pooler_fmt_boxes = poolers.convert_boxes_to_pooler_format(box_lists)
+        return pooler_fmt_boxes
+
+    def forward(self, x, box_lists):
+        assert not self.training
+
+        pooler_fmt_boxes = self.c2_preprocess(box_lists)
+        num_level_assignments = len(self.level_poolers)
+
+        if num_level_assignments == 1:
+            if isinstance(self.level_poolers[0], ROIAlignRotated):
+                c2_roi_align = torch.ops._caffe2.RoIAlignRotated
+                aligned = True
+            else:
+                c2_roi_align = torch.ops._caffe2.RoIAlign
+                aligned = self.level_poolers[0].aligned
+
+            x0 = x[0]
+            if x0.is_quantized:
+                x0 = x0.dequantize()
+
+            out = c2_roi_align(
+                x0,
+                pooler_fmt_boxes,
+                order="NCHW",
+                spatial_scale=float(self.level_poolers[0].spatial_scale),
+                pooled_h=int(self.output_size[0]),
+                pooled_w=int(self.output_size[1]),
+                sampling_ratio=int(self.level_poolers[0].sampling_ratio),
+                aligned=aligned,
+            )
+            return out
+
+        device = pooler_fmt_boxes.device
+        assert (
+            self.max_level - self.min_level + 1 == 4
+        ), "Currently DistributeFpnProposals only support 4 levels"
+        fpn_outputs = torch.ops._caffe2.DistributeFpnProposals(
+            to_device(pooler_fmt_boxes, "cpu"),
+            roi_canonical_scale=self.canonical_box_size,
+            roi_canonical_level=self.canonical_level,
+            roi_max_level=self.max_level,
+            roi_min_level=self.min_level,
+            legacy_plus_one=False,
+        )
+        fpn_outputs = [to_device(x, device) for x in fpn_outputs]
+
+        rois_fpn_list = fpn_outputs[:-1]
+        rois_idx_restore_int32 = fpn_outputs[-1]
+
+        roi_feat_fpn_list = []
+        for roi_fpn, x_level, pooler in zip(rois_fpn_list, x, self.level_poolers):
+            if isinstance(pooler, ROIAlignRotated):
+                c2_roi_align = torch.ops._caffe2.RoIAlignRotated
+                aligned = True
+            else:
+                c2_roi_align = torch.ops._caffe2.RoIAlign
+                aligned = bool(pooler.aligned)
+
+            if x_level.is_quantized:
+                x_level = x_level.dequantize()
+
+            roi_feat_fpn = c2_roi_align(
+                x_level,
+                roi_fpn,
+                order="NCHW",
+                spatial_scale=float(pooler.spatial_scale),
+                pooled_h=int(self.output_size[0]),
+                pooled_w=int(self.output_size[1]),
+                sampling_ratio=int(pooler.sampling_ratio),
+                aligned=aligned,
+            )
+            roi_feat_fpn_list.append(roi_feat_fpn)
+
+        roi_feat_shuffled = cat(roi_feat_fpn_list, dim=0)
+        assert roi_feat_shuffled.numel() > 0 and rois_idx_restore_int32.numel() > 0, (
+            "Caffe2 export requires tracing with a model checkpoint + input that can produce valid"
+            " detections. But no detections were obtained with the given checkpoint and input!"
+        )
+        roi_feat = torch.ops._caffe2.BatchPermutation(roi_feat_shuffled, rois_idx_restore_int32)
+        return roi_feat
+
+
+def caffe2_fast_rcnn_outputs_inference(tensor_mode, box_predictor, predictions, proposals):
+    """equivalent to FastRCNNOutputLayers.inference"""
+    num_classes = box_predictor.num_classes
+    score_thresh = box_predictor.test_score_thresh
+    nms_thresh = box_predictor.test_nms_thresh
+    topk_per_image = box_predictor.test_topk_per_image
+    is_rotated = len(box_predictor.box2box_transform.weights) == 5
+
+    if is_rotated:
+        box_dim = 5
+        assert box_predictor.box2box_transform.weights[4] == 1, (
+            "The weights for Rotated BBoxTransform in C2 have only 4 dimensions,"
+            + " thus enforcing the angle weight to be 1 for now"
+        )
+        box2box_transform_weights = box_predictor.box2box_transform.weights[:4]
+    else:
+        box_dim = 4
+        box2box_transform_weights = box_predictor.box2box_transform.weights
+
+    class_logits, box_regression = predictions
+    if num_classes + 1 == class_logits.shape[1]:
+        class_prob = F.softmax(class_logits, -1)
+    else:
+        assert num_classes == class_logits.shape[1]
+        class_prob = F.sigmoid(class_logits)
+        # BoxWithNMSLimit will infer num_classes from the shape of the class_prob
+        # So append a zero column as placeholder for the background class
+        class_prob = torch.cat((class_prob, torch.zeros(class_prob.shape[0], 1)), dim=1)
+
+    assert box_regression.shape[1] % box_dim == 0
+    cls_agnostic_bbox_reg = box_regression.shape[1] // box_dim == 1
+
+    input_tensor_mode = proposals[0].proposal_boxes.tensor.shape[1] == box_dim + 1
+
+    proposal_boxes = proposals[0].proposal_boxes
+    if isinstance(proposal_boxes, Caffe2Boxes):
+        rois = Caffe2Boxes.cat([p.proposal_boxes for p in proposals])
+    elif isinstance(proposal_boxes, RotatedBoxes):
+        rois = RotatedBoxes.cat([p.proposal_boxes for p in proposals])
+    elif isinstance(proposal_boxes, Boxes):
+        rois = Boxes.cat([p.proposal_boxes for p in proposals])
+    else:
+        raise NotImplementedError(
+            'Expected proposals[0].proposal_boxes to be type "Boxes", '
+            f"instead got {type(proposal_boxes)}"
+        )
+
+    device, dtype = rois.tensor.device, rois.tensor.dtype
+    if input_tensor_mode:
+        im_info = proposals[0].image_size
+        rois = rois.tensor
+    else:
+        im_info = torch.tensor([[sz[0], sz[1], 1.0] for sz in [x.image_size for x in proposals]])
+        batch_ids = cat(
+            [
+                torch.full((b, 1), i, dtype=dtype, device=device)
+                for i, b in enumerate(len(p) for p in proposals)
+            ],
+            dim=0,
+        )
+        rois = torch.cat([batch_ids, rois.tensor], dim=1)
+
+    roi_pred_bbox, roi_batch_splits = torch.ops._caffe2.BBoxTransform(
+        to_device(rois, "cpu"),
+        to_device(box_regression, "cpu"),
+        to_device(im_info, "cpu"),
+        weights=box2box_transform_weights,
+        apply_scale=True,
+        rotated=is_rotated,
+        angle_bound_on=True,
+        angle_bound_lo=-180,
+        angle_bound_hi=180,
+        clip_angle_thresh=1.0,
+        legacy_plus_one=False,
+    )
+    roi_pred_bbox = to_device(roi_pred_bbox, device)
+    roi_batch_splits = to_device(roi_batch_splits, device)
+
+    nms_outputs = torch.ops._caffe2.BoxWithNMSLimit(
+        to_device(class_prob, "cpu"),
+        to_device(roi_pred_bbox, "cpu"),
+        to_device(roi_batch_splits, "cpu"),
+        score_thresh=float(score_thresh),
+        nms=float(nms_thresh),
+        detections_per_im=int(topk_per_image),
+        soft_nms_enabled=False,
+        soft_nms_method="linear",
+        soft_nms_sigma=0.5,
+        soft_nms_min_score_thres=0.001,
+        rotated=is_rotated,
+        cls_agnostic_bbox_reg=cls_agnostic_bbox_reg,
+        input_boxes_include_bg_cls=False,
+        output_classes_include_bg_cls=False,
+        legacy_plus_one=False,
+    )
+    roi_score_nms = to_device(nms_outputs[0], device)
+    roi_bbox_nms = to_device(nms_outputs[1], device)
+    roi_class_nms = to_device(nms_outputs[2], device)
+    roi_batch_splits_nms = to_device(nms_outputs[3], device)
+    roi_keeps_nms = to_device(nms_outputs[4], device)
+    roi_keeps_size_nms = to_device(nms_outputs[5], device)
+    if not tensor_mode:
+        roi_class_nms = roi_class_nms.to(torch.int64)
+
+    roi_batch_ids = cat(
+        [
+            torch.full((b, 1), i, dtype=dtype, device=device)
+            for i, b in enumerate(int(x.item()) for x in roi_batch_splits_nms)
+        ],
+        dim=0,
+    )
+
+    roi_class_nms = alias(roi_class_nms, "class_nms")
+    roi_score_nms = alias(roi_score_nms, "score_nms")
+    roi_bbox_nms = alias(roi_bbox_nms, "bbox_nms")
+    roi_batch_splits_nms = alias(roi_batch_splits_nms, "batch_splits_nms")
+    roi_keeps_nms = alias(roi_keeps_nms, "keeps_nms")
+    roi_keeps_size_nms = alias(roi_keeps_size_nms, "keeps_size_nms")
+
+    results = InstancesList(
+        im_info=im_info,
+        indices=roi_batch_ids[:, 0],
+        extra_fields={
+            "pred_boxes": Caffe2Boxes(roi_bbox_nms),
+            "scores": roi_score_nms,
+            "pred_classes": roi_class_nms,
+        },
+    )
+
+    if not tensor_mode:
+        results = InstancesList.to_d2_instances_list(results)
+        batch_splits = roi_batch_splits_nms.int().tolist()
+        kept_indices = list(roi_keeps_nms.to(torch.int64).split(batch_splits))
+    else:
+        results = [results]
+        kept_indices = [roi_keeps_nms]
+
+    return results, kept_indices
+
+
+class Caffe2FastRCNNOutputsInference:
+    def __init__(self, tensor_mode):
+        self.tensor_mode = tensor_mode  # whether the output is caffe2 tensor mode
+
+    def __call__(self, box_predictor, predictions, proposals):
+        return caffe2_fast_rcnn_outputs_inference(
+            self.tensor_mode, box_predictor, predictions, proposals
+        )
+
+
+def caffe2_mask_rcnn_inference(pred_mask_logits, pred_instances):
+    """equivalent to mask_head.mask_rcnn_inference"""
+    if all(isinstance(x, InstancesList) for x in pred_instances):
+        assert len(pred_instances) == 1
+        mask_probs_pred = pred_mask_logits.sigmoid()
+        mask_probs_pred = alias(mask_probs_pred, "mask_fcn_probs")
+        pred_instances[0].set("pred_masks", mask_probs_pred)
+    else:
+        mask_rcnn_inference(pred_mask_logits, pred_instances)
+
+
+class Caffe2MaskRCNNInference:
+    def __call__(self, pred_mask_logits, pred_instances):
+        return caffe2_mask_rcnn_inference(pred_mask_logits, pred_instances)
+
+
+def caffe2_keypoint_rcnn_inference(use_heatmap_max_keypoint, pred_keypoint_logits, pred_instances):
+    # just return the keypoint heatmap for now,
+    # there will be option to call HeatmapMaxKeypointOp
+    output = alias(pred_keypoint_logits, "kps_score")
+    if all(isinstance(x, InstancesList) for x in pred_instances):
+        assert len(pred_instances) == 1
+        if use_heatmap_max_keypoint:
+            device = output.device
+            output = torch.ops._caffe2.HeatmapMaxKeypoint(
+                to_device(output, "cpu"),
+                pred_instances[0].pred_boxes.tensor,
+                should_output_softmax=True,  # worth make it configerable?
+            )
+            output = to_device(output, device)
+            output = alias(output, "keypoints_out")
+        pred_instances[0].set("pred_keypoints", output)
+    return pred_keypoint_logits
+
+
+class Caffe2KeypointRCNNInference:
+    def __init__(self, use_heatmap_max_keypoint):
+        self.use_heatmap_max_keypoint = use_heatmap_max_keypoint
+
+    def __call__(self, pred_keypoint_logits, pred_instances):
+        return caffe2_keypoint_rcnn_inference(
+            self.use_heatmap_max_keypoint, pred_keypoint_logits, pred_instances
+        )
diff --git a/detectron2/export/caffe2_export.py b/detectron2/export/caffe2_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..d609c27c7deb396352967dbcbc79b1e00f2a2de1
--- /dev/null
+++ b/detectron2/export/caffe2_export.py
@@ -0,0 +1,203 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import copy
+import io
+import logging
+import numpy as np
+from typing import List
+import onnx
+import onnx.optimizer
+import torch
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core
+from caffe2.python.onnx.backend import Caffe2Backend
+from tabulate import tabulate
+from termcolor import colored
+from torch.onnx import OperatorExportTypes
+
+from .shared import (
+    ScopedWS,
+    construct_init_net_from_params,
+    fuse_alias_placeholder,
+    fuse_copy_between_cpu_and_gpu,
+    get_params_from_init_net,
+    group_norm_replace_aten_with_caffe2,
+    infer_device_type,
+    remove_dead_end_ops,
+    remove_reshape_for_fc,
+    save_graph,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def export_onnx_model(model, inputs):
+    """
+    Trace and export a model to onnx format.
+
+    Args:
+        model (nn.Module):
+        inputs (tuple[args]): the model will be called by `model(*inputs)`
+
+    Returns:
+        an onnx model
+    """
+    assert isinstance(model, torch.nn.Module)
+
+    # make sure all modules are in eval mode, onnx may change the training state
+    # of the module if the states are not consistent
+    def _check_eval(module):
+        assert not module.training
+
+    model.apply(_check_eval)
+
+    # Export the model to ONNX
+    with torch.no_grad():
+        with io.BytesIO() as f:
+            torch.onnx.export(
+                model,
+                inputs,
+                f,
+                operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
+                # verbose=True,  # NOTE: uncomment this for debugging
+                # export_params=True,
+            )
+            onnx_model = onnx.load_from_string(f.getvalue())
+
+    return onnx_model
+
+
+def _op_stats(net_def):
+    type_count = {}
+    for t in [op.type for op in net_def.op]:
+        type_count[t] = type_count.get(t, 0) + 1
+    type_count_list = sorted(type_count.items(), key=lambda kv: kv[0])  # alphabet
+    type_count_list = sorted(type_count_list, key=lambda kv: -kv[1])  # count
+    return "\n".join("{:>4}x {}".format(count, name) for name, count in type_count_list)
+
+
+def _assign_device_option(
+    predict_net: caffe2_pb2.NetDef, init_net: caffe2_pb2.NetDef, tensor_inputs: List[torch.Tensor]
+):
+    """
+    ONNX exported network doesn't have concept of device, assign necessary
+    device option for each op in order to make it runable on GPU runtime.
+    """
+
+    def _get_device_type(torch_tensor):
+        assert torch_tensor.device.type in ["cpu", "cuda"]
+        assert torch_tensor.device.index == 0
+        return torch_tensor.device.type
+
+    def _assign_op_device_option(net_proto, net_ssa, blob_device_types):
+        for op, ssa_i in zip(net_proto.op, net_ssa):
+            if op.type in ["CopyCPUToGPU", "CopyGPUToCPU"]:
+                op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
+            else:
+                devices = [blob_device_types[b] for b in ssa_i[0] + ssa_i[1]]
+                assert all(d == devices[0] for d in devices)
+                if devices[0] == "cuda":
+                    op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
+
+    # update ops in predict_net
+    predict_net_input_device_types = {
+        (name, 0): _get_device_type(tensor)
+        for name, tensor in zip(predict_net.external_input, tensor_inputs)
+    }
+    predict_net_device_types = infer_device_type(
+        predict_net, known_status=predict_net_input_device_types, device_name_style="pytorch"
+    )
+    predict_net_ssa, _ = core.get_ssa(predict_net)
+    _assign_op_device_option(predict_net, predict_net_ssa, predict_net_device_types)
+
+    # update ops in init_net
+    init_net_ssa, versions = core.get_ssa(init_net)
+    init_net_output_device_types = {
+        (name, versions[name]): predict_net_device_types[(name, 0)]
+        for name in init_net.external_output
+    }
+    init_net_device_types = infer_device_type(
+        init_net, known_status=init_net_output_device_types, device_name_style="pytorch"
+    )
+    _assign_op_device_option(init_net, init_net_ssa, init_net_device_types)
+
+
+def export_caffe2_detection_model(model: torch.nn.Module, tensor_inputs: List[torch.Tensor]):
+    """
+    Export a caffe2-compatible Detectron2 model to caffe2 format via ONNX.
+
+    Arg:
+        model: a caffe2-compatible version of detectron2 model, defined in caffe2_modeling.py
+        tensor_inputs: a list of tensors that caffe2 model takes as input.
+    """
+    model = copy.deepcopy(model)
+    assert isinstance(model, torch.nn.Module)
+    assert hasattr(model, "encode_additional_info")
+
+    # Export via ONNX
+    logger.info(
+        "Exporting a {} model via ONNX ...".format(type(model).__name__)
+        + " Some warnings from ONNX are expected and are usually not to worry about."
+    )
+    onnx_model = export_onnx_model(model, (tensor_inputs,))
+    # Convert ONNX model to Caffe2 protobuf
+    init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model)
+    ops_table = [[op.type, op.input, op.output] for op in predict_net.op]
+    table = tabulate(ops_table, headers=["type", "input", "output"], tablefmt="pipe")
+    logger.info(
+        "ONNX export Done. Exported predict_net (before optimizations):\n" + colored(table, "cyan")
+    )
+
+    # Apply protobuf optimization
+    fuse_alias_placeholder(predict_net, init_net)
+    if any(t.device.type != "cpu" for t in tensor_inputs):
+        fuse_copy_between_cpu_and_gpu(predict_net)
+        remove_dead_end_ops(init_net)
+        _assign_device_option(predict_net, init_net, tensor_inputs)
+    params, device_options = get_params_from_init_net(init_net)
+    predict_net, params = remove_reshape_for_fc(predict_net, params)
+    init_net = construct_init_net_from_params(params, device_options)
+    group_norm_replace_aten_with_caffe2(predict_net)
+
+    # Record necessary information for running the pb model in Detectron2 system.
+    model.encode_additional_info(predict_net, init_net)
+
+    logger.info("Operators used in predict_net: \n{}".format(_op_stats(predict_net)))
+    logger.info("Operators used in init_net: \n{}".format(_op_stats(init_net)))
+
+    return predict_net, init_net
+
+
+def run_and_save_graph(predict_net, init_net, tensor_inputs, graph_save_path):
+    """
+    Run the caffe2 model on given inputs, recording the shape and draw the graph.
+
+    predict_net/init_net: caffe2 model.
+    tensor_inputs: a list of tensors that caffe2 model takes as input.
+    graph_save_path: path for saving graph of exported model.
+    """
+
+    logger.info("Saving graph of ONNX exported model to {} ...".format(graph_save_path))
+    save_graph(predict_net, graph_save_path, op_only=False)
+
+    # Run the exported Caffe2 net
+    logger.info("Running ONNX exported model ...")
+    with ScopedWS("__ws_tmp__", True) as ws:
+        ws.RunNetOnce(init_net)
+        initialized_blobs = set(ws.Blobs())
+        uninitialized = [inp for inp in predict_net.external_input if inp not in initialized_blobs]
+        for name, blob in zip(uninitialized, tensor_inputs):
+            ws.FeedBlob(name, blob)
+
+        try:
+            ws.RunNetOnce(predict_net)
+        except RuntimeError as e:
+            logger.warning("Encountered RuntimeError: \n{}".format(str(e)))
+
+        ws_blobs = {b: ws.FetchBlob(b) for b in ws.Blobs()}
+        blob_sizes = {b: ws_blobs[b].shape for b in ws_blobs if isinstance(ws_blobs[b], np.ndarray)}
+
+        logger.info("Saving graph with blob shapes to {} ...".format(graph_save_path))
+        save_graph(predict_net, graph_save_path, op_only=False, blob_sizes=blob_sizes)
+
+        return ws_blobs
diff --git a/detectron2/export/caffe2_inference.py b/detectron2/export/caffe2_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..deb886c0417285ed1d5ad85eb941fa1ac757cdab
--- /dev/null
+++ b/detectron2/export/caffe2_inference.py
@@ -0,0 +1,161 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import logging
+import numpy as np
+from itertools import count
+import torch
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core
+
+from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
+from .shared import ScopedWS, get_pb_arg_vali, get_pb_arg_vals, infer_device_type
+
+logger = logging.getLogger(__name__)
+
+
+# ===== ref: mobile-vision predictor's 'Caffe2Wrapper' class ======
+class ProtobufModel(torch.nn.Module):
+    """
+    Wrapper of a caffe2's protobuf model.
+    It works just like nn.Module, but running caffe2 under the hood.
+    Input/Output are tuple[tensor] that match the caffe2 net's external_input/output.
+    """
+
+    _ids = count(0)
+
+    def __init__(self, predict_net, init_net):
+        logger.info(f"Initializing ProtobufModel for: {predict_net.name} ...")
+        super().__init__()
+        assert isinstance(predict_net, caffe2_pb2.NetDef)
+        assert isinstance(init_net, caffe2_pb2.NetDef)
+        # create unique temporary workspace for each instance
+        self.ws_name = "__tmp_ProtobufModel_{}__".format(next(self._ids))
+        self.net = core.Net(predict_net)
+
+        logger.info("Running init_net once to fill the parameters ...")
+        with ScopedWS(self.ws_name, is_reset=True, is_cleanup=False) as ws:
+            ws.RunNetOnce(init_net)
+            uninitialized_external_input = []
+            for blob in self.net.Proto().external_input:
+                if blob not in ws.Blobs():
+                    uninitialized_external_input.append(blob)
+                    ws.CreateBlob(blob)
+            ws.CreateNet(self.net)
+
+        self._error_msgs = set()
+        self._input_blobs = uninitialized_external_input
+
+    def _infer_output_devices(self, inputs):
+        """
+        Returns:
+            list[str]: list of device for each external output
+        """
+
+        def _get_device_type(torch_tensor):
+            assert torch_tensor.device.type in ["cpu", "cuda"]
+            assert torch_tensor.device.index == 0
+            return torch_tensor.device.type
+
+        predict_net = self.net.Proto()
+        input_device_types = {
+            (name, 0): _get_device_type(tensor) for name, tensor in zip(self._input_blobs, inputs)
+        }
+        device_type_map = infer_device_type(
+            predict_net, known_status=input_device_types, device_name_style="pytorch"
+        )
+        ssa, versions = core.get_ssa(predict_net)
+        versioned_outputs = [(name, versions[name]) for name in predict_net.external_output]
+        output_devices = [device_type_map[outp] for outp in versioned_outputs]
+        return output_devices
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (tuple[torch.Tensor])
+
+        Returns:
+            tuple[torch.Tensor]
+        """
+        assert len(inputs) == len(self._input_blobs), (
+            f"Length of inputs ({len(inputs)}) "
+            f"doesn't match the required input blobs: {self._input_blobs}"
+        )
+
+        with ScopedWS(self.ws_name, is_reset=False, is_cleanup=False) as ws:
+            for b, tensor in zip(self._input_blobs, inputs):
+                ws.FeedBlob(b, tensor)
+
+            try:
+                ws.RunNet(self.net.Proto().name)
+            except RuntimeError as e:
+                if not str(e) in self._error_msgs:
+                    self._error_msgs.add(str(e))
+                    logger.warning("Encountered new RuntimeError: \n{}".format(str(e)))
+                logger.warning("Catch the error and use partial results.")
+
+            c2_outputs = [ws.FetchBlob(b) for b in self.net.Proto().external_output]
+            # Remove outputs of current run, this is necessary in order to
+            # prevent fetching the result from previous run if the model fails
+            # in the middle.
+            for b in self.net.Proto().external_output:
+                # Needs to create uninitialized blob to make the net runable.
+                # This is "equivalent" to: ws.RemoveBlob(b) then ws.CreateBlob(b),
+                # but there'no such API.
+                ws.FeedBlob(b, f"{b}, a C++ native class of type nullptr (uninitialized).")
+
+        # Cast output to torch.Tensor on the desired device
+        output_devices = (
+            self._infer_output_devices(inputs)
+            if any(t.device.type != "cpu" for t in inputs)
+            else ["cpu" for _ in self.net.Proto().external_output]
+        )
+
+        outputs = []
+        for name, c2_output, device in zip(
+            self.net.Proto().external_output, c2_outputs, output_devices
+        ):
+            if not isinstance(c2_output, np.ndarray):
+                raise RuntimeError(
+                    "Invalid output for blob {}, received: {}".format(name, c2_output)
+                )
+            outputs.append(torch.tensor(c2_output).to(device=device))
+        return tuple(outputs)
+
+
+class ProtobufDetectionModel(torch.nn.Module):
+    """
+    A class works just like a pytorch meta arch in terms of inference, but running
+    caffe2 model under the hood.
+    """
+
+    def __init__(self, predict_net, init_net, *, convert_outputs=None):
+        """
+        Args:
+            predict_net, init_net (core.Net): caffe2 nets
+            convert_outptus (callable): a function that converts caffe2
+                outputs to the same format of the original pytorch model.
+                By default, use the one defined in the caffe2 meta_arch.
+        """
+        super().__init__()
+        self.protobuf_model = ProtobufModel(predict_net, init_net)
+        self.size_divisibility = get_pb_arg_vali(predict_net, "size_divisibility", 0)
+        self.device = get_pb_arg_vals(predict_net, "device", b"cpu").decode("ascii")
+
+        if convert_outputs is None:
+            meta_arch = get_pb_arg_vals(predict_net, "meta_architecture", b"GeneralizedRCNN")
+            meta_arch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[meta_arch.decode("ascii")]
+            self._convert_outputs = meta_arch.get_outputs_converter(predict_net, init_net)
+        else:
+            self._convert_outputs = convert_outputs
+
+    def _convert_inputs(self, batched_inputs):
+        # currently all models convert inputs in the same way
+        return convert_batched_inputs_to_c2_format(
+            batched_inputs, self.size_divisibility, self.device
+        )
+
+    def forward(self, batched_inputs):
+        c2_inputs = self._convert_inputs(batched_inputs)
+        c2_results = self.protobuf_model(c2_inputs)
+        c2_results = dict(zip(self.protobuf_model.net.Proto().external_output, c2_results))
+        return self._convert_outputs(batched_inputs, c2_inputs, c2_results)
diff --git a/detectron2/export/caffe2_modeling.py b/detectron2/export/caffe2_modeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e675c45d62f7b363a298099cd520c417832d58c
--- /dev/null
+++ b/detectron2/export/caffe2_modeling.py
@@ -0,0 +1,420 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import functools
+import io
+import struct
+import types
+import torch
+
+from detectron2.modeling import meta_arch
+from detectron2.modeling.box_regression import Box2BoxTransform
+from detectron2.modeling.roi_heads import keypoint_head
+from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
+
+from .c10 import Caffe2Compatible
+from .caffe2_patch import ROIHeadsPatcher, patch_generalized_rcnn
+from .shared import (
+    alias,
+    check_set_pb_arg,
+    get_pb_arg_floats,
+    get_pb_arg_valf,
+    get_pb_arg_vali,
+    get_pb_arg_vals,
+    mock_torch_nn_functional_interpolate,
+)
+
+
+def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
+    """
+    A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor])
+    to detectron2's format (i.e. list of Instances instance).
+    This only works when the model follows the Caffe2 detectron's naming convention.
+
+    Args:
+        image_sizes (List[List[int, int]]): [H, W] of every image.
+        tensor_outputs (Dict[str, Tensor]): external_output to its tensor.
+
+        force_mask_on (Bool): if true, the it make sure there'll be pred_masks even
+            if the mask is not found from tensor_outputs (usually due to model crash)
+    """
+
+    results = [Instances(image_size) for image_size in image_sizes]
+
+    batch_splits = tensor_outputs.get("batch_splits", None)
+    if batch_splits:
+        raise NotImplementedError()
+    assert len(image_sizes) == 1
+    result = results[0]
+
+    bbox_nms = tensor_outputs["bbox_nms"]
+    score_nms = tensor_outputs["score_nms"]
+    class_nms = tensor_outputs["class_nms"]
+    # Detection will always success because Conv support 0-batch
+    assert bbox_nms is not None
+    assert score_nms is not None
+    assert class_nms is not None
+    if bbox_nms.shape[1] == 5:
+        result.pred_boxes = RotatedBoxes(bbox_nms)
+    else:
+        result.pred_boxes = Boxes(bbox_nms)
+    result.scores = score_nms
+    result.pred_classes = class_nms.to(torch.int64)
+
+    mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
+    if mask_fcn_probs is not None:
+        # finish the mask pred
+        mask_probs_pred = mask_fcn_probs
+        num_masks = mask_probs_pred.shape[0]
+        class_pred = result.pred_classes
+        indices = torch.arange(num_masks, device=class_pred.device)
+        mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
+        result.pred_masks = mask_probs_pred
+    elif force_mask_on:
+        # NOTE: there's no way to know the height/width of mask here, it won't be
+        # used anyway when batch size is 0, so just set them to 0.
+        result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)
+
+    keypoints_out = tensor_outputs.get("keypoints_out", None)
+    kps_score = tensor_outputs.get("kps_score", None)
+    if keypoints_out is not None:
+        # keypoints_out: [N, 4, #kypoints], where 4 is in order of (x, y, score, prob)
+        keypoints_tensor = keypoints_out
+        # NOTE: it's possible that prob is not calculated if "should_output_softmax"
+        # is set to False in HeatmapMaxKeypoint, so just using raw score, seems
+        # it doesn't affect mAP. TODO: check more carefully.
+        keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
+        result.pred_keypoints = keypoint_xyp
+    elif kps_score is not None:
+        # keypoint heatmap to sparse data structure
+        pred_keypoint_logits = kps_score
+        keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])
+
+    return results
+
+
+def _cast_to_f32(f64):
+    return struct.unpack("f", struct.pack("f", f64))[0]
+
+
+def set_caffe2_compatible_tensor_mode(model, enable=True):
+    def _fn(m):
+        if isinstance(m, Caffe2Compatible):
+            m.tensor_mode = enable
+
+    model.apply(_fn)
+
+
+def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device):
+    """
+    See get_caffe2_inputs() below.
+    """
+    assert all(isinstance(x, dict) for x in batched_inputs)
+    assert all(x["image"].dim() == 3 for x in batched_inputs)
+
+    images = [x["image"] for x in batched_inputs]
+    images = ImageList.from_tensors(images, size_divisibility)
+
+    im_info = []
+    for input_per_image, image_size in zip(batched_inputs, images.image_sizes):
+        target_height = input_per_image.get("height", image_size[0])
+        target_width = input_per_image.get("width", image_size[1])  # noqa
+        # NOTE: The scale inside im_info is kept as convention and for providing
+        # post-processing information if further processing is needed. For
+        # current Caffe2 model definitions that don't include post-processing inside
+        # the model, this number is not used.
+        # NOTE: There can be a slight difference between width and height
+        # scales, using a single number can results in numerical difference
+        # compared with D2's post-processing.
+        scale = target_height / image_size[0]
+        im_info.append([image_size[0], image_size[1], scale])
+    im_info = torch.Tensor(im_info)
+
+    return images.tensor.to(device), im_info.to(device)
+
+
+class Caffe2MetaArch(Caffe2Compatible, torch.nn.Module):
+    """
+    Base class for caffe2-compatible implementation of a meta architecture.
+    The forward is traceable and its traced graph can be converted to caffe2
+    graph through ONNX.
+    """
+
+    def __init__(self, cfg, torch_model, enable_tensor_mode=True):
+        """
+        Args:
+            cfg (CfgNode):
+            torch_model (nn.Module): the detectron2 model (meta_arch) to be
+                converted.
+        """
+        super().__init__()
+        self._wrapped_model = torch_model
+        self.eval()
+        set_caffe2_compatible_tensor_mode(self, enable_tensor_mode)
+
+    def get_caffe2_inputs(self, batched_inputs):
+        """
+        Convert pytorch-style structured inputs to caffe2-style inputs that
+        are tuples of tensors.
+
+        Args:
+            batched_inputs (list[dict]): inputs to a detectron2 model
+                in its standard format. Each dict has "image" (CHW tensor), and optionally
+                "height" and "width".
+
+        Returns:
+            tuple[Tensor]:
+                tuple of tensors that will be the inputs to the
+                :meth:`forward` method. For existing models, the first
+                is an NCHW tensor (padded and batched); the second is
+                a im_info Nx3 tensor, where the rows are
+                (height, width, unused legacy parameter)
+        """
+        return convert_batched_inputs_to_c2_format(
+            batched_inputs,
+            self._wrapped_model.backbone.size_divisibility,
+            self._wrapped_model.device,
+        )
+
+    def encode_additional_info(self, predict_net, init_net):
+        """
+        Save extra metadata that will be used by inference in the output protobuf.
+        """
+        pass
+
+    def forward(self, inputs):
+        """
+        Run the forward in caffe2-style. It has to use caffe2-compatible ops
+        and the method will be used for tracing.
+
+        Args:
+            inputs (tuple[Tensor]): inputs defined by :meth:`get_caffe2_input`.
+                They will be the inputs of the converted caffe2 graph.
+
+        Returns:
+            tuple[Tensor]: output tensors. They will be the outputs of the
+                converted caffe2 graph.
+        """
+        raise NotImplementedError
+
+    def _caffe2_preprocess_image(self, inputs):
+        """
+        Caffe2 implementation of preprocess_image, which is called inside each MetaArch's forward.
+        It normalizes the input images, and the final caffe2 graph assumes the
+        inputs have been batched already.
+        """
+        data, im_info = inputs
+        data = alias(data, "data")
+        im_info = alias(im_info, "im_info")
+        mean, std = self._wrapped_model.pixel_mean, self._wrapped_model.pixel_std
+        normalized_data = (data - mean) / std
+        normalized_data = alias(normalized_data, "normalized_data")
+
+        # Pack (data, im_info) into ImageList which is recognized by self.inference.
+        images = ImageList(tensor=normalized_data, image_sizes=im_info)
+        return images
+
+    @staticmethod
+    def get_outputs_converter(predict_net, init_net):
+        """
+        Creates a function that converts outputs of the caffe2 model to
+        detectron2's standard format.
+        The function uses information in `predict_net` and `init_net` that are
+        available at inferene time. Therefore the function logic can be used in inference.
+
+        The returned function has the following signature:
+
+            def convert(batched_inputs, c2_inputs, c2_results) -> detectron2_outputs
+
+        Where
+
+            * batched_inputs (list[dict]): the original input format of the meta arch
+            * c2_inputs (tuple[Tensor]): the caffe2 inputs.
+            * c2_results (dict[str, Tensor]): the caffe2 output format,
+                corresponding to the outputs of the :meth:`forward` function.
+            * detectron2_outputs: the original output format of the meta arch.
+
+        This function can be used to compare the outputs of the original meta arch and
+        the converted caffe2 graph.
+
+        Returns:
+            callable: a callable of the above signature.
+        """
+        raise NotImplementedError
+
+
+class Caffe2GeneralizedRCNN(Caffe2MetaArch):
+    def __init__(self, cfg, torch_model, enable_tensor_mode=True):
+        assert isinstance(torch_model, meta_arch.GeneralizedRCNN)
+        torch_model = patch_generalized_rcnn(torch_model)
+        super().__init__(cfg, torch_model, enable_tensor_mode)
+
+        try:
+            use_heatmap_max_keypoint = cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT
+        except AttributeError:
+            use_heatmap_max_keypoint = False
+        self.roi_heads_patcher = ROIHeadsPatcher(
+            self._wrapped_model.roi_heads, use_heatmap_max_keypoint
+        )
+        if self.tensor_mode:
+            self.roi_heads_patcher.patch_roi_heads()
+
+    def encode_additional_info(self, predict_net, init_net):
+        size_divisibility = self._wrapped_model.backbone.size_divisibility
+        check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
+        check_set_pb_arg(
+            predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
+        )
+        check_set_pb_arg(predict_net, "meta_architecture", "s", b"GeneralizedRCNN")
+
+    @mock_torch_nn_functional_interpolate()
+    def forward(self, inputs):
+        if not self.tensor_mode:
+            return self._wrapped_model.inference(inputs)
+        images = self._caffe2_preprocess_image(inputs)
+        features = self._wrapped_model.backbone(images.tensor)
+        proposals, _ = self._wrapped_model.proposal_generator(images, features)
+        detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals)
+        return tuple(detector_results[0].flatten())
+
+    @staticmethod
+    def get_outputs_converter(predict_net, init_net):
+        def f(batched_inputs, c2_inputs, c2_results):
+            _, im_info = c2_inputs
+            image_sizes = [[int(im[0]), int(im[1])] for im in im_info]
+            results = assemble_rcnn_outputs_by_name(image_sizes, c2_results)
+            return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
+
+        return f
+
+
+class Caffe2RetinaNet(Caffe2MetaArch):
+    def __init__(self, cfg, torch_model):
+        assert isinstance(torch_model, meta_arch.RetinaNet)
+        super().__init__(cfg, torch_model)
+
+    @mock_torch_nn_functional_interpolate()
+    def forward(self, inputs):
+        assert self.tensor_mode
+        images = self._caffe2_preprocess_image(inputs)
+
+        # explicitly return the images sizes to avoid removing "im_info" by ONNX
+        # since it's not used in the forward path
+        return_tensors = [images.image_sizes]
+
+        features = self._wrapped_model.backbone(images.tensor)
+        features = [features[f] for f in self._wrapped_model.head_in_features]
+        for i, feature_i in enumerate(features):
+            features[i] = alias(feature_i, "feature_{}".format(i), is_backward=True)
+            return_tensors.append(features[i])
+
+        pred_logits, pred_anchor_deltas = self._wrapped_model.head(features)
+        for i, (box_cls_i, box_delta_i) in enumerate(zip(pred_logits, pred_anchor_deltas)):
+            return_tensors.append(alias(box_cls_i, "box_cls_{}".format(i)))
+            return_tensors.append(alias(box_delta_i, "box_delta_{}".format(i)))
+
+        return tuple(return_tensors)
+
+    def encode_additional_info(self, predict_net, init_net):
+        size_divisibility = self._wrapped_model.backbone.size_divisibility
+        check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
+        check_set_pb_arg(
+            predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
+        )
+        check_set_pb_arg(predict_net, "meta_architecture", "s", b"RetinaNet")
+
+        # Inference parameters:
+        check_set_pb_arg(
+            predict_net, "score_threshold", "f", _cast_to_f32(self._wrapped_model.test_score_thresh)
+        )
+        check_set_pb_arg(
+            predict_net, "topk_candidates", "i", self._wrapped_model.test_topk_candidates
+        )
+        check_set_pb_arg(
+            predict_net, "nms_threshold", "f", _cast_to_f32(self._wrapped_model.test_nms_thresh)
+        )
+        check_set_pb_arg(
+            predict_net,
+            "max_detections_per_image",
+            "i",
+            self._wrapped_model.max_detections_per_image,
+        )
+
+        check_set_pb_arg(
+            predict_net,
+            "bbox_reg_weights",
+            "floats",
+            [_cast_to_f32(w) for w in self._wrapped_model.box2box_transform.weights],
+        )
+        self._encode_anchor_generator_cfg(predict_net)
+
+    def _encode_anchor_generator_cfg(self, predict_net):
+        # serialize anchor_generator for future use
+        serialized_anchor_generator = io.BytesIO()
+        torch.save(self._wrapped_model.anchor_generator, serialized_anchor_generator)
+        # Ideally we can put anchor generating inside the model, then we don't
+        # need to store this information.
+        bytes = serialized_anchor_generator.getvalue()
+        check_set_pb_arg(predict_net, "serialized_anchor_generator", "s", bytes)
+
+    @staticmethod
+    def get_outputs_converter(predict_net, init_net):
+        self = types.SimpleNamespace()
+        serialized_anchor_generator = io.BytesIO(
+            get_pb_arg_vals(predict_net, "serialized_anchor_generator", None)
+        )
+        self.anchor_generator = torch.load(serialized_anchor_generator)
+        bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None)
+        self.box2box_transform = Box2BoxTransform(weights=tuple(bbox_reg_weights))
+        self.test_score_thresh = get_pb_arg_valf(predict_net, "score_threshold", None)
+        self.test_topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None)
+        self.test_nms_thresh = get_pb_arg_valf(predict_net, "nms_threshold", None)
+        self.max_detections_per_image = get_pb_arg_vali(
+            predict_net, "max_detections_per_image", None
+        )
+
+        # hack to reuse inference code from RetinaNet
+        for meth in [
+            "forward_inference",
+            "inference_single_image",
+            "_transpose_dense_predictions",
+            "_decode_multi_level_predictions",
+            "_decode_per_level_predictions",
+        ]:
+            setattr(self, meth, functools.partial(getattr(meta_arch.RetinaNet, meth), self))
+
+        def f(batched_inputs, c2_inputs, c2_results):
+            _, im_info = c2_inputs
+            image_sizes = [[int(im[0]), int(im[1])] for im in im_info]
+            dummy_images = ImageList(
+                torch.randn(
+                    (
+                        len(im_info),
+                        3,
+                    )
+                    + tuple(image_sizes[0])
+                ),
+                image_sizes,
+            )
+
+            num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")])
+            pred_logits = [c2_results["box_cls_{}".format(i)] for i in range(num_features)]
+            pred_anchor_deltas = [c2_results["box_delta_{}".format(i)] for i in range(num_features)]
+
+            # For each feature level, feature should have the same batch size and
+            # spatial dimension as the box_cls and box_delta.
+            dummy_features = [x.clone()[:, 0:0, :, :] for x in pred_logits]
+            # self.num_classess can be inferred
+            self.num_classes = pred_logits[0].shape[1] // (pred_anchor_deltas[0].shape[1] // 4)
+
+            results = self.forward_inference(
+                dummy_images, dummy_features, [pred_logits, pred_anchor_deltas]
+            )
+            return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
+
+        return f
+
+
+META_ARCH_CAFFE2_EXPORT_TYPE_MAP = {
+    "GeneralizedRCNN": Caffe2GeneralizedRCNN,
+    "RetinaNet": Caffe2RetinaNet,
+}
diff --git a/detectron2/export/caffe2_patch.py b/detectron2/export/caffe2_patch.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ddc2c1c6c5cff3e70df9b6001fcf43aae1d732d
--- /dev/null
+++ b/detectron2/export/caffe2_patch.py
@@ -0,0 +1,189 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import contextlib
+from unittest import mock
+import torch
+
+from detectron2.modeling import poolers
+from detectron2.modeling.proposal_generator import rpn
+from detectron2.modeling.roi_heads import keypoint_head, mask_head
+from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
+
+from .c10 import (
+    Caffe2Compatible,
+    Caffe2FastRCNNOutputsInference,
+    Caffe2KeypointRCNNInference,
+    Caffe2MaskRCNNInference,
+    Caffe2ROIPooler,
+    Caffe2RPN,
+    caffe2_fast_rcnn_outputs_inference,
+    caffe2_keypoint_rcnn_inference,
+    caffe2_mask_rcnn_inference,
+)
+
+
+class GenericMixin:
+    pass
+
+
+class Caffe2CompatibleConverter:
+    """
+    A GenericUpdater which implements the `create_from` interface, by modifying
+    module object and assign it with another class replaceCls.
+    """
+
+    def __init__(self, replaceCls):
+        self.replaceCls = replaceCls
+
+    def create_from(self, module):
+        # update module's class to the new class
+        assert isinstance(module, torch.nn.Module)
+        if issubclass(self.replaceCls, GenericMixin):
+            # replaceCls should act as mixin, create a new class on-the-fly
+            new_class = type(
+                "{}MixedWith{}".format(self.replaceCls.__name__, module.__class__.__name__),
+                (self.replaceCls, module.__class__),
+                {},  # {"new_method": lambda self: ...},
+            )
+            module.__class__ = new_class
+        else:
+            # replaceCls is complete class, this allow arbitrary class swap
+            module.__class__ = self.replaceCls
+
+        # initialize Caffe2Compatible
+        if isinstance(module, Caffe2Compatible):
+            module.tensor_mode = False
+
+        return module
+
+
+def patch(model, target, updater, *args, **kwargs):
+    """
+    recursively (post-order) update all modules with the target type and its
+    subclasses, make a initialization/composition/inheritance/... via the
+    updater.create_from.
+    """
+    for name, module in model.named_children():
+        model._modules[name] = patch(module, target, updater, *args, **kwargs)
+    if isinstance(model, target):
+        return updater.create_from(model, *args, **kwargs)
+    return model
+
+
+def patch_generalized_rcnn(model):
+    ccc = Caffe2CompatibleConverter
+    model = patch(model, rpn.RPN, ccc(Caffe2RPN))
+    model = patch(model, poolers.ROIPooler, ccc(Caffe2ROIPooler))
+
+    return model
+
+
+@contextlib.contextmanager
+def mock_fastrcnn_outputs_inference(
+    tensor_mode, check=True, box_predictor_type=FastRCNNOutputLayers
+):
+    with mock.patch.object(
+        box_predictor_type,
+        "inference",
+        autospec=True,
+        side_effect=Caffe2FastRCNNOutputsInference(tensor_mode),
+    ) as mocked_func:
+        yield
+    if check:
+        assert mocked_func.call_count > 0
+
+
+@contextlib.contextmanager
+def mock_mask_rcnn_inference(tensor_mode, patched_module, check=True):
+    with mock.patch(
+        "{}.mask_rcnn_inference".format(patched_module), side_effect=Caffe2MaskRCNNInference()
+    ) as mocked_func:
+        yield
+    if check:
+        assert mocked_func.call_count > 0
+
+
+@contextlib.contextmanager
+def mock_keypoint_rcnn_inference(tensor_mode, patched_module, use_heatmap_max_keypoint, check=True):
+    with mock.patch(
+        "{}.keypoint_rcnn_inference".format(patched_module),
+        side_effect=Caffe2KeypointRCNNInference(use_heatmap_max_keypoint),
+    ) as mocked_func:
+        yield
+    if check:
+        assert mocked_func.call_count > 0
+
+
+class ROIHeadsPatcher:
+    def __init__(self, heads, use_heatmap_max_keypoint):
+        self.heads = heads
+        self.use_heatmap_max_keypoint = use_heatmap_max_keypoint
+        self.previous_patched = {}
+
+    @contextlib.contextmanager
+    def mock_roi_heads(self, tensor_mode=True):
+        """
+        Patching several inference functions inside ROIHeads and its subclasses
+
+        Args:
+            tensor_mode (bool): whether the inputs/outputs are caffe2's tensor
+                format or not. Default to True.
+        """
+        # NOTE: this requries the `keypoint_rcnn_inference` and `mask_rcnn_inference`
+        # are called inside the same file as BaseXxxHead due to using mock.patch.
+        kpt_heads_mod = keypoint_head.BaseKeypointRCNNHead.__module__
+        mask_head_mod = mask_head.BaseMaskRCNNHead.__module__
+
+        mock_ctx_managers = [
+            mock_fastrcnn_outputs_inference(
+                tensor_mode=tensor_mode,
+                check=True,
+                box_predictor_type=type(self.heads.box_predictor),
+            )
+        ]
+        if getattr(self.heads, "keypoint_on", False):
+            mock_ctx_managers += [
+                mock_keypoint_rcnn_inference(
+                    tensor_mode, kpt_heads_mod, self.use_heatmap_max_keypoint
+                )
+            ]
+        if getattr(self.heads, "mask_on", False):
+            mock_ctx_managers += [mock_mask_rcnn_inference(tensor_mode, mask_head_mod)]
+
+        with contextlib.ExitStack() as stack:  # python 3.3+
+            for mgr in mock_ctx_managers:
+                stack.enter_context(mgr)
+            yield
+
+    def patch_roi_heads(self, tensor_mode=True):
+        self.previous_patched["box_predictor"] = self.heads.box_predictor.inference
+        self.previous_patched["keypoint_rcnn"] = keypoint_head.keypoint_rcnn_inference
+        self.previous_patched["mask_rcnn"] = mask_head.mask_rcnn_inference
+
+        def patched_fastrcnn_outputs_inference(predictions, proposal):
+            return caffe2_fast_rcnn_outputs_inference(
+                True, self.heads.box_predictor, predictions, proposal
+            )
+
+        self.heads.box_predictor.inference = patched_fastrcnn_outputs_inference
+
+        if getattr(self.heads, "keypoint_on", False):
+
+            def patched_keypoint_rcnn_inference(pred_keypoint_logits, pred_instances):
+                return caffe2_keypoint_rcnn_inference(
+                    self.use_heatmap_max_keypoint, pred_keypoint_logits, pred_instances
+                )
+
+            keypoint_head.keypoint_rcnn_inference = patched_keypoint_rcnn_inference
+
+        if getattr(self.heads, "mask_on", False):
+
+            def patched_mask_rcnn_inference(pred_mask_logits, pred_instances):
+                return caffe2_mask_rcnn_inference(pred_mask_logits, pred_instances)
+
+            mask_head.mask_rcnn_inference = patched_mask_rcnn_inference
+
+    def unpatch_roi_heads(self):
+        self.heads.box_predictor.inference = self.previous_patched["box_predictor"]
+        keypoint_head.keypoint_rcnn_inference = self.previous_patched["keypoint_rcnn"]
+        mask_head.mask_rcnn_inference = self.previous_patched["mask_rcnn"]
diff --git a/detectron2/export/flatten.py b/detectron2/export/flatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5ba4297567d650f147eebeed361e9d62fab899d
--- /dev/null
+++ b/detectron2/export/flatten.py
@@ -0,0 +1,330 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import collections
+from dataclasses import dataclass
+from typing import Callable, List, Optional, Tuple
+import torch
+from torch import nn
+
+from detectron2.structures import Boxes, Instances, ROIMasks
+from detectron2.utils.registry import _convert_target_to_string, locate
+
+from .torchscript_patch import patch_builtin_len
+
+
+@dataclass
+class Schema:
+    """
+    A Schema defines how to flatten a possibly hierarchical object into tuple of
+    primitive objects, so it can be used as inputs/outputs of PyTorch's tracing.
+
+    PyTorch does not support tracing a function that produces rich output
+    structures (e.g. dict, Instances, Boxes). To trace such a function, we
+    flatten the rich object into tuple of tensors, and return this tuple of tensors
+    instead. Meanwhile, we also need to know how to "rebuild" the original object
+    from the flattened results, so we can evaluate the flattened results.
+    A Schema defines how to flatten an object, and while flattening it, it records
+    necessary schemas so that the object can be rebuilt using the flattened outputs.
+
+    The flattened object and the schema object is returned by ``.flatten`` classmethod.
+    Then the original object can be rebuilt with the ``__call__`` method of schema.
+
+    A Schema is a dataclass that can be serialized easily.
+    """
+
+    # inspired by FetchMapper in tensorflow/python/client/session.py
+
+    @classmethod
+    def flatten(cls, obj):
+        raise NotImplementedError
+
+    def __call__(self, values):
+        raise NotImplementedError
+
+    @staticmethod
+    def _concat(values):
+        ret = ()
+        sizes = []
+        for v in values:
+            assert isinstance(v, tuple), "Flattened results must be a tuple"
+            ret = ret + v
+            sizes.append(len(v))
+        return ret, sizes
+
+    @staticmethod
+    def _split(values, sizes):
+        if len(sizes):
+            expected_len = sum(sizes)
+            assert (
+                len(values) == expected_len
+            ), f"Values has length {len(values)} but expect length {expected_len}."
+        ret = []
+        for k in range(len(sizes)):
+            begin, end = sum(sizes[:k]), sum(sizes[: k + 1])
+            ret.append(values[begin:end])
+        return ret
+
+
+@dataclass
+class ListSchema(Schema):
+    schemas: List[Schema]  # the schemas that define how to flatten each element in the list
+    sizes: List[int]  # the flattened length of each element
+
+    def __call__(self, values):
+        values = self._split(values, self.sizes)
+        if len(values) != len(self.schemas):
+            raise ValueError(
+                f"Values has length {len(values)} but schemas " f"has length {len(self.schemas)}!"
+            )
+        values = [m(v) for m, v in zip(self.schemas, values)]
+        return list(values)
+
+    @classmethod
+    def flatten(cls, obj):
+        res = [flatten_to_tuple(k) for k in obj]
+        values, sizes = cls._concat([k[0] for k in res])
+        return values, cls([k[1] for k in res], sizes)
+
+
+@dataclass
+class TupleSchema(ListSchema):
+    def __call__(self, values):
+        return tuple(super().__call__(values))
+
+
+@dataclass
+class IdentitySchema(Schema):
+    def __call__(self, values):
+        return values[0]
+
+    @classmethod
+    def flatten(cls, obj):
+        return (obj,), cls()
+
+
+@dataclass
+class DictSchema(ListSchema):
+    keys: List[str]
+
+    def __call__(self, values):
+        values = super().__call__(values)
+        return dict(zip(self.keys, values))
+
+    @classmethod
+    def flatten(cls, obj):
+        for k in obj.keys():
+            if not isinstance(k, str):
+                raise KeyError("Only support flattening dictionaries if keys are str.")
+        keys = sorted(obj.keys())
+        values = [obj[k] for k in keys]
+        ret, schema = ListSchema.flatten(values)
+        return ret, cls(schema.schemas, schema.sizes, keys)
+
+
+@dataclass
+class InstancesSchema(DictSchema):
+    def __call__(self, values):
+        image_size, fields = values[-1], values[:-1]
+        fields = super().__call__(fields)
+        return Instances(image_size, **fields)
+
+    @classmethod
+    def flatten(cls, obj):
+        ret, schema = super().flatten(obj.get_fields())
+        size = obj.image_size
+        if not isinstance(size, torch.Tensor):
+            size = torch.tensor(size)
+        return ret + (size,), schema
+
+
+@dataclass
+class TensorWrapSchema(Schema):
+    """
+    For classes that are simple wrapper of tensors, e.g.
+    Boxes, RotatedBoxes, BitMasks
+    """
+
+    class_name: str
+
+    def __call__(self, values):
+        return locate(self.class_name)(values[0])
+
+    @classmethod
+    def flatten(cls, obj):
+        return (obj.tensor,), cls(_convert_target_to_string(type(obj)))
+
+
+# if more custom structures needed in the future, can allow
+# passing in extra schemas for custom types
+def flatten_to_tuple(obj):
+    """
+    Flatten an object so it can be used for PyTorch tracing.
+    Also returns how to rebuild the original object from the flattened outputs.
+
+    Returns:
+        res (tuple): the flattened results that can be used as tracing outputs
+        schema: an object with a ``__call__`` method such that ``schema(res) == obj``.
+             It is a pure dataclass that can be serialized.
+    """
+    schemas = [
+        ((str, bytes), IdentitySchema),
+        (list, ListSchema),
+        (tuple, TupleSchema),
+        (collections.abc.Mapping, DictSchema),
+        (Instances, InstancesSchema),
+        ((Boxes, ROIMasks), TensorWrapSchema),
+    ]
+    for klass, schema in schemas:
+        if isinstance(obj, klass):
+            F = schema
+            break
+    else:
+        F = IdentitySchema
+
+    return F.flatten(obj)
+
+
+class TracingAdapter(nn.Module):
+    """
+    A model may take rich input/output format (e.g. dict or custom classes),
+    but `torch.jit.trace` requires tuple of tensors as input/output.
+    This adapter flattens input/output format of a model so it becomes traceable.
+
+    It also records the necessary schema to rebuild model's inputs/outputs from flattened
+    inputs/outputs.
+
+    Example:
+    ::
+        outputs = model(inputs)   # inputs/outputs may be rich structure
+        adapter = TracingAdapter(model, inputs)
+
+        # can now trace the model, with adapter.flattened_inputs, or another
+        # tuple of tensors with the same length and meaning
+        traced = torch.jit.trace(adapter, adapter.flattened_inputs)
+
+        # traced model can only produce flattened outputs (tuple of tensors)
+        flattened_outputs = traced(*adapter.flattened_inputs)
+        # adapter knows the schema to convert it back (new_outputs == outputs)
+        new_outputs = adapter.outputs_schema(flattened_outputs)
+    """
+
+    flattened_inputs: Tuple[torch.Tensor] = None
+    """
+    Flattened version of inputs given to this class's constructor.
+    """
+
+    inputs_schema: Schema = None
+    """
+    Schema of the inputs given to this class's constructor.
+    """
+
+    outputs_schema: Schema = None
+    """
+    Schema of the output produced by calling the given model with inputs.
+    """
+
+    def __init__(
+        self,
+        model: nn.Module,
+        inputs,
+        inference_func: Optional[Callable] = None,
+        allow_non_tensor: bool = False,
+    ):
+        """
+        Args:
+            model: an nn.Module
+            inputs: An input argument or a tuple of input arguments used to call model.
+                After flattening, it has to only consist of tensors.
+            inference_func: a callable that takes (model, *inputs), calls the
+                model with inputs, and return outputs. By default it
+                is ``lambda model, *inputs: model(*inputs)``. Can be override
+                if you need to call the model differently.
+            allow_non_tensor: allow inputs/outputs to contain non-tensor objects.
+                This option will filter out non-tensor objects to make the
+                model traceable, but ``inputs_schema``/``outputs_schema`` cannot be
+                used anymore because inputs/outputs cannot be rebuilt from pure tensors.
+                This is useful when you're only interested in the single trace of
+                execution (e.g. for flop count), but not interested in
+                generalizing the traced graph to new inputs.
+        """
+        super().__init__()
+        if isinstance(model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)):
+            model = model.module
+        self.model = model
+        if not isinstance(inputs, tuple):
+            inputs = (inputs,)
+        self.inputs = inputs
+        self.allow_non_tensor = allow_non_tensor
+
+        if inference_func is None:
+            inference_func = lambda model, *inputs: model(*inputs)  # noqa
+        self.inference_func = inference_func
+
+        self.flattened_inputs, self.inputs_schema = flatten_to_tuple(inputs)
+
+        if all(isinstance(x, torch.Tensor) for x in self.flattened_inputs):
+            return
+        if self.allow_non_tensor:
+            self.flattened_inputs = tuple(
+                [x for x in self.flattened_inputs if isinstance(x, torch.Tensor)]
+            )
+            self.inputs_schema = None
+        else:
+            for input in self.flattened_inputs:
+                if not isinstance(input, torch.Tensor):
+                    raise ValueError(
+                        "Inputs for tracing must only contain tensors. "
+                        f"Got a {type(input)} instead."
+                    )
+
+    def forward(self, *args: torch.Tensor):
+        with torch.no_grad(), patch_builtin_len():
+            if self.inputs_schema is not None:
+                inputs_orig_format = self.inputs_schema(args)
+            else:
+                if len(args) != len(self.flattened_inputs) or any(
+                    x is not y for x, y in zip(args, self.flattened_inputs)
+                ):
+                    raise ValueError(
+                        "TracingAdapter does not contain valid inputs_schema."
+                        " So it cannot generalize to other inputs and must be"
+                        " traced with `.flattened_inputs`."
+                    )
+                inputs_orig_format = self.inputs
+
+            outputs = self.inference_func(self.model, *inputs_orig_format)
+            flattened_outputs, schema = flatten_to_tuple(outputs)
+
+            flattened_output_tensors = tuple(
+                [x for x in flattened_outputs if isinstance(x, torch.Tensor)]
+            )
+            if len(flattened_output_tensors) < len(flattened_outputs):
+                if self.allow_non_tensor:
+                    flattened_outputs = flattened_output_tensors
+                    self.outputs_schema = None
+                else:
+                    raise ValueError(
+                        "Model cannot be traced because some model outputs "
+                        "cannot flatten to tensors."
+                    )
+            else:  # schema is valid
+                if self.outputs_schema is None:
+                    self.outputs_schema = schema
+                else:
+                    assert self.outputs_schema == schema, (
+                        "Model should always return outputs with the same "
+                        "structure so it can be traced!"
+                    )
+            return flattened_outputs
+
+    def _create_wrapper(self, traced_model):
+        """
+        Return a function that has an input/output interface the same as the
+        original model, but it calls the given traced model under the hood.
+        """
+
+        def forward(*args):
+            flattened_inputs, _ = flatten_to_tuple(args)
+            flattened_outputs = traced_model(*flattened_inputs)
+            return self.outputs_schema(flattened_outputs)
+
+        return forward
diff --git a/detectron2/export/shared.py b/detectron2/export/shared.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe5b790fa301b911d2e00e0fdd0c0a3d8e27cbf2
--- /dev/null
+++ b/detectron2/export/shared.py
@@ -0,0 +1,1039 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import collections
+import copy
+import functools
+import logging
+import numpy as np
+import os
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from unittest import mock
+import caffe2.python.utils as putils
+import torch
+import torch.nn.functional as F
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core, net_drawer, workspace
+from torch.nn.functional import interpolate as interp
+
+logger = logging.getLogger(__name__)
+
+
+# ==== torch/utils_toffee/cast.py =======================================
+
+
+def to_device(t, device_str):
+    """
+    This function is a replacement of .to(another_device) such that it allows the
+    casting to be traced properly by explicitly calling the underlying copy ops.
+    It also avoids introducing unncessary op when casting to the same device.
+    """
+    src = t.device
+    dst = torch.device(device_str)
+
+    if src == dst:
+        return t
+    elif src.type == "cuda" and dst.type == "cpu":
+        return torch.ops._caffe2.CopyGPUToCPU(t)
+    elif src.type == "cpu" and dst.type == "cuda":
+        return torch.ops._caffe2.CopyCPUToGPU(t)
+    else:
+        raise RuntimeError("Can't cast tensor from device {} to device {}".format(src, dst))
+
+
+# ==== torch/utils_toffee/interpolate.py =======================================
+
+
+# Note: borrowed from vision/detection/fair/detectron/detectron/modeling/detector.py
+def BilinearInterpolation(tensor_in, up_scale):
+    assert up_scale % 2 == 0, "Scale should be even"
+
+    def upsample_filt(size):
+        factor = (size + 1) // 2
+        if size % 2 == 1:
+            center = factor - 1
+        else:
+            center = factor - 0.5
+
+        og = np.ogrid[:size, :size]
+        return (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor)
+
+    kernel_size = int(up_scale) * 2
+    bil_filt = upsample_filt(kernel_size)
+
+    dim = int(tensor_in.shape[1])
+    kernel = np.zeros((dim, dim, kernel_size, kernel_size), dtype=np.float32)
+    kernel[range(dim), range(dim), :, :] = bil_filt
+
+    tensor_out = F.conv_transpose2d(
+        tensor_in,
+        weight=to_device(torch.Tensor(kernel), tensor_in.device),
+        bias=None,
+        stride=int(up_scale),
+        padding=int(up_scale / 2),
+    )
+
+    return tensor_out
+
+
+# NOTE: ONNX is incompatible with traced torch.nn.functional.interpolate if
+# using dynamic `scale_factor` rather than static `size`. (T43166860)
+# NOTE: Caffe2 Int8 conversion might not be able to quantize `size` properly.
+def onnx_compatibale_interpolate(
+    input, size=None, scale_factor=None, mode="nearest", align_corners=None
+):
+    # NOTE: The input dimensions are interpreted in the form:
+    # `mini-batch x channels x [optional depth] x [optional height] x width`.
+    if size is None and scale_factor is not None:
+        if input.dim() == 4:
+            if isinstance(scale_factor, (int, float)):
+                height_scale, width_scale = (scale_factor, scale_factor)
+            else:
+                assert isinstance(scale_factor, (tuple, list))
+                assert len(scale_factor) == 2
+                height_scale, width_scale = scale_factor
+
+            assert not align_corners, "No matching C2 op for align_corners == True"
+            if mode == "nearest":
+                return torch.ops._caffe2.ResizeNearest(
+                    input, order="NCHW", width_scale=width_scale, height_scale=height_scale
+                )
+            elif mode == "bilinear":
+                logger.warning(
+                    "Use F.conv_transpose2d for bilinear interpolate"
+                    " because there's no such C2 op, this may cause significant"
+                    " slowdown and the boundary pixels won't be as same as"
+                    " using F.interpolate due to padding."
+                )
+                assert height_scale == width_scale
+                return BilinearInterpolation(input, up_scale=height_scale)
+        logger.warning("Output size is not static, it might cause ONNX conversion issue")
+
+    return interp(input, size, scale_factor, mode, align_corners)
+
+
+def mock_torch_nn_functional_interpolate():
+    def decorator(func):
+        @functools.wraps(func)
+        def _mock_torch_nn_functional_interpolate(*args, **kwargs):
+            if torch.onnx.is_in_onnx_export():
+                with mock.patch(
+                    "torch.nn.functional.interpolate", side_effect=onnx_compatibale_interpolate
+                ):
+                    return func(*args, **kwargs)
+            else:
+                return func(*args, **kwargs)
+
+        return _mock_torch_nn_functional_interpolate
+
+    return decorator
+
+
+# ==== torch/utils_caffe2/ws_utils.py ==========================================
+
+
+class ScopedWS:
+    def __init__(self, ws_name, is_reset, is_cleanup=False):
+        self.ws_name = ws_name
+        self.is_reset = is_reset
+        self.is_cleanup = is_cleanup
+        self.org_ws = ""
+
+    def __enter__(self):
+        self.org_ws = workspace.CurrentWorkspace()
+        if self.ws_name is not None:
+            workspace.SwitchWorkspace(self.ws_name, True)
+        if self.is_reset:
+            workspace.ResetWorkspace()
+
+        return workspace
+
+    def __exit__(self, *args):
+        if self.is_cleanup:
+            workspace.ResetWorkspace()
+        if self.ws_name is not None:
+            workspace.SwitchWorkspace(self.org_ws)
+
+
+def fetch_any_blob(name):
+    bb = None
+    try:
+        bb = workspace.FetchBlob(name)
+    except TypeError:
+        bb = workspace.FetchInt8Blob(name)
+    except Exception as e:
+        logger.error("Get blob {} error: {}".format(name, e))
+
+    return bb
+
+
+# ==== torch/utils_caffe2/protobuf.py ==========================================
+
+
+def get_pb_arg(pb, arg_name):
+    for x in pb.arg:
+        if x.name == arg_name:
+            return x
+    return None
+
+
+def get_pb_arg_valf(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return arg.f if arg is not None else default_val
+
+
+def get_pb_arg_floats(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return list(map(float, arg.floats)) if arg is not None else default_val
+
+
+def get_pb_arg_ints(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return list(map(int, arg.ints)) if arg is not None else default_val
+
+
+def get_pb_arg_vali(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return arg.i if arg is not None else default_val
+
+
+def get_pb_arg_vals(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return arg.s if arg is not None else default_val
+
+
+def get_pb_arg_valstrings(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return list(arg.strings) if arg is not None else default_val
+
+
+def check_set_pb_arg(pb, arg_name, arg_attr, arg_value, allow_override=False):
+    arg = get_pb_arg(pb, arg_name)
+    if arg is None:
+        arg = putils.MakeArgument(arg_name, arg_value)
+        assert hasattr(arg, arg_attr)
+        pb.arg.extend([arg])
+    if allow_override and getattr(arg, arg_attr) != arg_value:
+        logger.warning(
+            "Override argument {}: {} -> {}".format(arg_name, getattr(arg, arg_attr), arg_value)
+        )
+        setattr(arg, arg_attr, arg_value)
+    else:
+        assert arg is not None
+        assert getattr(arg, arg_attr) == arg_value, "Existing value {}, new value {}".format(
+            getattr(arg, arg_attr), arg_value
+        )
+
+
+def _create_const_fill_op_from_numpy(name, tensor, device_option=None):
+    assert type(tensor) == np.ndarray
+    kTypeNameMapper = {
+        np.dtype("float32"): "GivenTensorFill",
+        np.dtype("int32"): "GivenTensorIntFill",
+        np.dtype("int64"): "GivenTensorInt64Fill",
+        np.dtype("uint8"): "GivenTensorStringFill",
+    }
+
+    args_dict = {}
+    if tensor.dtype == np.dtype("uint8"):
+        args_dict.update({"values": [str(tensor.data)], "shape": [1]})
+    else:
+        args_dict.update({"values": tensor, "shape": tensor.shape})
+
+    if device_option is not None:
+        args_dict["device_option"] = device_option
+
+    return core.CreateOperator(kTypeNameMapper[tensor.dtype], [], [name], **args_dict)
+
+
+def _create_const_fill_op_from_c2_int8_tensor(name, int8_tensor):
+    assert type(int8_tensor) == workspace.Int8Tensor
+    kTypeNameMapper = {
+        np.dtype("int32"): "Int8GivenIntTensorFill",
+        np.dtype("uint8"): "Int8GivenTensorFill",
+    }
+
+    tensor = int8_tensor.data
+    assert tensor.dtype in [np.dtype("uint8"), np.dtype("int32")]
+    values = tensor.tobytes() if tensor.dtype == np.dtype("uint8") else tensor
+
+    return core.CreateOperator(
+        kTypeNameMapper[tensor.dtype],
+        [],
+        [name],
+        values=values,
+        shape=tensor.shape,
+        Y_scale=int8_tensor.scale,
+        Y_zero_point=int8_tensor.zero_point,
+    )
+
+
+def create_const_fill_op(
+    name: str,
+    blob: Union[np.ndarray, workspace.Int8Tensor],
+    device_option: Optional[caffe2_pb2.DeviceOption] = None,
+) -> caffe2_pb2.OperatorDef:
+    """
+    Given a blob object, return the Caffe2 operator that creates this blob
+    as constant. Currently support NumPy tensor and Caffe2 Int8Tensor.
+    """
+
+    tensor_type = type(blob)
+    assert tensor_type in [
+        np.ndarray,
+        workspace.Int8Tensor,
+    ], 'Error when creating const fill op for "{}", unsupported blob type: {}'.format(
+        name, type(blob)
+    )
+
+    if tensor_type == np.ndarray:
+        return _create_const_fill_op_from_numpy(name, blob, device_option)
+    elif tensor_type == workspace.Int8Tensor:
+        assert device_option is None
+        return _create_const_fill_op_from_c2_int8_tensor(name, blob)
+
+
+def construct_init_net_from_params(
+    params: Dict[str, Any], device_options: Optional[Dict[str, caffe2_pb2.DeviceOption]] = None
+) -> caffe2_pb2.NetDef:
+    """
+    Construct the init_net from params dictionary
+    """
+    init_net = caffe2_pb2.NetDef()
+    device_options = device_options or {}
+    for name, blob in params.items():
+        if isinstance(blob, str):
+            logger.warning(
+                (
+                    "Blob {} with type {} is not supported in generating init net,"
+                    " skipped.".format(name, type(blob))
+                )
+            )
+            continue
+        init_net.op.extend(
+            [create_const_fill_op(name, blob, device_option=device_options.get(name, None))]
+        )
+        init_net.external_output.append(name)
+    return init_net
+
+
+def get_producer_map(ssa):
+    """
+    Return dict from versioned blob to (i, j),
+        where i is index of producer op, j is the index of output of that op.
+    """
+    producer_map = {}
+    for i in range(len(ssa)):
+        outputs = ssa[i][1]
+        for j, outp in enumerate(outputs):
+            producer_map[outp] = (i, j)
+    return producer_map
+
+
+def get_consumer_map(ssa):
+    """
+    Return dict from versioned blob to list of (i, j),
+        where i is index of consumer op, j is the index of input of that op.
+    """
+    consumer_map = collections.defaultdict(list)
+    for i in range(len(ssa)):
+        inputs = ssa[i][0]
+        for j, inp in enumerate(inputs):
+            consumer_map[inp].append((i, j))
+    return consumer_map
+
+
+def get_params_from_init_net(
+    init_net: caffe2_pb2.NetDef,
+) -> [Dict[str, Any], Dict[str, caffe2_pb2.DeviceOption]]:
+    """
+    Take the output blobs from init_net by running it.
+    Outputs:
+        params: dict from blob name to numpy array
+        device_options: dict from blob name to the device option of its creating op
+    """
+    # NOTE: this assumes that the params is determined by producer op with the
+    # only exception be CopyGPUToCPU which is CUDA op but returns CPU tensor.
+    def _get_device_option(producer_op):
+        if producer_op.type == "CopyGPUToCPU":
+            return caffe2_pb2.DeviceOption()
+        else:
+            return producer_op.device_option
+
+    with ScopedWS("__get_params_from_init_net__", is_reset=True, is_cleanup=True) as ws:
+        ws.RunNetOnce(init_net)
+        params = {b: fetch_any_blob(b) for b in init_net.external_output}
+    ssa, versions = core.get_ssa(init_net)
+    producer_map = get_producer_map(ssa)
+    device_options = {
+        b: _get_device_option(init_net.op[producer_map[(b, versions[b])][0]])
+        for b in init_net.external_output
+    }
+    return params, device_options
+
+
+def _updater_raise(op, input_types, output_types):
+    raise RuntimeError(
+        "Failed to apply updater for op {} given input_types {} and"
+        " output_types {}".format(op, input_types, output_types)
+    )
+
+
+def _generic_status_identifier(
+    predict_net: caffe2_pb2.NetDef,
+    status_updater: Callable,
+    known_status: Dict[Tuple[str, int], Any],
+) -> Dict[Tuple[str, int], Any]:
+    """
+    Statically infer the status of each blob, the status can be such as device type
+        (CPU/GPU), layout (NCHW/NHWC), data type (float32/int8), etc. "Blob" here
+        is versioned blob (Tuple[str, int]) in the format compatible with ssa.
+    Inputs:
+        predict_net: the caffe2 network
+        status_updater: a callable, given an op and the status of its input/output,
+            it returns the updated status of input/output. `None` is used for
+            representing unknown status.
+        known_status: a dict containing known status, used as initialization.
+    Outputs:
+        A dict mapping from versioned blob to its status
+    """
+    ssa, versions = core.get_ssa(predict_net)
+    versioned_ext_input = [(b, 0) for b in predict_net.external_input]
+    versioned_ext_output = [(b, versions[b]) for b in predict_net.external_output]
+    all_versioned_blobs = set().union(*[set(x[0] + x[1]) for x in ssa])
+
+    allowed_vbs = all_versioned_blobs.union(versioned_ext_input).union(versioned_ext_output)
+    assert all(k in allowed_vbs for k in known_status)
+    assert all(v is not None for v in known_status.values())
+    _known_status = copy.deepcopy(known_status)
+
+    def _check_and_update(key, value):
+        assert value is not None
+        if key in _known_status:
+            if not _known_status[key] == value:
+                raise RuntimeError(
+                    "Confilict status for {}, existing status {}, new status {}".format(
+                        key, _known_status[key], value
+                    )
+                )
+        _known_status[key] = value
+
+    def _update_i(op, ssa_i):
+        versioned_inputs = ssa_i[0]
+        versioned_outputs = ssa_i[1]
+
+        inputs_status = [_known_status.get(b, None) for b in versioned_inputs]
+        outputs_status = [_known_status.get(b, None) for b in versioned_outputs]
+
+        new_inputs_status, new_outputs_status = status_updater(op, inputs_status, outputs_status)
+
+        for versioned_blob, status in zip(
+            versioned_inputs + versioned_outputs, new_inputs_status + new_outputs_status
+        ):
+            if status is not None:
+                _check_and_update(versioned_blob, status)
+
+    for op, ssa_i in zip(predict_net.op, ssa):
+        _update_i(op, ssa_i)
+    for op, ssa_i in zip(reversed(predict_net.op), reversed(ssa)):
+        _update_i(op, ssa_i)
+
+    # NOTE: This strictly checks all the blob from predict_net must be assgined
+    # a known status. However sometimes it's impossible (eg. having deadend op),
+    # we may relax this constraint if
+    for k in all_versioned_blobs:
+        if k not in _known_status:
+            raise NotImplementedError(
+                "Can not infer the status for {}. Currently only support the case where"
+                " a single forward and backward pass can identify status for all blobs.".format(k)
+            )
+
+    return _known_status
+
+
+def infer_device_type(
+    predict_net: caffe2_pb2.NetDef,
+    known_status: Dict[Tuple[str, int], Any],
+    device_name_style: str = "caffe2",
+) -> Dict[Tuple[str, int], str]:
+    """Return the device type ("cpu" or "gpu"/"cuda") of each (versioned) blob"""
+
+    assert device_name_style in ["caffe2", "pytorch"]
+    _CPU_STR = "cpu"
+    _GPU_STR = "gpu" if device_name_style == "caffe2" else "cuda"
+
+    def _copy_cpu_to_gpu_updater(op, input_types, output_types):
+        if input_types[0] == _GPU_STR or output_types[0] == _CPU_STR:
+            _updater_raise(op, input_types, output_types)
+        return ([_CPU_STR], [_GPU_STR])
+
+    def _copy_gpu_to_cpu_updater(op, input_types, output_types):
+        if input_types[0] == _CPU_STR or output_types[0] == _GPU_STR:
+            _updater_raise(op, input_types, output_types)
+        return ([_GPU_STR], [_CPU_STR])
+
+    def _other_ops_updater(op, input_types, output_types):
+        non_none_types = [x for x in input_types + output_types if x is not None]
+        if len(non_none_types) > 0:
+            the_type = non_none_types[0]
+            if not all(x == the_type for x in non_none_types):
+                _updater_raise(op, input_types, output_types)
+        else:
+            the_type = None
+        return ([the_type for _ in op.input], [the_type for _ in op.output])
+
+    def _device_updater(op, *args, **kwargs):
+        return {
+            "CopyCPUToGPU": _copy_cpu_to_gpu_updater,
+            "CopyGPUToCPU": _copy_gpu_to_cpu_updater,
+        }.get(op.type, _other_ops_updater)(op, *args, **kwargs)
+
+    return _generic_status_identifier(predict_net, _device_updater, known_status)
+
+
+# ==== torch/utils_caffe2/vis.py ===============================================
+
+
+def _modify_blob_names(ops, blob_rename_f):
+    ret = []
+
+    def _replace_list(blob_list, replaced_list):
+        del blob_list[:]
+        blob_list.extend(replaced_list)
+
+    for x in ops:
+        cur = copy.deepcopy(x)
+        _replace_list(cur.input, list(map(blob_rename_f, cur.input)))
+        _replace_list(cur.output, list(map(blob_rename_f, cur.output)))
+        ret.append(cur)
+
+    return ret
+
+
+def _rename_blob(name, blob_sizes, blob_ranges):
+    def _list_to_str(bsize):
+        ret = ", ".join([str(x) for x in bsize])
+        ret = "[" + ret + "]"
+        return ret
+
+    ret = name
+    if blob_sizes is not None and name in blob_sizes:
+        ret += "\n" + _list_to_str(blob_sizes[name])
+    if blob_ranges is not None and name in blob_ranges:
+        ret += "\n" + _list_to_str(blob_ranges[name])
+
+    return ret
+
+
+# graph_name could not contain word 'graph'
+def save_graph(net, file_name, graph_name="net", op_only=True, blob_sizes=None, blob_ranges=None):
+    blob_rename_f = functools.partial(_rename_blob, blob_sizes=blob_sizes, blob_ranges=blob_ranges)
+    return save_graph_base(net, file_name, graph_name, op_only, blob_rename_f)
+
+
+def save_graph_base(net, file_name, graph_name="net", op_only=True, blob_rename_func=None):
+    graph = None
+    ops = net.op
+    if blob_rename_func is not None:
+        ops = _modify_blob_names(ops, blob_rename_func)
+    if not op_only:
+        graph = net_drawer.GetPydotGraph(ops, graph_name, rankdir="TB")
+    else:
+        graph = net_drawer.GetPydotGraphMinimal(
+            ops, graph_name, rankdir="TB", minimal_dependency=True
+        )
+
+    try:
+        par_dir = os.path.dirname(file_name)
+        if not os.path.exists(par_dir):
+            os.makedirs(par_dir)
+
+        format = os.path.splitext(os.path.basename(file_name))[-1]
+        if format == ".png":
+            graph.write_png(file_name)
+        elif format == ".pdf":
+            graph.write_pdf(file_name)
+        elif format == ".svg":
+            graph.write_svg(file_name)
+        else:
+            print("Incorrect format {}".format(format))
+    except Exception as e:
+        print("Error when writing graph to image {}".format(e))
+
+    return graph
+
+
+# ==== torch/utils_toffee/aten_to_caffe2.py ====================================
+
+
+def group_norm_replace_aten_with_caffe2(predict_net: caffe2_pb2.NetDef):
+    """
+    For ONNX exported model, GroupNorm will be represented as ATen op,
+        this can be a drop in replacement from ATen to GroupNorm
+    """
+    count = 0
+    for op in predict_net.op:
+        if op.type == "ATen":
+            op_name = get_pb_arg_vals(op, "operator", None)  # return byte in py3
+            if op_name and op_name.decode() == "group_norm":
+                op.arg.remove(get_pb_arg(op, "operator"))
+
+                if get_pb_arg_vali(op, "cudnn_enabled", None):
+                    op.arg.remove(get_pb_arg(op, "cudnn_enabled"))
+
+                num_groups = get_pb_arg_vali(op, "num_groups", None)
+                if num_groups is not None:
+                    op.arg.remove(get_pb_arg(op, "num_groups"))
+                    check_set_pb_arg(op, "group", "i", num_groups)
+
+                op.type = "GroupNorm"
+                count += 1
+    if count > 1:
+        logger.info("Replaced {} ATen operator to GroupNormOp".format(count))
+
+
+# ==== torch/utils_toffee/alias.py =============================================
+
+
+def alias(x, name, is_backward=False):
+    if not torch.onnx.is_in_onnx_export():
+        return x
+    assert isinstance(x, torch.Tensor)
+    return torch.ops._caffe2.AliasWithName(x, name, is_backward=is_backward)
+
+
+def fuse_alias_placeholder(predict_net, init_net):
+    """Remove AliasWithName placeholder and rename the input/output of it"""
+    # First we finish all the re-naming
+    for i, op in enumerate(predict_net.op):
+        if op.type == "AliasWithName":
+            assert len(op.input) == 1
+            assert len(op.output) == 1
+            name = get_pb_arg_vals(op, "name", None).decode()
+            is_backward = bool(get_pb_arg_vali(op, "is_backward", 0))
+            rename_op_input(predict_net, init_net, i, 0, name, from_producer=is_backward)
+            rename_op_output(predict_net, i, 0, name)
+
+    # Remove AliasWithName, should be very safe since it's a non-op
+    new_ops = []
+    for op in predict_net.op:
+        if op.type != "AliasWithName":
+            new_ops.append(op)
+        else:
+            # safety check
+            assert op.input == op.output
+            assert op.input[0] == op.arg[0].s.decode()
+    del predict_net.op[:]
+    predict_net.op.extend(new_ops)
+
+
+# ==== torch/utils_caffe2/graph_transform.py ===================================
+
+
+class IllegalGraphTransformError(ValueError):
+    """When a graph transform function call can't be executed."""
+
+
+def _rename_versioned_blob_in_proto(
+    proto: caffe2_pb2.NetDef,
+    old_name: str,
+    new_name: str,
+    version: int,
+    ssa: List[Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]],
+    start_versions: Dict[str, int],
+    end_versions: Dict[str, int],
+):
+    """In given proto, rename all blobs with matched version"""
+    # Operater list
+    for op, i_th_ssa in zip(proto.op, ssa):
+        versioned_inputs, versioned_outputs = i_th_ssa
+        for i in range(len(op.input)):
+            if versioned_inputs[i] == (old_name, version):
+                op.input[i] = new_name
+        for i in range(len(op.output)):
+            if versioned_outputs[i] == (old_name, version):
+                op.output[i] = new_name
+    # external_input
+    if start_versions.get(old_name, 0) == version:
+        for i in range(len(proto.external_input)):
+            if proto.external_input[i] == old_name:
+                proto.external_input[i] = new_name
+    # external_output
+    if end_versions.get(old_name, 0) == version:
+        for i in range(len(proto.external_output)):
+            if proto.external_output[i] == old_name:
+                proto.external_output[i] = new_name
+
+
+def rename_op_input(
+    predict_net: caffe2_pb2.NetDef,
+    init_net: caffe2_pb2.NetDef,
+    op_id: int,
+    input_id: int,
+    new_name: str,
+    from_producer: bool = False,
+):
+    """
+    Rename the op_id-th operator in predict_net, change it's input_id-th input's
+        name to the new_name. It also does automatic re-route and change
+        external_input and init_net if necessary.
+    - It requires the input is only consumed by this op.
+    - This function modifies predict_net and init_net in-place.
+    - When from_producer is enable, this also updates other operators that consumes
+        the same input. Be cautious because may trigger unintended behavior.
+    """
+    assert isinstance(predict_net, caffe2_pb2.NetDef)
+    assert isinstance(init_net, caffe2_pb2.NetDef)
+
+    init_net_ssa, init_net_versions = core.get_ssa(init_net)
+    predict_net_ssa, predict_net_versions = core.get_ssa(
+        predict_net, copy.deepcopy(init_net_versions)
+    )
+
+    versioned_inputs, versioned_outputs = predict_net_ssa[op_id]
+    old_name, version = versioned_inputs[input_id]
+
+    if from_producer:
+        producer_map = get_producer_map(predict_net_ssa)
+        if not (old_name, version) in producer_map:
+            raise NotImplementedError(
+                "Can't find producer, the input {} is probably from"
+                " init_net, this is not supported yet.".format(old_name)
+            )
+        producer = producer_map[(old_name, version)]
+        rename_op_output(predict_net, producer[0], producer[1], new_name)
+        return
+
+    def contain_targets(op_ssa):
+        return (old_name, version) in op_ssa[0]
+
+    is_consumer = [contain_targets(op_ssa) for op_ssa in predict_net_ssa]
+    if sum(is_consumer) > 1:
+        raise IllegalGraphTransformError(
+            (
+                "Input '{}' of operator(#{}) are consumed by other ops, please use"
+                + " rename_op_output on the producer instead. Offending op: \n{}"
+            ).format(old_name, op_id, predict_net.op[op_id])
+        )
+
+    # update init_net
+    _rename_versioned_blob_in_proto(
+        init_net, old_name, new_name, version, init_net_ssa, {}, init_net_versions
+    )
+    # update predict_net
+    _rename_versioned_blob_in_proto(
+        predict_net,
+        old_name,
+        new_name,
+        version,
+        predict_net_ssa,
+        init_net_versions,
+        predict_net_versions,
+    )
+
+
+def rename_op_output(predict_net: caffe2_pb2.NetDef, op_id: int, output_id: int, new_name: str):
+    """
+    Rename the op_id-th operator in predict_net, change it's output_id-th input's
+        name to the new_name. It also does automatic re-route and change
+        external_output and if necessary.
+    - It allows multiple consumers of its output.
+    - This function modifies predict_net in-place, doesn't need init_net.
+    """
+    assert isinstance(predict_net, caffe2_pb2.NetDef)
+
+    ssa, blob_versions = core.get_ssa(predict_net)
+
+    versioned_inputs, versioned_outputs = ssa[op_id]
+    old_name, version = versioned_outputs[output_id]
+
+    # update predict_net
+    _rename_versioned_blob_in_proto(
+        predict_net, old_name, new_name, version, ssa, {}, blob_versions
+    )
+
+
+def get_sub_graph_external_input_output(
+    predict_net: caffe2_pb2.NetDef, sub_graph_op_indices: List[int]
+) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
+    """
+    Return the list of external input/output of sub-graph,
+    each element is tuple of the name and corresponding version in predict_net.
+
+    external input/output is defined the same way as caffe2 NetDef.
+    """
+    ssa, versions = core.get_ssa(predict_net)
+
+    all_inputs = []
+    all_outputs = []
+    for op_id in sub_graph_op_indices:
+        all_inputs += [inp for inp in ssa[op_id][0] if inp not in all_inputs]
+        all_outputs += list(ssa[op_id][1])  # ssa output won't repeat
+
+    # for versioned blobs, external inputs are just those blob in all_inputs
+    # but not in all_outputs
+    ext_inputs = [inp for inp in all_inputs if inp not in all_outputs]
+
+    # external outputs are essentially outputs of this subgraph that are used
+    # outside of this sub-graph (including predict_net.external_output)
+    all_other_inputs = sum(
+        (ssa[i][0] for i in range(len(ssa)) if i not in sub_graph_op_indices),
+        [(outp, versions[outp]) for outp in predict_net.external_output],
+    )
+    ext_outputs = [outp for outp in all_outputs if outp in set(all_other_inputs)]
+
+    return ext_inputs, ext_outputs
+
+
+class DiGraph:
+    """A DAG representation of caffe2 graph, each vertice is a versioned blob."""
+
+    def __init__(self):
+        self.vertices = set()
+        self.graph = collections.defaultdict(list)
+
+    def add_edge(self, u, v):
+        self.graph[u].append(v)
+        self.vertices.add(u)
+        self.vertices.add(v)
+
+    # grab from https://www.geeksforgeeks.org/find-paths-given-source-destination/
+    def get_all_paths(self, s, d):
+        visited = {k: False for k in self.vertices}
+        path = []
+        all_paths = []
+
+        def _get_all_paths_util(graph, u, d, visited, path):
+            visited[u] = True
+            path.append(u)
+            if u == d:
+                all_paths.append(copy.deepcopy(path))
+            else:
+                for i in graph[u]:
+                    if not visited[i]:
+                        _get_all_paths_util(graph, i, d, visited, path)
+            path.pop()
+            visited[u] = False
+
+        _get_all_paths_util(self.graph, s, d, visited, path)
+        return all_paths
+
+    @staticmethod
+    def from_ssa(ssa):
+        graph = DiGraph()
+        for op_id in range(len(ssa)):
+            for inp in ssa[op_id][0]:
+                for outp in ssa[op_id][1]:
+                    graph.add_edge(inp, outp)
+        return graph
+
+
+def _get_dependency_chain(ssa, versioned_target, versioned_source):
+    """
+    Return the index list of relevant operator to produce target blob from source blob,
+        if there's no dependency, return empty list.
+    """
+
+    # finding all paths between nodes can be O(N!), thus we can only search
+    # in the subgraph using the op starting from the first consumer of source blob
+    # to the producer of the target blob.
+    consumer_map = get_consumer_map(ssa)
+    producer_map = get_producer_map(ssa)
+    start_op = min(x[0] for x in consumer_map[versioned_source]) - 15
+    end_op = (
+        producer_map[versioned_target][0] + 15 if versioned_target in producer_map else start_op
+    )
+    sub_graph_ssa = ssa[start_op : end_op + 1]
+    if len(sub_graph_ssa) > 30:
+        logger.warning(
+            "Subgraph bebetween {} and {} is large (from op#{} to op#{}), it"
+            " might take non-trival time to find all paths between them.".format(
+                versioned_source, versioned_target, start_op, end_op
+            )
+        )
+
+    dag = DiGraph.from_ssa(sub_graph_ssa)
+    paths = dag.get_all_paths(versioned_source, versioned_target)  # include two ends
+    ops_in_paths = [[producer_map[blob][0] for blob in path[1:]] for path in paths]
+    return sorted(set().union(*[set(ops) for ops in ops_in_paths]))
+
+
+def identify_reshape_sub_graph(predict_net: caffe2_pb2.NetDef) -> List[List[int]]:
+    """
+    Idenfity the reshape sub-graph in a protobuf.
+    The reshape sub-graph is defined as matching the following pattern:
+
+    (input_blob) -> Op_1 -> ... -> Op_N -> (new_shape) -─┐
+        └-------------------------------------------> Reshape -> (output_blob)
+
+    Return:
+        List of sub-graphs, each sub-graph is represented as a list of indices
+        of the relavent ops, [Op_1, Op_2, ..., Op_N, Reshape]
+    """
+
+    ssa, _ = core.get_ssa(predict_net)
+
+    ret = []
+    for i, op in enumerate(predict_net.op):
+        if op.type == "Reshape":
+            assert len(op.input) == 2
+            input_ssa = ssa[i][0]
+            data_source = input_ssa[0]
+            shape_source = input_ssa[1]
+            op_indices = _get_dependency_chain(ssa, shape_source, data_source)
+            ret.append(op_indices + [i])
+    return ret
+
+
+def remove_reshape_for_fc(predict_net, params):
+    """
+    In PyTorch nn.Linear has to take 2D tensor, this often leads to reshape
+        a 4D tensor to 2D by calling .view(). However this (dynamic) reshaping
+        doesn't work well with ONNX and Int8 tools, and cause using extra
+        ops (eg. ExpandDims) that might not be available on mobile.
+    Luckily Caffe2 supports 4D tensor for FC, so we can remove those reshape
+        after exporting ONNX model.
+    """
+    from caffe2.python import core
+
+    # find all reshape sub-graph that can be removed, which is now all Reshape
+    # sub-graph whose output is only consumed by FC.
+    # TODO: to make it safer, we may need the actually value to better determine
+    # if a Reshape before FC is removable.
+    reshape_sub_graphs = identify_reshape_sub_graph(predict_net)
+    sub_graphs_to_remove = []
+    for reshape_sub_graph in reshape_sub_graphs:
+        reshape_op_id = reshape_sub_graph[-1]
+        assert predict_net.op[reshape_op_id].type == "Reshape"
+        ssa, _ = core.get_ssa(predict_net)
+        reshape_output = ssa[reshape_op_id][1][0]
+        consumers = [i for i in range(len(ssa)) if reshape_output in ssa[i][0]]
+        if all(predict_net.op[consumer].type == "FC" for consumer in consumers):
+            # safety check if the sub-graph is isolated, for this reshape sub-graph,
+            # it means it has one non-param external input and one external output.
+            ext_inputs, ext_outputs = get_sub_graph_external_input_output(
+                predict_net, reshape_sub_graph
+            )
+            non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
+            if len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1:
+                sub_graphs_to_remove.append(reshape_sub_graph)
+
+    # perform removing subgraph by:
+    # 1: rename the Reshape's output to its input, then the graph can be
+    #   seen as in-place itentify, meaning whose external input/output are the same.
+    # 2: simply remove those ops.
+    remove_op_ids = []
+    params_to_remove = []
+    for sub_graph in sub_graphs_to_remove:
+        logger.info(
+            "Remove Reshape sub-graph:\n{}".format(
+                "".join(["(#{:>4})\n{}".format(i, predict_net.op[i]) for i in sub_graph])
+            )
+        )
+        reshape_op_id = sub_graph[-1]
+        new_reshap_output = predict_net.op[reshape_op_id].input[0]
+        rename_op_output(predict_net, reshape_op_id, 0, new_reshap_output)
+        ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, sub_graph)
+        non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
+        params_ext_inputs = [inp for inp in ext_inputs if inp[1] == 0]
+        assert len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1
+        assert ext_outputs[0][0] == non_params_ext_inputs[0][0]
+        assert ext_outputs[0][1] == non_params_ext_inputs[0][1] + 1
+        remove_op_ids.extend(sub_graph)
+        params_to_remove.extend(params_ext_inputs)
+
+    predict_net = copy.deepcopy(predict_net)
+    new_ops = [op for i, op in enumerate(predict_net.op) if i not in remove_op_ids]
+    del predict_net.op[:]
+    predict_net.op.extend(new_ops)
+    for versioned_params in params_to_remove:
+        name = versioned_params[0]
+        logger.info("Remove params: {} from init_net and predict_net.external_input".format(name))
+        del params[name]
+        predict_net.external_input.remove(name)
+
+    return predict_net, params
+
+
+def fuse_copy_between_cpu_and_gpu(predict_net: caffe2_pb2.NetDef):
+    """
+    In-place fuse extra copy ops between cpu/gpu for the following case:
+        a -CopyAToB-> b -CopyBToA> c1 -NextOp1-> d1
+                        -CopyBToA> c2 -NextOp2-> d2
+    The fused network will look like:
+        a -NextOp1-> d1
+          -NextOp2-> d2
+    """
+
+    _COPY_OPS = ["CopyCPUToGPU", "CopyGPUToCPU"]
+
+    def _fuse_once(predict_net):
+        ssa, blob_versions = core.get_ssa(predict_net)
+        consumer_map = get_consumer_map(ssa)
+        versioned_external_output = [
+            (name, blob_versions[name]) for name in predict_net.external_output
+        ]
+
+        for op_id, op in enumerate(predict_net.op):
+            if op.type in _COPY_OPS:
+                fw_copy_versioned_output = ssa[op_id][1][0]
+                consumer_ids = [x[0] for x in consumer_map[fw_copy_versioned_output]]
+                reverse_op_type = _COPY_OPS[1 - _COPY_OPS.index(op.type)]
+
+                is_fusable = (
+                    len(consumer_ids) > 0
+                    and fw_copy_versioned_output not in versioned_external_output
+                    and all(
+                        predict_net.op[_op_id].type == reverse_op_type
+                        and ssa[_op_id][1][0] not in versioned_external_output
+                        for _op_id in consumer_ids
+                    )
+                )
+
+                if is_fusable:
+                    for rv_copy_op_id in consumer_ids:
+                        # making each NextOp uses "a" directly and removing Copy ops
+                        rs_copy_versioned_output = ssa[rv_copy_op_id][1][0]
+                        next_op_id, inp_id = consumer_map[rs_copy_versioned_output][0]
+                        predict_net.op[next_op_id].input[inp_id] = op.input[0]
+                    # remove CopyOps
+                    new_ops = [
+                        op
+                        for i, op in enumerate(predict_net.op)
+                        if i != op_id and i not in consumer_ids
+                    ]
+                    del predict_net.op[:]
+                    predict_net.op.extend(new_ops)
+                    return True
+
+        return False
+
+    # _fuse_once returns False is nothing can be fused
+    while _fuse_once(predict_net):
+        pass
+
+
+def remove_dead_end_ops(net_def: caffe2_pb2.NetDef):
+    """remove ops if its output is not used or not in external_output"""
+    ssa, versions = core.get_ssa(net_def)
+    versioned_external_output = [(name, versions[name]) for name in net_def.external_output]
+    consumer_map = get_consumer_map(ssa)
+    removed_op_ids = set()
+
+    def _is_dead_end(versioned_blob):
+        return not (
+            versioned_blob in versioned_external_output
+            or (
+                len(consumer_map[versioned_blob]) > 0
+                and all(x[0] not in removed_op_ids for x in consumer_map[versioned_blob])
+            )
+        )
+
+    for i, ssa_i in reversed(list(enumerate(ssa))):
+        versioned_outputs = ssa_i[1]
+        if all(_is_dead_end(outp) for outp in versioned_outputs):
+            removed_op_ids.add(i)
+
+    # simply removing those deadend ops should have no effect to external_output
+    new_ops = [op for i, op in enumerate(net_def.op) if i not in removed_op_ids]
+    del net_def.op[:]
+    net_def.op.extend(new_ops)
diff --git a/detectron2/export/torchscript.py b/detectron2/export/torchscript.py
new file mode 100644
index 0000000000000000000000000000000000000000..24fe59bda44225324928542df3f2ef1745375dfd
--- /dev/null
+++ b/detectron2/export/torchscript.py
@@ -0,0 +1,132 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import os
+import torch
+
+from detectron2.utils.file_io import PathManager
+
+from .torchscript_patch import freeze_training_mode, patch_instances
+
+__all__ = ["scripting_with_instances", "dump_torchscript_IR"]
+
+
+def scripting_with_instances(model, fields):
+    """
+    Run :func:`torch.jit.script` on a model that uses the :class:`Instances` class. Since
+    attributes of :class:`Instances` are "dynamically" added in eager mode，it is difficult
+    for scripting to support it out of the box. This function is made to support scripting
+    a model that uses :class:`Instances`. It does the following:
+
+    1. Create a scriptable ``new_Instances`` class which behaves similarly to ``Instances``,
+       but with all attributes been "static".
+       The attributes need to be statically declared in the ``fields`` argument.
+    2. Register ``new_Instances``, and force scripting compiler to
+       use it when trying to compile ``Instances``.
+
+    After this function, the process will be reverted. User should be able to script another model
+    using different fields.
+
+    Example:
+        Assume that ``Instances`` in the model consist of two attributes named
+        ``proposal_boxes`` and ``objectness_logits`` with type :class:`Boxes` and
+        :class:`Tensor` respectively during inference. You can call this function like:
+        ::
+            fields = {"proposal_boxes": Boxes, "objectness_logits": torch.Tensor}
+            torchscipt_model =  scripting_with_instances(model, fields)
+
+    Note:
+        It only support models in evaluation mode.
+
+    Args:
+        model (nn.Module): The input model to be exported by scripting.
+        fields (Dict[str, type]): Attribute names and corresponding type that
+            ``Instances`` will use in the model. Note that all attributes used in ``Instances``
+            need to be added, regardless of whether they are inputs/outputs of the model.
+            Data type not defined in detectron2 is not supported for now.
+
+    Returns:
+        torch.jit.ScriptModule: the model in torchscript format
+    """
+    assert (
+        not model.training
+    ), "Currently we only support exporting models in evaluation mode to torchscript"
+
+    with freeze_training_mode(model), patch_instances(fields):
+        scripted_model = torch.jit.script(model)
+        return scripted_model
+
+
+# alias for old name
+export_torchscript_with_instances = scripting_with_instances
+
+
+def dump_torchscript_IR(model, dir):
+    """
+    Dump IR of a TracedModule/ScriptModule/Function in various format (code, graph,
+    inlined graph). Useful for debugging.
+
+    Args:
+        model (TracedModule/ScriptModule/ScriptFUnction): traced or scripted module
+        dir (str): output directory to dump files.
+    """
+    dir = os.path.expanduser(dir)
+    PathManager.mkdirs(dir)
+
+    def _get_script_mod(mod):
+        if isinstance(mod, torch.jit.TracedModule):
+            return mod._actual_script_module
+        return mod
+
+    # Dump pretty-printed code: https://pytorch.org/docs/stable/jit.html#inspecting-code
+    with PathManager.open(os.path.join(dir, "model_ts_code.txt"), "w") as f:
+
+        def get_code(mod):
+            # Try a few ways to get code using private attributes.
+            try:
+                # This contains more information than just `mod.code`
+                return _get_script_mod(mod)._c.code
+            except AttributeError:
+                pass
+            try:
+                return mod.code
+            except AttributeError:
+                return None
+
+        def dump_code(prefix, mod):
+            code = get_code(mod)
+            name = prefix or "root model"
+            if code is None:
+                f.write(f"Could not found code for {name} (type={mod.original_name})\n")
+                f.write("\n")
+            else:
+                f.write(f"\nCode for {name}, type={mod.original_name}:\n")
+                f.write(code)
+                f.write("\n")
+                f.write("-" * 80)
+
+            for name, m in mod.named_children():
+                dump_code(prefix + "." + name, m)
+
+        if isinstance(model, torch.jit.ScriptFunction):
+            f.write(get_code(model))
+        else:
+            dump_code("", model)
+
+    def _get_graph(model):
+        try:
+            # Recursively dump IR of all modules
+            return _get_script_mod(model)._c.dump_to_str(True, False, False)
+        except AttributeError:
+            return model.graph.str()
+
+    with PathManager.open(os.path.join(dir, "model_ts_IR.txt"), "w") as f:
+        f.write(_get_graph(model))
+
+    # Dump IR of the entire graph (all submodules inlined)
+    with PathManager.open(os.path.join(dir, "model_ts_IR_inlined.txt"), "w") as f:
+        f.write(str(model.inlined_graph))
+
+    if not isinstance(model, torch.jit.ScriptFunction):
+        # Dump the model structure in pytorch style
+        with PathManager.open(os.path.join(dir, "model.txt"), "w") as f:
+            f.write(str(model))
diff --git a/detectron2/export/torchscript_patch.py b/detectron2/export/torchscript_patch.py
new file mode 100644
index 0000000000000000000000000000000000000000..da9b324f1582e31d1a16d2fe462ac2989bea56ea
--- /dev/null
+++ b/detectron2/export/torchscript_patch.py
@@ -0,0 +1,406 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import os
+import sys
+import tempfile
+from contextlib import ExitStack, contextmanager
+from copy import deepcopy
+from unittest import mock
+import torch
+from torch import nn
+
+# need some explicit imports due to https://github.com/pytorch/pytorch/issues/38964
+import detectron2  # noqa F401
+from detectron2.structures import Boxes, Instances
+from detectron2.utils.env import _import_file
+
+_counter = 0
+
+
+def _clear_jit_cache():
+    from torch.jit._recursive import concrete_type_store
+    from torch.jit._state import _jit_caching_layer
+
+    concrete_type_store.type_store.clear()  # for modules
+    _jit_caching_layer.clear()  # for free functions
+
+
+def _add_instances_conversion_methods(newInstances):
+    """
+    Add from_instances methods to the scripted Instances class.
+    """
+    cls_name = newInstances.__name__
+
+    @torch.jit.unused
+    def from_instances(instances: Instances):
+        """
+        Create scripted Instances from original Instances
+        """
+        fields = instances.get_fields()
+        image_size = instances.image_size
+        ret = newInstances(image_size)
+        for name, val in fields.items():
+            assert hasattr(ret, f"_{name}"), f"No attribute named {name} in {cls_name}"
+            setattr(ret, name, deepcopy(val))
+        return ret
+
+    newInstances.from_instances = from_instances
+
+
+@contextmanager
+def patch_instances(fields):
+    """
+    A contextmanager, under which the Instances class in detectron2 is replaced
+    by a statically-typed scriptable class, defined by `fields`.
+    See more in `scripting_with_instances`.
+    """
+
+    with tempfile.TemporaryDirectory(prefix="detectron2") as dir, tempfile.NamedTemporaryFile(
+        mode="w", encoding="utf-8", suffix=".py", dir=dir, delete=False
+    ) as f:
+        try:
+            # Objects that use Instances should not reuse previously-compiled
+            # results in cache, because `Instances` could be a new class each time.
+            _clear_jit_cache()
+
+            cls_name, s = _gen_instance_module(fields)
+            f.write(s)
+            f.flush()
+            f.close()
+
+            module = _import(f.name)
+            new_instances = getattr(module, cls_name)
+            _ = torch.jit.script(new_instances)
+            # let torchscript think Instances was scripted already
+            Instances.__torch_script_class__ = True
+            # let torchscript find new_instances when looking for the jit type of Instances
+            Instances._jit_override_qualname = torch._jit_internal._qualified_name(new_instances)
+
+            _add_instances_conversion_methods(new_instances)
+            yield new_instances
+        finally:
+            try:
+                del Instances.__torch_script_class__
+                del Instances._jit_override_qualname
+            except AttributeError:
+                pass
+            sys.modules.pop(module.__name__)
+
+
+def _gen_instance_class(fields):
+    """
+    Args:
+        fields (dict[name: type])
+    """
+
+    class _FieldType:
+        def __init__(self, name, type_):
+            assert isinstance(name, str), f"Field name must be str, got {name}"
+            self.name = name
+            self.type_ = type_
+            self.annotation = f"{type_.__module__}.{type_.__name__}"
+
+    fields = [_FieldType(k, v) for k, v in fields.items()]
+
+    def indent(level, s):
+        return " " * 4 * level + s
+
+    lines = []
+
+    global _counter
+    _counter += 1
+
+    cls_name = "ScriptedInstances{}".format(_counter)
+
+    field_names = tuple(x.name for x in fields)
+    extra_args = ", ".join([f"{f.name}: Optional[{f.annotation}] = None" for f in fields])
+    lines.append(
+        f"""
+class {cls_name}:
+    def __init__(self, image_size: Tuple[int, int], {extra_args}):
+        self.image_size = image_size
+        self._field_names = {field_names}
+"""
+    )
+
+    for f in fields:
+        lines.append(
+            indent(2, f"self._{f.name} = torch.jit.annotate(Optional[{f.annotation}], {f.name})")
+        )
+
+    for f in fields:
+        lines.append(
+            f"""
+    @property
+    def {f.name}(self) -> {f.annotation}:
+        # has to use a local for type refinement
+        # https://pytorch.org/docs/stable/jit_language_reference.html#optional-type-refinement
+        t = self._{f.name}
+        assert t is not None, "{f.name} is None and cannot be accessed!"
+        return t
+
+    @{f.name}.setter
+    def {f.name}(self, value: {f.annotation}) -> None:
+        self._{f.name} = value
+"""
+        )
+
+    # support method `__len__`
+    lines.append(
+        """
+    def __len__(self) -> int:
+"""
+    )
+    for f in fields:
+        lines.append(
+            f"""
+        t = self._{f.name}
+        if t is not None:
+            return len(t)
+"""
+        )
+    lines.append(
+        """
+        raise NotImplementedError("Empty Instances does not support __len__!")
+"""
+    )
+
+    # support method `has`
+    lines.append(
+        """
+    def has(self, name: str) -> bool:
+"""
+    )
+    for f in fields:
+        lines.append(
+            f"""
+        if name == "{f.name}":
+            return self._{f.name} is not None
+"""
+        )
+    lines.append(
+        """
+        return False
+"""
+    )
+
+    # support method `to`
+    none_args = ", None" * len(fields)
+    lines.append(
+        f"""
+    def to(self, device: torch.device) -> "{cls_name}":
+        ret = {cls_name}(self.image_size{none_args})
+"""
+    )
+    for f in fields:
+        if hasattr(f.type_, "to"):
+            lines.append(
+                f"""
+        t = self._{f.name}
+        if t is not None:
+            ret._{f.name} = t.to(device)
+"""
+            )
+        else:
+            # For now, ignore fields that cannot be moved to devices.
+            # Maybe can support other tensor-like classes (e.g. __torch_function__)
+            pass
+    lines.append(
+        """
+        return ret
+"""
+    )
+
+    # support method `getitem`
+    none_args = ", None" * len(fields)
+    lines.append(
+        f"""
+    def __getitem__(self, item) -> "{cls_name}":
+        ret = {cls_name}(self.image_size{none_args})
+"""
+    )
+    for f in fields:
+        lines.append(
+            f"""
+        t = self._{f.name}
+        if t is not None:
+            ret._{f.name} = t[item]
+"""
+        )
+    lines.append(
+        """
+        return ret
+"""
+    )
+
+    # support method `cat`
+    # this version does not contain checks that all instances have same size and fields
+    none_args = ", None" * len(fields)
+    lines.append(
+        f"""
+    def cat(self, instances: List["{cls_name}"]) -> "{cls_name}":
+        ret = {cls_name}(self.image_size{none_args})
+"""
+    )
+    for f in fields:
+        lines.append(
+            f"""
+        t = self._{f.name}
+        if t is not None:
+            values: List[{f.annotation}] = [x.{f.name} for x in instances]
+            if torch.jit.isinstance(t, torch.Tensor):
+                ret._{f.name} = torch.cat(values, dim=0)
+            else:
+                ret._{f.name} = t.cat(values)
+"""
+        )
+    lines.append(
+        """
+        return ret"""
+    )
+
+    # support method `get_fields()`
+    lines.append(
+        """
+    def get_fields(self) -> Dict[str, Tensor]:
+        ret = {}
+    """
+    )
+    for f in fields:
+        if f.type_ == Boxes:
+            stmt = "t.tensor"
+        elif f.type_ == torch.Tensor:
+            stmt = "t"
+        else:
+            stmt = f'assert False, "unsupported type {str(f.type_)}"'
+        lines.append(
+            f"""
+        t = self._{f.name}
+        if t is not None:
+            ret["{f.name}"] = {stmt}
+        """
+        )
+    lines.append(
+        """
+        return ret"""
+    )
+    return cls_name, os.linesep.join(lines)
+
+
+def _gen_instance_module(fields):
+    # TODO: find a more automatic way to enable import of other classes
+    s = """
+from copy import deepcopy
+import torch
+from torch import Tensor
+import typing
+from typing import *
+
+import detectron2
+from detectron2.structures import Boxes, Instances
+
+"""
+
+    cls_name, cls_def = _gen_instance_class(fields)
+    s += cls_def
+    return cls_name, s
+
+
+def _import(path):
+    return _import_file(
+        "{}{}".format(sys.modules[__name__].__name__, _counter), path, make_importable=True
+    )
+
+
+@contextmanager
+def patch_builtin_len(modules=()):
+    """
+    Patch the builtin len() function of a few detectron2 modules
+    to use __len__ instead, because __len__ does not convert values to
+    integers and therefore is friendly to tracing.
+
+    Args:
+        modules (list[stsr]): names of extra modules to patch len(), in
+            addition to those in detectron2.
+    """
+
+    def _new_len(obj):
+        return obj.__len__()
+
+    with ExitStack() as stack:
+        MODULES = [
+            "detectron2.modeling.roi_heads.fast_rcnn",
+            "detectron2.modeling.roi_heads.mask_head",
+            "detectron2.modeling.roi_heads.keypoint_head",
+        ] + list(modules)
+        ctxs = [stack.enter_context(mock.patch(mod + ".len")) for mod in MODULES]
+        for m in ctxs:
+            m.side_effect = _new_len
+        yield
+
+
+def patch_nonscriptable_classes():
+    """
+    Apply patches on a few nonscriptable detectron2 classes.
+    Should not have side-effects on eager usage.
+    """
+    # __prepare_scriptable__ can also be added to models for easier maintenance.
+    # But it complicates the clean model code.
+
+    from detectron2.modeling.backbone import ResNet, FPN
+
+    # Due to https://github.com/pytorch/pytorch/issues/36061,
+    # we change backbone to use ModuleList for scripting.
+    # (note: this changes param names in state_dict)
+
+    def prepare_resnet(self):
+        ret = deepcopy(self)
+        ret.stages = nn.ModuleList(ret.stages)
+        for k in self.stage_names:
+            delattr(ret, k)
+        return ret
+
+    ResNet.__prepare_scriptable__ = prepare_resnet
+
+    def prepare_fpn(self):
+        ret = deepcopy(self)
+        ret.lateral_convs = nn.ModuleList(ret.lateral_convs)
+        ret.output_convs = nn.ModuleList(ret.output_convs)
+        for name, _ in self.named_children():
+            if name.startswith("fpn_"):
+                delattr(ret, name)
+        return ret
+
+    FPN.__prepare_scriptable__ = prepare_fpn
+
+    # Annotate some attributes to be constants for the purpose of scripting,
+    # even though they are not constants in eager mode.
+    from detectron2.modeling.roi_heads import StandardROIHeads
+
+    if hasattr(StandardROIHeads, "__annotations__"):
+        # copy first to avoid editing annotations of base class
+        StandardROIHeads.__annotations__ = deepcopy(StandardROIHeads.__annotations__)
+        StandardROIHeads.__annotations__["mask_on"] = torch.jit.Final[bool]
+        StandardROIHeads.__annotations__["keypoint_on"] = torch.jit.Final[bool]
+
+
+# These patches are not supposed to have side-effects.
+patch_nonscriptable_classes()
+
+
+@contextmanager
+def freeze_training_mode(model):
+    """
+    A context manager that annotates the "training" attribute of every submodule
+    to constant, so that the training codepath in these modules can be
+    meta-compiled away. Upon exiting, the annotations are reverted.
+    """
+    classes = {type(x) for x in model.modules()}
+    # __constants__ is the old way to annotate constants and not compatible
+    # with __annotations__ .
+    classes = {x for x in classes if not hasattr(x, "__constants__")}
+    for cls in classes:
+        cls.__annotations__["training"] = torch.jit.Final[bool]
+    yield
+    for cls in classes:
+        cls.__annotations__["training"] = bool
diff --git a/detectron2/layers/__init__.py b/detectron2/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..761a3d1c7afa049e9779ee9fc4d299e9aae38cad
--- /dev/null
+++ b/detectron2/layers/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm, CycleBatchNormList
+from .deform_conv import DeformConv, ModulatedDeformConv
+from .mask_ops import paste_masks_in_image
+from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated
+from .roi_align import ROIAlign, roi_align
+from .roi_align_rotated import ROIAlignRotated, roi_align_rotated
+from .shape_spec import ShapeSpec
+from .wrappers import (
+    BatchNorm2d,
+    Conv2d,
+    ConvTranspose2d,
+    cat,
+    interpolate,
+    Linear,
+    nonzero_tuple,
+    cross_entropy,
+    empty_input_loss_func_wrapper,
+    shapes_to_tensor,
+    move_device_like,
+)
+from .blocks import CNNBlockBase, DepthwiseSeparableConv2d
+from .aspp import ASPP
+from .losses import ciou_loss, diou_loss
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/layers/__pycache__/__init__.cpython-310.pyc b/detectron2/layers/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7924f785585cdfd625348f8a3153f579186c4da1
Binary files /dev/null and b/detectron2/layers/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/__init__.cpython-39.pyc b/detectron2/layers/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56f1a7fd009df0b0c79a6b5289a1f9d306829ef7
Binary files /dev/null and b/detectron2/layers/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/aspp.cpython-310.pyc b/detectron2/layers/__pycache__/aspp.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..110b504330448fb61bfb8f2096ab21c8898a931b
Binary files /dev/null and b/detectron2/layers/__pycache__/aspp.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/aspp.cpython-39.pyc b/detectron2/layers/__pycache__/aspp.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6886b03aacc147c388c687a7ccf7125d138c572d
Binary files /dev/null and b/detectron2/layers/__pycache__/aspp.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/batch_norm.cpython-310.pyc b/detectron2/layers/__pycache__/batch_norm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa2180c3769b4e53fdc2d8520877de01ea5f89ad
Binary files /dev/null and b/detectron2/layers/__pycache__/batch_norm.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/batch_norm.cpython-39.pyc b/detectron2/layers/__pycache__/batch_norm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12c0774cf6c708cc72514e14474224939d424335
Binary files /dev/null and b/detectron2/layers/__pycache__/batch_norm.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/blocks.cpython-310.pyc b/detectron2/layers/__pycache__/blocks.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4be0a4ccc61b950358dfe43e39c521ce4fa0ec1
Binary files /dev/null and b/detectron2/layers/__pycache__/blocks.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/blocks.cpython-39.pyc b/detectron2/layers/__pycache__/blocks.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..837212db4d1f2c4495aa9a3ca7613cbc91a244db
Binary files /dev/null and b/detectron2/layers/__pycache__/blocks.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/deform_conv.cpython-310.pyc b/detectron2/layers/__pycache__/deform_conv.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f2d442492eddf87a4d6804dfea977eefb695efa
Binary files /dev/null and b/detectron2/layers/__pycache__/deform_conv.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/deform_conv.cpython-39.pyc b/detectron2/layers/__pycache__/deform_conv.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e52555e073cc64da76bbd7f033fd1cc936944280
Binary files /dev/null and b/detectron2/layers/__pycache__/deform_conv.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/losses.cpython-310.pyc b/detectron2/layers/__pycache__/losses.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..859dac6036dd41a450fc340f95b82b365f43129a
Binary files /dev/null and b/detectron2/layers/__pycache__/losses.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/losses.cpython-39.pyc b/detectron2/layers/__pycache__/losses.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f13592f6733072ab8470574f7e66731e200fa16
Binary files /dev/null and b/detectron2/layers/__pycache__/losses.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/mask_ops.cpython-310.pyc b/detectron2/layers/__pycache__/mask_ops.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75ba404297bc73b16adf260784e5a28060a8f383
Binary files /dev/null and b/detectron2/layers/__pycache__/mask_ops.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/mask_ops.cpython-39.pyc b/detectron2/layers/__pycache__/mask_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..825ad6bff102f9894c83bf38efae7385521bd4f0
Binary files /dev/null and b/detectron2/layers/__pycache__/mask_ops.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/nms.cpython-310.pyc b/detectron2/layers/__pycache__/nms.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1179f4ca5ef09337a64b56ef2509d8a7dc9b5cc6
Binary files /dev/null and b/detectron2/layers/__pycache__/nms.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/nms.cpython-39.pyc b/detectron2/layers/__pycache__/nms.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44458e58081e19ab127b49c3d33eab2457ca8713
Binary files /dev/null and b/detectron2/layers/__pycache__/nms.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/roi_align.cpython-310.pyc b/detectron2/layers/__pycache__/roi_align.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff8a44f593a2e5275af1ba296c51b17b0951c941
Binary files /dev/null and b/detectron2/layers/__pycache__/roi_align.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/roi_align.cpython-39.pyc b/detectron2/layers/__pycache__/roi_align.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50dade3205bf3806ca34539bb691aed4be51ee44
Binary files /dev/null and b/detectron2/layers/__pycache__/roi_align.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/roi_align_rotated.cpython-310.pyc b/detectron2/layers/__pycache__/roi_align_rotated.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72646d4fd3a5d4500d0a0cc1af4f53e6cf309f11
Binary files /dev/null and b/detectron2/layers/__pycache__/roi_align_rotated.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/roi_align_rotated.cpython-39.pyc b/detectron2/layers/__pycache__/roi_align_rotated.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e3bbd751505981efc7c7fdced3107ca55f275dab
Binary files /dev/null and b/detectron2/layers/__pycache__/roi_align_rotated.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/rotated_boxes.cpython-310.pyc b/detectron2/layers/__pycache__/rotated_boxes.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea9bae06c35b9e62e871be6df4143a06b657946b
Binary files /dev/null and b/detectron2/layers/__pycache__/rotated_boxes.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/rotated_boxes.cpython-39.pyc b/detectron2/layers/__pycache__/rotated_boxes.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bcf0a334bbb3e583c1504410a39c88846f1a5415
Binary files /dev/null and b/detectron2/layers/__pycache__/rotated_boxes.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/shape_spec.cpython-310.pyc b/detectron2/layers/__pycache__/shape_spec.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..553f7950b9ea2576e0a2a986e812e3068c4646b2
Binary files /dev/null and b/detectron2/layers/__pycache__/shape_spec.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/shape_spec.cpython-39.pyc b/detectron2/layers/__pycache__/shape_spec.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..188c6afe66a7e4515f7496a88ce4f09d186b6285
Binary files /dev/null and b/detectron2/layers/__pycache__/shape_spec.cpython-39.pyc differ
diff --git a/detectron2/layers/__pycache__/wrappers.cpython-310.pyc b/detectron2/layers/__pycache__/wrappers.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58eec74feabccd929d5c929f621ce97a3c88a290
Binary files /dev/null and b/detectron2/layers/__pycache__/wrappers.cpython-310.pyc differ
diff --git a/detectron2/layers/__pycache__/wrappers.cpython-39.pyc b/detectron2/layers/__pycache__/wrappers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e47d91603f080a6be0e6e6b2d6ad0f1cb66f23b4
Binary files /dev/null and b/detectron2/layers/__pycache__/wrappers.cpython-39.pyc differ
diff --git a/detectron2/layers/aspp.py b/detectron2/layers/aspp.py
new file mode 100644
index 0000000000000000000000000000000000000000..14861aa9ede4fea6a69a49f189bcab997b558148
--- /dev/null
+++ b/detectron2/layers/aspp.py
@@ -0,0 +1,144 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from copy import deepcopy
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .batch_norm import get_norm
+from .blocks import DepthwiseSeparableConv2d
+from .wrappers import Conv2d
+
+
+class ASPP(nn.Module):
+    """
+    Atrous Spatial Pyramid Pooling (ASPP).
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        dilations,
+        *,
+        norm,
+        activation,
+        pool_kernel_size=None,
+        dropout: float = 0.0,
+        use_depthwise_separable_conv=False,
+    ):
+        """
+        Args:
+            in_channels (int): number of input channels for ASPP.
+            out_channels (int): number of output channels.
+            dilations (list): a list of 3 dilations in ASPP.
+            norm (str or callable): normalization for all conv layers.
+                See :func:`layers.get_norm` for supported format. norm is
+                applied to all conv layers except the conv following
+                global average pooling.
+            activation (callable): activation function.
+            pool_kernel_size (tuple, list): the average pooling size (kh, kw)
+                for image pooling layer in ASPP. If set to None, it always
+                performs global average pooling. If not None, it must be
+                divisible by the shape of inputs in forward(). It is recommended
+                to use a fixed input feature size in training, and set this
+                option to match this size, so that it performs global average
+                pooling in training, and the size of the pooling window stays
+                consistent in inference.
+            dropout (float): apply dropout on the output of ASPP. It is used in
+                the official DeepLab implementation with a rate of 0.1:
+                https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/model.py#L532  # noqa
+            use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d
+                for 3x3 convs in ASPP, proposed in :paper:`DeepLabV3+`.
+        """
+        super(ASPP, self).__init__()
+        assert len(dilations) == 3, "ASPP expects 3 dilations, got {}".format(len(dilations))
+        self.pool_kernel_size = pool_kernel_size
+        self.dropout = dropout
+        use_bias = norm == ""
+        self.convs = nn.ModuleList()
+        # conv 1x1
+        self.convs.append(
+            Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                bias=use_bias,
+                norm=get_norm(norm, out_channels),
+                activation=deepcopy(activation),
+            )
+        )
+        weight_init.c2_xavier_fill(self.convs[-1])
+        # atrous convs
+        for dilation in dilations:
+            if use_depthwise_separable_conv:
+                self.convs.append(
+                    DepthwiseSeparableConv2d(
+                        in_channels,
+                        out_channels,
+                        kernel_size=3,
+                        padding=dilation,
+                        dilation=dilation,
+                        norm1=norm,
+                        activation1=deepcopy(activation),
+                        norm2=norm,
+                        activation2=deepcopy(activation),
+                    )
+                )
+            else:
+                self.convs.append(
+                    Conv2d(
+                        in_channels,
+                        out_channels,
+                        kernel_size=3,
+                        padding=dilation,
+                        dilation=dilation,
+                        bias=use_bias,
+                        norm=get_norm(norm, out_channels),
+                        activation=deepcopy(activation),
+                    )
+                )
+                weight_init.c2_xavier_fill(self.convs[-1])
+        # image pooling
+        # We do not add BatchNorm because the spatial resolution is 1x1,
+        # the original TF implementation has BatchNorm.
+        if pool_kernel_size is None:
+            image_pooling = nn.Sequential(
+                nn.AdaptiveAvgPool2d(1),
+                Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
+            )
+        else:
+            image_pooling = nn.Sequential(
+                nn.AvgPool2d(kernel_size=pool_kernel_size, stride=1),
+                Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
+            )
+        weight_init.c2_xavier_fill(image_pooling[1])
+        self.convs.append(image_pooling)
+
+        self.project = Conv2d(
+            5 * out_channels,
+            out_channels,
+            kernel_size=1,
+            bias=use_bias,
+            norm=get_norm(norm, out_channels),
+            activation=deepcopy(activation),
+        )
+        weight_init.c2_xavier_fill(self.project)
+
+    def forward(self, x):
+        size = x.shape[-2:]
+        if self.pool_kernel_size is not None:
+            if size[0] % self.pool_kernel_size[0] or size[1] % self.pool_kernel_size[1]:
+                raise ValueError(
+                    "`pool_kernel_size` must be divisible by the shape of inputs. "
+                    "Input size: {} `pool_kernel_size`: {}".format(size, self.pool_kernel_size)
+                )
+        res = []
+        for conv in self.convs:
+            res.append(conv(x))
+        res[-1] = F.interpolate(res[-1], size=size, mode="bilinear", align_corners=False)
+        res = torch.cat(res, dim=1)
+        res = self.project(res)
+        res = F.dropout(res, self.dropout, training=self.training) if self.dropout > 0 else res
+        return res
diff --git a/detectron2/layers/batch_norm.py b/detectron2/layers/batch_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..d304061ecf36dc1ebacccf19a154b8ba2fe8e785
--- /dev/null
+++ b/detectron2/layers/batch_norm.py
@@ -0,0 +1,353 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import torch
+import torch.distributed as dist
+from fvcore.nn.distributed import differentiable_all_reduce
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.utils import comm, env
+
+from .wrappers import BatchNorm2d
+
+
+class FrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    It contains non-trainable buffers called
+    "weight" and "bias", "running_mean", "running_var",
+    initialized to perform identity transformation.
+
+    The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
+    which are computed from the original four parameters of BN.
+    The affine transform `x * weight + bias` will perform the equivalent
+    computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
+    When loading a backbone model from Caffe2, "running_mean" and "running_var"
+    will be left unchanged as identity transformation.
+
+    Other pre-trained backbone models may contain all 4 parameters.
+
+    The forward is implemented by `F.batch_norm(..., training=False)`.
+    """
+
+    _version = 3
+
+    def __init__(self, num_features, eps=1e-5):
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.register_buffer("weight", torch.ones(num_features))
+        self.register_buffer("bias", torch.zeros(num_features))
+        self.register_buffer("running_mean", torch.zeros(num_features))
+        self.register_buffer("running_var", torch.ones(num_features) - eps)
+        self.register_buffer("num_batches_tracked", None)
+
+    def forward(self, x):
+        if x.requires_grad:
+            # When gradients are needed, F.batch_norm will use extra memory
+            # because its backward op computes gradients for weight/bias as well.
+            scale = self.weight * (self.running_var + self.eps).rsqrt()
+            bias = self.bias - self.running_mean * scale
+            scale = scale.reshape(1, -1, 1, 1)
+            bias = bias.reshape(1, -1, 1, 1)
+            out_dtype = x.dtype  # may be half
+            return x * scale.to(out_dtype) + bias.to(out_dtype)
+        else:
+            # When gradients are not needed, F.batch_norm is a single fused op
+            # and provide more optimization opportunities.
+            return F.batch_norm(
+                x,
+                self.running_mean,
+                self.running_var,
+                self.weight,
+                self.bias,
+                training=False,
+                eps=self.eps,
+            )
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 2:
+            # No running_mean/var in early versions
+            # This will silent the warnings
+            if prefix + "running_mean" not in state_dict:
+                state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
+            if prefix + "running_var" not in state_dict:
+                state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def __repr__(self):
+        return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
+
+    @classmethod
+    def convert_frozen_batchnorm(cls, module):
+        """
+        Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
+
+        Args:
+            module (torch.nn.Module):
+
+        Returns:
+            If module is BatchNorm/SyncBatchNorm, returns a new module.
+            Otherwise, in-place convert module and return it.
+
+        Similar to convert_sync_batchnorm in
+        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
+        """
+        bn_module = nn.modules.batchnorm
+        bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
+        res = module
+        if isinstance(module, bn_module):
+            res = cls(module.num_features)
+            if module.affine:
+                res.weight.data = module.weight.data.clone().detach()
+                res.bias.data = module.bias.data.clone().detach()
+            res.running_mean.data = module.running_mean.data
+            res.running_var.data = module.running_var.data
+            res.eps = module.eps
+            res.num_batches_tracked = module.num_batches_tracked
+        else:
+            for name, child in module.named_children():
+                new_child = cls.convert_frozen_batchnorm(child)
+                if new_child is not child:
+                    res.add_module(name, new_child)
+        return res
+
+    @classmethod
+    def convert_frozenbatchnorm2d_to_batchnorm2d(cls, module: nn.Module) -> nn.Module:
+        """
+        Convert all FrozenBatchNorm2d to BatchNorm2d
+
+        Args:
+            module (torch.nn.Module):
+
+        Returns:
+            If module is FrozenBatchNorm2d, returns a new module.
+            Otherwise, in-place convert module and return it.
+
+        This is needed for quantization:
+            https://fb.workplace.com/groups/1043663463248667/permalink/1296330057982005/
+        """
+
+        res = module
+        if isinstance(module, FrozenBatchNorm2d):
+            res = torch.nn.BatchNorm2d(module.num_features, module.eps)
+
+            res.weight.data = module.weight.data.clone().detach()
+            res.bias.data = module.bias.data.clone().detach()
+            res.running_mean.data = module.running_mean.data.clone().detach()
+            res.running_var.data = module.running_var.data.clone().detach()
+            res.eps = module.eps
+            res.num_batches_tracked = module.num_batches_tracked
+        else:
+            for name, child in module.named_children():
+                new_child = cls.convert_frozenbatchnorm2d_to_batchnorm2d(child)
+                if new_child is not child:
+                    res.add_module(name, new_child)
+        return res
+
+
+def get_norm(norm, out_channels):
+    """
+    Args:
+        norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
+            or a callable that takes a channel number and returns
+            the normalization layer as a nn.Module.
+
+    Returns:
+        nn.Module or None: the normalization layer
+    """
+    if norm is None:
+        return None
+    if isinstance(norm, str):
+        if len(norm) == 0:
+            return None
+        norm = {
+            "BN": BatchNorm2d,
+            # Fixed in https://github.com/pytorch/pytorch/pull/36382
+            "SyncBN": NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm,
+            "FrozenBN": FrozenBatchNorm2d,
+            "GN": lambda channels: nn.GroupNorm(32, channels),
+            # for debugging:
+            "nnSyncBN": nn.SyncBatchNorm,
+            "naiveSyncBN": NaiveSyncBatchNorm,
+            # expose stats_mode N as an option to caller, required for zero-len inputs
+            "naiveSyncBN_N": lambda channels: NaiveSyncBatchNorm(channels, stats_mode="N"),
+            "LN": lambda channels: LayerNorm(channels),
+        }[norm]
+    return norm(out_channels)
+
+
+class NaiveSyncBatchNorm(BatchNorm2d):
+    """
+    In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
+    when the batch size on each worker is different.
+    (e.g., when scale augmentation is used, or when it is applied to mask head).
+
+    This is a slower but correct alternative to `nn.SyncBatchNorm`.
+
+    Note:
+        There isn't a single definition of Sync BatchNorm.
+
+        When ``stats_mode==""``, this module computes overall statistics by using
+        statistics of each worker with equal weight.  The result is true statistics
+        of all samples (as if they are all on one worker) only when all workers
+        have the same (N, H, W). This mode does not support inputs with zero batch size.
+
+        When ``stats_mode=="N"``, this module computes overall statistics by weighting
+        the statistics of each worker by their ``N``. The result is true statistics
+        of all samples (as if they are all on one worker) only when all workers
+        have the same (H, W). It is slower than ``stats_mode==""``.
+
+        Even though the result of this module may not be the true statistics of all samples,
+        it may still be reasonable because it might be preferrable to assign equal weights
+        to all workers, regardless of their (H, W) dimension, instead of putting larger weight
+        on larger images. From preliminary experiments, little difference is found between such
+        a simplified implementation and an accurate computation of overall mean & variance.
+    """
+
+    def __init__(self, *args, stats_mode="", **kwargs):
+        super().__init__(*args, **kwargs)
+        assert stats_mode in ["", "N"]
+        self._stats_mode = stats_mode
+
+    def forward(self, input):
+        if comm.get_world_size() == 1 or not self.training:
+            return super().forward(input)
+
+        B, C = input.shape[0], input.shape[1]
+
+        half_input = input.dtype == torch.float16
+        if half_input:
+            # fp16 does not have good enough numerics for the reduction here
+            input = input.float()
+        mean = torch.mean(input, dim=[0, 2, 3])
+        meansqr = torch.mean(input * input, dim=[0, 2, 3])
+
+        if self._stats_mode == "":
+            assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
+            vec = torch.cat([mean, meansqr], dim=0)
+            vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
+            mean, meansqr = torch.split(vec, C)
+            momentum = self.momentum
+        else:
+            if B == 0:
+                vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
+                vec = vec + input.sum()  # make sure there is gradient w.r.t input
+            else:
+                vec = torch.cat(
+                    [
+                        mean,
+                        meansqr,
+                        torch.ones([1], device=mean.device, dtype=mean.dtype),
+                    ],
+                    dim=0,
+                )
+            vec = differentiable_all_reduce(vec * B)
+
+            total_batch = vec[-1].detach()
+            momentum = total_batch.clamp(max=1) * self.momentum  # no update if total_batch is 0
+            mean, meansqr, _ = torch.split(vec / total_batch.clamp(min=1), C)  # avoid div-by-zero
+
+        var = meansqr - mean * mean
+        invstd = torch.rsqrt(var + self.eps)
+        scale = self.weight * invstd
+        bias = self.bias - mean * scale
+        scale = scale.reshape(1, -1, 1, 1)
+        bias = bias.reshape(1, -1, 1, 1)
+
+        self.running_mean += momentum * (mean.detach() - self.running_mean)
+        self.running_var += momentum * (var.detach() - self.running_var)
+        ret = input * scale + bias
+        if half_input:
+            ret = ret.half()
+        return ret
+
+
+class CycleBatchNormList(nn.ModuleList):
+    """
+    Implement domain-specific BatchNorm by cycling.
+
+    When a BatchNorm layer is used for multiple input domains or input
+    features, it might need to maintain a separate test-time statistics
+    for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`.
+
+    This module implements it by using N separate BN layers
+    and it cycles through them every time a forward() is called.
+
+    NOTE: The caller of this module MUST guarantee to always call
+    this module by multiple of N times. Otherwise its test-time statistics
+    will be incorrect.
+    """
+
+    def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs):
+        """
+        Args:
+            length: number of BatchNorm layers to cycle.
+            bn_class: the BatchNorm class to use
+            kwargs: arguments of the BatchNorm class, such as num_features.
+        """
+        self._affine = kwargs.pop("affine", True)
+        super().__init__([bn_class(**kwargs, affine=False) for k in range(length)])
+        if self._affine:
+            # shared affine, domain-specific BN
+            channels = self[0].num_features
+            self.weight = nn.Parameter(torch.ones(channels))
+            self.bias = nn.Parameter(torch.zeros(channels))
+        self._pos = 0
+
+    def forward(self, x):
+        ret = self[self._pos](x)
+        self._pos = (self._pos + 1) % len(self)
+
+        if self._affine:
+            w = self.weight.reshape(1, -1, 1, 1)
+            b = self.bias.reshape(1, -1, 1, 1)
+            return ret * w + b
+        else:
+            return ret
+
+    def extra_repr(self):
+        return f"affine={self._affine}"
+
+
+class LayerNorm(nn.Module):
+    """
+    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
+    variance normalization over the channel dimension for inputs that have shape
+    (batch_size, channels, height, width).
+    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119  # noqa B950
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.normalized_shape = (normalized_shape,)
+
+    def forward(self, x):
+        u = x.mean(1, keepdim=True)
+        s = (x - u).pow(2).mean(1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.eps)
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x
diff --git a/detectron2/layers/blocks.py b/detectron2/layers/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..1995a4bf7339e8deb7eaaffda4f819dda55e7ac7
--- /dev/null
+++ b/detectron2/layers/blocks.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import fvcore.nn.weight_init as weight_init
+from torch import nn
+
+from .batch_norm import FrozenBatchNorm2d, get_norm
+from .wrappers import Conv2d
+
+
+"""
+CNN building blocks.
+"""
+
+
+class CNNBlockBase(nn.Module):
+    """
+    A CNN block is assumed to have input channels, output channels and a stride.
+    The input and output of `forward()` method must be NCHW tensors.
+    The method can perform arbitrary computation but must match the given
+    channels and stride specification.
+
+    Attribute:
+        in_channels (int):
+        out_channels (int):
+        stride (int):
+    """
+
+    def __init__(self, in_channels, out_channels, stride):
+        """
+        The `__init__` method of any subclass should also contain these arguments.
+
+        Args:
+            in_channels (int):
+            out_channels (int):
+            stride (int):
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = stride
+
+    def freeze(self):
+        """
+        Make this block not trainable.
+        This method sets all parameters to `requires_grad=False`,
+        and convert all BatchNorm layers to FrozenBatchNorm
+
+        Returns:
+            the block itself
+        """
+        for p in self.parameters():
+            p.requires_grad = False
+        FrozenBatchNorm2d.convert_frozen_batchnorm(self)
+        return self
+
+
+class DepthwiseSeparableConv2d(nn.Module):
+    """
+    A kxk depthwise convolution + a 1x1 convolution.
+
+    In :paper:`xception`, norm & activation are applied on the second conv.
+    :paper:`mobilenet` uses norm & activation on both convs.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=3,
+        padding=1,
+        dilation=1,
+        *,
+        norm1=None,
+        activation1=None,
+        norm2=None,
+        activation2=None,
+    ):
+        """
+        Args:
+            norm1, norm2 (str or callable): normalization for the two conv layers.
+            activation1, activation2 (callable(Tensor) -> Tensor): activation
+                function for the two conv layers.
+        """
+        super().__init__()
+        self.depthwise = Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=kernel_size,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=not norm1,
+            norm=get_norm(norm1, in_channels),
+            activation=activation1,
+        )
+        self.pointwise = Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            bias=not norm2,
+            norm=get_norm(norm2, out_channels),
+            activation=activation2,
+        )
+
+        # default initialization
+        weight_init.c2_msra_fill(self.depthwise)
+        weight_init.c2_msra_fill(self.pointwise)
+
+    def forward(self, x):
+        return self.pointwise(self.depthwise(x))
diff --git a/detectron2/layers/csrc/README.md b/detectron2/layers/csrc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..778ed3da0bae89820831bcd8a72ff7b9cad8d4dd
--- /dev/null
+++ b/detectron2/layers/csrc/README.md
@@ -0,0 +1,7 @@
+
+
+To add a new Op:
+
+1. Create a new directory
+2. Implement new ops there
+3. Delcare its Python interface in `vision.cpp`.
diff --git a/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
new file mode 100644
index 0000000000000000000000000000000000000000..03f4211003f42f601f0cfcf4a690f5da4a0a1f67
--- /dev/null
+++ b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
@@ -0,0 +1,115 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor ROIAlignRotated_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio);
+
+at::Tensor ROIAlignRotated_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio);
+
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+at::Tensor ROIAlignRotated_forward_cuda(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio);
+
+at::Tensor ROIAlignRotated_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio);
+#endif
+
+// Interface for Python
+inline at::Tensor ROIAlignRotated_forward(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const double spatial_scale,
+    const int64_t pooled_height,
+    const int64_t pooled_width,
+    const int64_t sampling_ratio) {
+  if (input.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+    return ROIAlignRotated_forward_cuda(
+        input,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        sampling_ratio);
+#else
+    AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+  }
+  return ROIAlignRotated_forward_cpu(
+      input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+}
+
+inline at::Tensor ROIAlignRotated_backward(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const double spatial_scale,
+    const int64_t pooled_height,
+    const int64_t pooled_width,
+    const int64_t batch_size,
+    const int64_t channels,
+    const int64_t height,
+    const int64_t width,
+    const int64_t sampling_ratio) {
+  if (grad.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+    return ROIAlignRotated_backward_cuda(
+        grad,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        batch_size,
+        channels,
+        height,
+        width,
+        sampling_ratio);
+#else
+    AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+  }
+  return ROIAlignRotated_backward_cpu(
+      grad,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      batch_size,
+      channels,
+      height,
+      width,
+      sampling_ratio);
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a3d3056cc71a4acaafb570739a9dd247a7eb1ed
--- /dev/null
+++ b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
@@ -0,0 +1,522 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include <ATen/TensorUtils.h>
+#include "ROIAlignRotated.h"
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those ones is
+// we don't do "legacy offset" in this version, as there aren't many previous
+// works, if any, using the "legacy" ROIAlignRotated Op.
+// This would make the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+template <typename T>
+struct PreCalc {
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
+  T w1;
+  T w2;
+  T w3;
+  T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int iy_upper,
+    const int ix_upper,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    T roi_center_h,
+    T roi_center_w,
+    T cos_theta,
+    T sin_theta,
+    std::vector<PreCalc<T>>& pre_calc) {
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          // Rotate by theta around the center and translate
+          // In image space, (y, x) is the order for Right Handed System,
+          // and this is essentially multiplying the point by a rotation matrix
+          // to rotate it counterclockwise through angle theta.
+          T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+          T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+          // deal with: inverse elements are out of feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y < 0) {
+            y = 0;
+          }
+          if (x < 0) {
+            x = 0;
+          }
+
+          int y_low = (int)y;
+          int x_low = (int)x;
+          int y_high;
+          int x_high;
+
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indices
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y < 0) {
+    y = 0;
+  }
+
+  if (x < 0) {
+    x = 0;
+  }
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = input[y_low * width + x_low];
+  // T v2 = input[y_low * width + x_high];
+  // T v3 = input[y_high * width + x_low];
+  // T v4 = input[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+template <class T>
+inline void add(T* address, const T& val) {
+  *address += val;
+}
+
+} // namespace
+
+template <typename T>
+void ROIAlignRotatedForward(
+    const int nthreads,
+    const T* input,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* rois,
+    T* output) {
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+  // (n, c, ph, pw) is an element in the pooled output
+  // can be parallelized using omp
+  // #pragma omp parallel for num_threads(32)
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    AT_ASSERTM(
+        roi_width >= 0 && roi_height >= 0,
+        "ROIs in ROIAlignRotated do not have non-negative size!");
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<PreCalc<T>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    pre_calc_for_bilinear_interpolate(
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_start_h,
+        roi_start_w,
+        bin_size_h,
+        bin_size_w,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_center_h,
+        roi_center_w,
+        cos_theta,
+        sin_theta,
+        pre_calc);
+
+    for (int c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_input =
+          input + (roi_batch_ind * channels + c) * height * width;
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          T output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_input[pc.pos1] +
+                  pc.w2 * offset_input[pc.pos2] +
+                  pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
+
+              pre_calc_index += 1;
+            }
+          }
+          output_val /= count;
+
+          output[index] = output_val;
+        } // for pw
+      } // for ph
+    } // for c
+  } // for n
+}
+
+template <typename T>
+void ROIAlignRotatedBackward(
+    const int nthreads,
+    // may not be contiguous. should index using n_stride, etc
+    const T* grad_output,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* grad_input,
+    const T* rois,
+    const int n_stride,
+    const int c_stride,
+    const int h_stride,
+    const int w_stride) {
+  for (int index = 0; index < nthreads; index++) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    AT_ASSERTM(
+        roi_width >= 0 && roi_height >= 0,
+        "ROIs in ROIAlignRotated do not have non-negative size!");
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_grad_input =
+        grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+    int output_offset = n * n_stride + c * c_stride;
+    const T* offset_grad_output = grad_output + output_offset;
+    const T grad_output_this_bin =
+        offset_grad_output[ph * h_stride + pw * w_stride];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        // Rotate by theta around the center and translate
+        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+        T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+        T g1 = grad_output_this_bin * w1 / count;
+        T g2 = grad_output_this_bin * w2 / count;
+        T g3 = grad_output_this_bin * w3 / count;
+        T g4 = grad_output_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          // atomic add is not needed for now since it is single threaded
+          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
+          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
+          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
+          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // for
+} // ROIAlignRotatedBackward
+
+at::Tensor ROIAlignRotated_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio) {
+  AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_forward_cpu";
+  at::checkAllSameType(c, {input_t, rois_t});
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  at::Tensor output = at::zeros(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+  if (output.numel() == 0) {
+    return output;
+  }
+
+  auto input_ = input.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      input.scalar_type(), "ROIAlignRotated_forward", [&] {
+        ROIAlignRotatedForward<scalar_t>(
+            output_size,
+            input_.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            rois_.data_ptr<scalar_t>(),
+            output.data_ptr<scalar_t>());
+      });
+  return output;
+}
+
+at::Tensor ROIAlignRotated_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio) {
+  AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlignRotated_backward_cpu";
+  at::checkAllSameType(c, {grad_t, rois_t});
+
+  at::Tensor grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    return grad_input;
+  }
+
+  // get stride values to ensure indexing into gradients is correct.
+  int n_stride = grad.stride(0);
+  int c_stride = grad.stride(1);
+  int h_stride = grad.stride(2);
+  int w_stride = grad.stride(3);
+
+  auto rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      grad.scalar_type(), "ROIAlignRotated_forward", [&] {
+        ROIAlignRotatedBackward<scalar_t>(
+            grad.numel(),
+            grad.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            grad_input.data_ptr<scalar_t>(),
+            rois_.data_ptr<scalar_t>(),
+            n_stride,
+            c_stride,
+            h_stride,
+            w_stride);
+      });
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fca186519143b168a912c880a4cf495a0a5a9322
--- /dev/null
+++ b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
@@ -0,0 +1,443 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those ones is
+// we don't do "legacy offset" in this version, as there aren't many previous
+// works, if any, using the "legacy" ROIAlignRotated Op.
+// This would make the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+
+template <typename T>
+__device__ T bilinear_interpolate(
+    const T* input,
+    const int height,
+    const int width,
+    T y,
+    T x) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    return 0;
+  }
+
+  if (y < 0) {
+    y = 0;
+  }
+
+  if (x < 0) {
+    x = 0;
+  }
+
+  int y_low = (int)y;
+  int x_low = (int)x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  // do bilinear interpolation
+  T v1 = input[y_low * width + x_low];
+  T v2 = input[y_low * width + x_high];
+  T v3 = input[y_high * width + x_low];
+  T v4 = input[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  return val;
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y < 0) {
+    y = 0;
+  }
+
+  if (x < 0) {
+    x = 0;
+  }
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = input[y_low * width + x_low];
+  // T v2 = input[y_low * width + x_high];
+  // T v3 = input[y_high * width + x_low];
+  // T v4 = input[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+} // namespace
+
+template <typename T>
+__global__ void RoIAlignRotatedForward(
+    const int nthreads,
+    const T* input,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* rois,
+    T* top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_input =
+        input + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // We do average (inte  gral) pooling inside a bin
+    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        // Rotate by theta around the center and translate
+        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+        T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+        T val = bilinear_interpolate(offset_input, height, width, y, x);
+        output_val += val;
+      }
+    }
+    output_val /= count;
+
+    top_data[index] = output_val;
+  }
+}
+
+template <typename T>
+__global__ void RoIAlignRotatedBackwardFeature(
+    const int nthreads,
+    const T* top_diff,
+    const int num_rois,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* bottom_diff,
+    const T* rois) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        // Rotate by theta around the center and translate
+        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+        T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+        T g1 = top_diff_this_bin * w1 / count;
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignRotatedBackward
+
+at::Tensor ROIAlignRotated_forward_cuda(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio) {
+  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
+  at::checkAllSameGPU(c, {input_t, rois_t});
+  at::checkAllSameType(c, {input_t, rois_t});
+  at::cuda::CUDAGuard device_guard(input.device());
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  auto output = at::empty(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(
+          static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
+      static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  if (output.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return output;
+  }
+
+  auto input_ = input.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "ROIAlignRotated_forward", [&] {
+        RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
+            output_size,
+            input_.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            rois_.data_ptr<scalar_t>(),
+            output.data_ptr<scalar_t>());
+      });
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+  return output;
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+at::Tensor ROIAlignRotated_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio) {
+  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+  at::CheckedFrom c = "ROIAlign_backward_cuda";
+  at::checkAllSameGPU(c, {grad_t, rois_t});
+  at::checkAllSameType(c, {grad_t, rois_t});
+  at::cuda::CUDAGuard device_guard(grad.device());
+
+  auto num_rois = rois.size(0);
+  auto grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(
+          static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
+      static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_input;
+  }
+
+  auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES(
+      grad.scalar_type(), "ROIAlignRotated_backward", [&] {
+        RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+            grad.numel(),
+            grad_.data_ptr<scalar_t>(),
+            num_rois,
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            grad_input.data_ptr<scalar_t>(),
+            rois_.data_ptr<scalar_t>());
+      });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bf383b8ed9b358b5313d433a9682c294dfb77e4
--- /dev/null
+++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
@@ -0,0 +1,35 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor box_iou_rotated_cpu(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2);
+
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+at::Tensor box_iou_rotated_cuda(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2);
+#endif
+
+// Interface for Python
+// inline is needed to prevent multiple function definitions when this header is
+// included by different cpps
+inline at::Tensor box_iou_rotated(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2) {
+  assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
+  if (boxes1.device().is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+    return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous());
+#else
+    AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+  }
+
+  return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous());
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c843487b5fa4e8077dd27402ec99009266ddda8d
--- /dev/null
+++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
@@ -0,0 +1,39 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include "box_iou_rotated.h"
+#include "box_iou_rotated_utils.h"
+
+namespace detectron2 {
+
+template <typename T>
+void box_iou_rotated_cpu_kernel(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2,
+    at::Tensor& ious) {
+  auto num_boxes1 = boxes1.size(0);
+  auto num_boxes2 = boxes2.size(0);
+
+  for (int i = 0; i < num_boxes1; i++) {
+    for (int j = 0; j < num_boxes2; j++) {
+      ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
+          boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>());
+    }
+  }
+}
+
+at::Tensor box_iou_rotated_cpu(
+    // input must be contiguous:
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2) {
+  auto num_boxes1 = boxes1.size(0);
+  auto num_boxes2 = boxes2.size(0);
+  at::Tensor ious =
+      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
+
+  box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious);
+
+  // reshape from 1d array to 2d array
+  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
+  return ious.reshape(shape);
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..952710e53041187907fbd113f8d0d0fa24134a86
--- /dev/null
+++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
@@ -0,0 +1,130 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include "box_iou_rotated_utils.h"
+
+namespace detectron2 {
+
+// 2D block with 32 * 16 = 512 threads per block
+const int BLOCK_DIM_X = 32;
+const int BLOCK_DIM_Y = 16;
+
+template <typename T>
+__global__ void box_iou_rotated_cuda_kernel(
+    const int n_boxes1,
+    const int n_boxes2,
+    const T* dev_boxes1,
+    const T* dev_boxes2,
+    T* dev_ious) {
+  const int row_start = blockIdx.x * blockDim.x;
+  const int col_start = blockIdx.y * blockDim.y;
+
+  const int row_size = min(n_boxes1 - row_start, blockDim.x);
+  const int col_size = min(n_boxes2 - col_start, blockDim.y);
+
+  __shared__ float block_boxes1[BLOCK_DIM_X * 5];
+  __shared__ float block_boxes2[BLOCK_DIM_Y * 5];
+
+  // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
+  if (threadIdx.x < row_size && threadIdx.y == 0) {
+    block_boxes1[threadIdx.x * 5 + 0] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 0];
+    block_boxes1[threadIdx.x * 5 + 1] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 1];
+    block_boxes1[threadIdx.x * 5 + 2] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 2];
+    block_boxes1[threadIdx.x * 5 + 3] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 3];
+    block_boxes1[threadIdx.x * 5 + 4] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 4];
+  }
+
+  if (threadIdx.x < col_size && threadIdx.y == 0) {
+    block_boxes2[threadIdx.x * 5 + 0] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 0];
+    block_boxes2[threadIdx.x * 5 + 1] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 1];
+    block_boxes2[threadIdx.x * 5 + 2] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 2];
+    block_boxes2[threadIdx.x * 5 + 3] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 3];
+    block_boxes2[threadIdx.x * 5 + 4] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size && threadIdx.y < col_size) {
+    int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y;
+    dev_ious[offset] = single_box_iou_rotated<T>(
+        block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
+  }
+}
+
+at::Tensor box_iou_rotated_cuda(
+    // input must be contiguous
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2) {
+  using scalar_t = float;
+  AT_ASSERTM(
+      boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor");
+  AT_ASSERTM(
+      boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor");
+  AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
+  AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");
+  at::cuda::CUDAGuard device_guard(boxes1.device());
+
+  auto num_boxes1 = boxes1.size(0);
+  auto num_boxes2 = boxes2.size(0);
+
+  at::Tensor ious =
+      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
+
+  bool transpose = false;
+  if (num_boxes1 > 0 && num_boxes2 > 0) {
+    scalar_t *data1 = boxes1.data_ptr<scalar_t>(),
+             *data2 = boxes2.data_ptr<scalar_t>();
+
+    if (num_boxes2 > 65535 * BLOCK_DIM_Y) {
+      AT_ASSERTM(
+          num_boxes1 <= 65535 * BLOCK_DIM_Y,
+          "Too many boxes for box_iou_rotated_cuda!");
+      // x dim is allowed to be large, but y dim cannot,
+      // so we transpose the two to avoid "invalid configuration argument"
+      // error. We assume one of them is small. Otherwise the result is hard to
+      // fit in memory anyway.
+      std::swap(num_boxes1, num_boxes2);
+      std::swap(data1, data2);
+      transpose = true;
+    }
+
+    const int blocks_x =
+        at::cuda::ATenCeilDiv(static_cast<int>(num_boxes1), BLOCK_DIM_X);
+    const int blocks_y =
+        at::cuda::ATenCeilDiv(static_cast<int>(num_boxes2), BLOCK_DIM_Y);
+
+    dim3 blocks(blocks_x, blocks_y);
+    dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+    box_iou_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        num_boxes1,
+        num_boxes2,
+        data1,
+        data2,
+        (scalar_t*)ious.data_ptr<scalar_t>());
+
+    AT_CUDA_CHECK(cudaGetLastError());
+  }
+
+  // reshape from 1d array to 2d array
+  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
+  if (transpose) {
+    return ious.view(shape).t();
+  } else {
+    return ious.view(shape);
+  }
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc6967a76884a40581a94554e91e6e72c6f8b527
--- /dev/null
+++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
@@ -0,0 +1,391 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+
+#include <cassert>
+#include <cmath>
+
+#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
+// Designates functions callable from the host (CPU) and the device (GPU)
+#define HOST_DEVICE __host__ __device__
+#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
+#else
+#include <algorithm>
+#define HOST_DEVICE
+#define HOST_DEVICE_INLINE HOST_DEVICE inline
+#endif
+
+namespace detectron2 {
+
+namespace {
+
+template <typename T>
+struct RotatedBox {
+  T x_ctr, y_ctr, w, h, a;
+};
+
+template <typename T>
+struct Point {
+  T x, y;
+  HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
+  HOST_DEVICE_INLINE Point operator+(const Point& p) const {
+    return Point(x + p.x, y + p.y);
+  }
+  HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
+    x += p.x;
+    y += p.y;
+    return *this;
+  }
+  HOST_DEVICE_INLINE Point operator-(const Point& p) const {
+    return Point(x - p.x, y - p.y);
+  }
+  HOST_DEVICE_INLINE Point operator*(const T coeff) const {
+    return Point(x * coeff, y * coeff);
+  }
+};
+
+template <typename T>
+HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
+  return A.x * B.x + A.y * B.y;
+}
+
+// R: result type. can be different from input type
+template <typename T, typename R = T>
+HOST_DEVICE_INLINE R cross_2d(const Point<T>& A, const Point<T>& B) {
+  return static_cast<R>(A.x) * static_cast<R>(B.y) -
+      static_cast<R>(B.x) * static_cast<R>(A.y);
+}
+
+template <typename T>
+HOST_DEVICE_INLINE void get_rotated_vertices(
+    const RotatedBox<T>& box,
+    Point<T> (&pts)[4]) {
+  // M_PI / 180. == 0.01745329251
+  double theta = box.a * 0.01745329251;
+  T cosTheta2 = (T)cos(theta) * 0.5f;
+  T sinTheta2 = (T)sin(theta) * 0.5f;
+
+  // y: top --> down; x: left --> right
+  pts[0].x = box.x_ctr + sinTheta2 * box.h + cosTheta2 * box.w;
+  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
+  pts[1].x = box.x_ctr - sinTheta2 * box.h + cosTheta2 * box.w;
+  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
+  pts[2].x = 2 * box.x_ctr - pts[0].x;
+  pts[2].y = 2 * box.y_ctr - pts[0].y;
+  pts[3].x = 2 * box.x_ctr - pts[1].x;
+  pts[3].y = 2 * box.y_ctr - pts[1].y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int get_intersection_points(
+    const Point<T> (&pts1)[4],
+    const Point<T> (&pts2)[4],
+    Point<T> (&intersections)[24]) {
+  // Line vector
+  // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
+  Point<T> vec1[4], vec2[4];
+  for (int i = 0; i < 4; i++) {
+    vec1[i] = pts1[(i + 1) % 4] - pts1[i];
+    vec2[i] = pts2[(i + 1) % 4] - pts2[i];
+  }
+
+  // When computing the intersection area, it doesn't hurt if we have
+  // more (duplicated/approximate) intersections/vertices than needed,
+  // while it can cause drastic difference if we miss an intersection/vertex.
+  // Therefore, we add an epsilon to relax the comparisons between
+  // the float point numbers that decide the intersection points.
+  double EPS = 1e-5;
+
+  // Line test - test all line combos for intersection
+  int num = 0; // number of intersections
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      // Solve for 2x2 Ax=b
+      T det = cross_2d<T>(vec2[j], vec1[i]);
+
+      // This takes care of parallel lines
+      if (fabs(det) <= 1e-14) {
+        continue;
+      }
+
+      auto vec12 = pts2[j] - pts1[i];
+
+      T t1 = cross_2d<T>(vec2[j], vec12) / det;
+      T t2 = cross_2d<T>(vec1[i], vec12) / det;
+
+      if (t1 > -EPS && t1 < 1.0f + EPS && t2 > -EPS && t2 < 1.0f + EPS) {
+        intersections[num++] = pts1[i] + vec1[i] * t1;
+      }
+    }
+  }
+
+  // Check for vertices of rect1 inside rect2
+  {
+    const auto& AB = vec2[0];
+    const auto& DA = vec2[3];
+    auto ABdotAB = dot_2d<T>(AB, AB);
+    auto ADdotAD = dot_2d<T>(DA, DA);
+    for (int i = 0; i < 4; i++) {
+      // assume ABCD is the rectangle, and P is the point to be judged
+      // P is inside ABCD iff. P's projection on AB lies within AB
+      // and P's projection on AD lies within AD
+
+      auto AP = pts1[i] - pts2[0];
+
+      auto APdotAB = dot_2d<T>(AP, AB);
+      auto APdotAD = -dot_2d<T>(AP, DA);
+
+      if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
+          (APdotAD < ADdotAD + EPS)) {
+        intersections[num++] = pts1[i];
+      }
+    }
+  }
+
+  // Reverse the check - check for vertices of rect2 inside rect1
+  {
+    const auto& AB = vec1[0];
+    const auto& DA = vec1[3];
+    auto ABdotAB = dot_2d<T>(AB, AB);
+    auto ADdotAD = dot_2d<T>(DA, DA);
+    for (int i = 0; i < 4; i++) {
+      auto AP = pts2[i] - pts1[0];
+
+      auto APdotAB = dot_2d<T>(AP, AB);
+      auto APdotAD = -dot_2d<T>(AP, DA);
+
+      if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
+          (APdotAD < ADdotAD + EPS)) {
+        intersections[num++] = pts2[i];
+      }
+    }
+  }
+
+  return num;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int convex_hull_graham(
+    const Point<T> (&p)[24],
+    const int& num_in,
+    Point<T> (&q)[24],
+    bool shift_to_zero = false) {
+  assert(num_in >= 2);
+
+  // Step 1:
+  // Find point with minimum y
+  // if more than 1 points have the same minimum y,
+  // pick the one with the minimum x.
+  int t = 0;
+  for (int i = 1; i < num_in; i++) {
+    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
+      t = i;
+    }
+  }
+  auto& start = p[t]; // starting point
+
+  // Step 2:
+  // Subtract starting point from every points (for sorting in the next step)
+  for (int i = 0; i < num_in; i++) {
+    q[i] = p[i] - start;
+  }
+
+  // Swap the starting point to position 0
+  auto tmp = q[0];
+  q[0] = q[t];
+  q[t] = tmp;
+
+  // Step 3:
+  // Sort point 1 ~ num_in according to their relative cross-product values
+  // (essentially sorting according to angles)
+  // If the angles are the same, sort according to their distance to origin
+  T dist[24];
+#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
+  // compute distance to origin before sort, and sort them together with the
+  // points
+  for (int i = 0; i < num_in; i++) {
+    dist[i] = dot_2d<T>(q[i], q[i]);
+  }
+
+  // CUDA version
+  // In the future, we can potentially use thrust
+  // for sorting here to improve speed (though not guaranteed)
+  for (int i = 1; i < num_in - 1; i++) {
+    for (int j = i + 1; j < num_in; j++) {
+      T crossProduct = cross_2d<T>(q[i], q[j]);
+      if ((crossProduct < -1e-6) ||
+          (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
+        auto q_tmp = q[i];
+        q[i] = q[j];
+        q[j] = q_tmp;
+        auto dist_tmp = dist[i];
+        dist[i] = dist[j];
+        dist[j] = dist_tmp;
+      }
+    }
+  }
+#else
+  // CPU version
+  // std::sort(
+  //     q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
+  //       T temp = cross_2d<T>(A, B);
+
+  // if (fabs(temp) < 1e-6) {
+  //   return dot_2d<T>(A, A) < dot_2d<T>(B, B);
+  // } else {
+  //   return temp > 0;
+  // }
+  // });
+  for (int i = 0; i < num_in; i++) {
+    dist[i] = dot_2d<T>(q[i], q[i]);
+  }
+
+  for (int i = 1; i < num_in - 1; i++) {
+    for (int j = i + 1; j < num_in; j++) {
+      T crossProduct = cross_2d<T>(q[i], q[j]);
+      if ((crossProduct < -1e-6) ||
+          (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
+        auto q_tmp = q[i];
+        q[i] = q[j];
+        q[j] = q_tmp;
+        auto dist_tmp = dist[i];
+        dist[i] = dist[j];
+        dist[j] = dist_tmp;
+      }
+    }
+  }
+
+  // compute distance to origin after sort, since the points are now different.
+  for (int i = 0; i < num_in; i++) {
+    dist[i] = dot_2d<T>(q[i], q[i]);
+  }
+
+#endif
+
+  // Step 4:
+  // Make sure there are at least 2 points (that don't overlap with each other)
+  // in the stack
+  int k; // index of the non-overlapped second point
+  for (k = 1; k < num_in; k++) {
+    if (dist[k] > 1e-8) {
+      break;
+    }
+  }
+  if (k == num_in) {
+    // We reach the end, which means the convex hull is just one point
+    q[0] = p[t];
+    return 1;
+  }
+  q[1] = q[k];
+  int m = 2; // 2 points in the stack
+  // Step 5:
+  // Finally we can start the scanning process.
+  // When a non-convex relationship between the 3 points is found
+  // (either concave shape or duplicated points),
+  // we pop the previous point from the stack
+  // until the 3-point relationship is convex again, or
+  // until the stack only contains two points
+  for (int i = k + 1; i < num_in; i++) {
+    while (m > 1) {
+      auto q1 = q[i] - q[m - 2], q2 = q[m - 1] - q[m - 2];
+      // cross_2d() uses FMA and therefore computes round(round(q1.x*q2.y) -
+      // q2.x*q1.y) So it may not return 0 even when q1==q2. Therefore we
+      // compare round(q1.x*q2.y) and round(q2.x*q1.y) directly. (round means
+      // round to nearest floating point).
+      if (q1.x * q2.y >= q2.x * q1.y)
+        m--;
+      else
+        break;
+    }
+    // Using double also helps, but float can solve the issue for now.
+    // while (m > 1 && cross_2d<T, double>(q[i] - q[m - 2], q[m - 1] - q[m - 2])
+    // >= 0) {
+    //     m--;
+    // }
+    q[m++] = q[i];
+  }
+
+  // Step 6 (Optional):
+  // In general sense we need the original coordinates, so we
+  // need to shift the points back (reverting Step 2)
+  // But if we're only interested in getting the area/perimeter of the shape
+  // We can simply return.
+  if (!shift_to_zero) {
+    for (int i = 0; i < m; i++) {
+      q[i] += start;
+    }
+  }
+
+  return m;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
+  if (m <= 2) {
+    return 0;
+  }
+
+  T area = 0;
+  for (int i = 1; i < m - 1; i++) {
+    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
+  }
+
+  return area / 2.0;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T rotated_boxes_intersection(
+    const RotatedBox<T>& box1,
+    const RotatedBox<T>& box2) {
+  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
+  // from rotated_rect_intersection_pts
+  Point<T> intersectPts[24], orderedPts[24];
+
+  Point<T> pts1[4];
+  Point<T> pts2[4];
+  get_rotated_vertices<T>(box1, pts1);
+  get_rotated_vertices<T>(box2, pts2);
+
+  int num = get_intersection_points<T>(pts1, pts2, intersectPts);
+
+  if (num <= 2) {
+    return 0.0;
+  }
+
+  // Convex Hull to order the intersection points in clockwise order and find
+  // the contour area.
+  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
+  return polygon_area<T>(orderedPts, num_convex);
+}
+
+} // namespace
+
+template <typename T>
+HOST_DEVICE_INLINE T
+single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw) {
+  // shift center to the middle point to achieve higher precision in result
+  RotatedBox<T> box1, box2;
+  auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
+  auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
+  box1.x_ctr = box1_raw[0] - center_shift_x;
+  box1.y_ctr = box1_raw[1] - center_shift_y;
+  box1.w = box1_raw[2];
+  box1.h = box1_raw[3];
+  box1.a = box1_raw[4];
+  box2.x_ctr = box2_raw[0] - center_shift_x;
+  box2.y_ctr = box2_raw[1] - center_shift_y;
+  box2.w = box2_raw[2];
+  box2.h = box2_raw[3];
+  box2.a = box2_raw[4];
+
+  T area1 = box1.w * box1.h;
+  T area2 = box2.w * box2.h;
+  if (area1 < 1e-14 || area2 < 1e-14) {
+    return 0.f;
+  }
+
+  T intersection = rotated_boxes_intersection<T>(box1, box2);
+  T iou = intersection / (area1 + area2 - intersection);
+  return iou;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/cocoeval/cocoeval.cpp b/detectron2/layers/csrc/cocoeval/cocoeval.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a5b7b907c06720fefc77b0dfd921b8ec3ecf2be
--- /dev/null
+++ b/detectron2/layers/csrc/cocoeval/cocoeval.cpp
@@ -0,0 +1,507 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include "cocoeval.h"
+#include <time.h>
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+
+using namespace pybind11::literals;
+
+namespace detectron2 {
+
+namespace COCOeval {
+
+// Sort detections from highest score to lowest, such that
+// detection_instances[detection_sorted_indices[t]] >=
+// detection_instances[detection_sorted_indices[t+1]].  Use stable_sort to match
+// original COCO API
+void SortInstancesByDetectionScore(
+    const std::vector<InstanceAnnotation>& detection_instances,
+    std::vector<uint64_t>* detection_sorted_indices) {
+  detection_sorted_indices->resize(detection_instances.size());
+  std::iota(
+      detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
+  std::stable_sort(
+      detection_sorted_indices->begin(),
+      detection_sorted_indices->end(),
+      [&detection_instances](size_t j1, size_t j2) {
+        return detection_instances[j1].score > detection_instances[j2].score;
+      });
+}
+
+// Partition the ground truth objects based on whether or not to ignore them
+// based on area
+void SortInstancesByIgnore(
+    const std::array<double, 2>& area_range,
+    const std::vector<InstanceAnnotation>& ground_truth_instances,
+    std::vector<uint64_t>* ground_truth_sorted_indices,
+    std::vector<bool>* ignores) {
+  ignores->clear();
+  ignores->reserve(ground_truth_instances.size());
+  for (auto o : ground_truth_instances) {
+    ignores->push_back(
+        o.ignore || o.area < area_range[0] || o.area > area_range[1]);
+  }
+
+  ground_truth_sorted_indices->resize(ground_truth_instances.size());
+  std::iota(
+      ground_truth_sorted_indices->begin(),
+      ground_truth_sorted_indices->end(),
+      0);
+  std::stable_sort(
+      ground_truth_sorted_indices->begin(),
+      ground_truth_sorted_indices->end(),
+      [&ignores](size_t j1, size_t j2) {
+        return (int)(*ignores)[j1] < (int)(*ignores)[j2];
+      });
+}
+
+// For each IOU threshold, greedily match each detected instance to a ground
+// truth instance (if possible) and store the results
+void MatchDetectionsToGroundTruth(
+    const std::vector<InstanceAnnotation>& detection_instances,
+    const std::vector<uint64_t>& detection_sorted_indices,
+    const std::vector<InstanceAnnotation>& ground_truth_instances,
+    const std::vector<uint64_t>& ground_truth_sorted_indices,
+    const std::vector<bool>& ignores,
+    const std::vector<std::vector<double>>& ious,
+    const std::vector<double>& iou_thresholds,
+    const std::array<double, 2>& area_range,
+    ImageEvaluation* results) {
+  // Initialize memory to store return data matches and ignore
+  const int num_iou_thresholds = iou_thresholds.size();
+  const int num_ground_truth = ground_truth_sorted_indices.size();
+  const int num_detections = detection_sorted_indices.size();
+  std::vector<uint64_t> ground_truth_matches(
+      num_iou_thresholds * num_ground_truth, 0);
+  std::vector<uint64_t>& detection_matches = results->detection_matches;
+  std::vector<bool>& detection_ignores = results->detection_ignores;
+  std::vector<bool>& ground_truth_ignores = results->ground_truth_ignores;
+  detection_matches.resize(num_iou_thresholds * num_detections, 0);
+  detection_ignores.resize(num_iou_thresholds * num_detections, false);
+  ground_truth_ignores.resize(num_ground_truth);
+  for (auto g = 0; g < num_ground_truth; ++g) {
+    ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]];
+  }
+
+  for (auto t = 0; t < num_iou_thresholds; ++t) {
+    for (auto d = 0; d < num_detections; ++d) {
+      // information about best match so far (match=-1 -> unmatched)
+      double best_iou = std::min(iou_thresholds[t], 1 - 1e-10);
+      int match = -1;
+      for (auto g = 0; g < num_ground_truth; ++g) {
+        // if this ground truth instance is already matched and not a
+        // crowd, it cannot be matched to another detection
+        if (ground_truth_matches[t * num_ground_truth + g] > 0 &&
+            !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) {
+          continue;
+        }
+
+        // if detected instance matched to a regular ground truth
+        // instance, we can break on the first ground truth instance
+        // tagged as ignore (because they are sorted by the ignore tag)
+        if (match >= 0 && !ground_truth_ignores[match] &&
+            ground_truth_ignores[g]) {
+          break;
+        }
+
+        // if IOU overlap is the best so far, store the match appropriately
+        if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) {
+          best_iou = ious[d][ground_truth_sorted_indices[g]];
+          match = g;
+        }
+      }
+      // if match was made, store id of match for both detection and
+      // ground truth
+      if (match >= 0) {
+        detection_ignores[t * num_detections + d] = ground_truth_ignores[match];
+        detection_matches[t * num_detections + d] =
+            ground_truth_instances[ground_truth_sorted_indices[match]].id;
+        ground_truth_matches[t * num_ground_truth + match] =
+            detection_instances[detection_sorted_indices[d]].id;
+      }
+
+      // set unmatched detections outside of area range to ignore
+      const InstanceAnnotation& detection =
+          detection_instances[detection_sorted_indices[d]];
+      detection_ignores[t * num_detections + d] =
+          detection_ignores[t * num_detections + d] ||
+          (detection_matches[t * num_detections + d] == 0 &&
+           (detection.area < area_range[0] || detection.area > area_range[1]));
+    }
+  }
+
+  // store detection score results
+  results->detection_scores.resize(detection_sorted_indices.size());
+  for (size_t d = 0; d < detection_sorted_indices.size(); ++d) {
+    results->detection_scores[d] =
+        detection_instances[detection_sorted_indices[d]].score;
+  }
+}
+
+std::vector<ImageEvaluation> EvaluateImages(
+    const std::vector<std::array<double, 2>>& area_ranges,
+    int max_detections,
+    const std::vector<double>& iou_thresholds,
+    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_ground_truth_instances,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_detection_instances) {
+  const int num_area_ranges = area_ranges.size();
+  const int num_images = image_category_ground_truth_instances.size();
+  const int num_categories =
+      image_category_ious.size() > 0 ? image_category_ious[0].size() : 0;
+  std::vector<uint64_t> detection_sorted_indices;
+  std::vector<uint64_t> ground_truth_sorted_indices;
+  std::vector<bool> ignores;
+  std::vector<ImageEvaluation> results_all(
+      num_images * num_area_ranges * num_categories);
+
+  // Store results for each image, category, and area range combination. Results
+  // for each IOU threshold are packed into the same ImageEvaluation object
+  for (auto i = 0; i < num_images; ++i) {
+    for (auto c = 0; c < num_categories; ++c) {
+      const std::vector<InstanceAnnotation>& ground_truth_instances =
+          image_category_ground_truth_instances[i][c];
+      const std::vector<InstanceAnnotation>& detection_instances =
+          image_category_detection_instances[i][c];
+
+      SortInstancesByDetectionScore(
+          detection_instances, &detection_sorted_indices);
+      if ((int)detection_sorted_indices.size() > max_detections) {
+        detection_sorted_indices.resize(max_detections);
+      }
+
+      for (size_t a = 0; a < area_ranges.size(); ++a) {
+        SortInstancesByIgnore(
+            area_ranges[a],
+            ground_truth_instances,
+            &ground_truth_sorted_indices,
+            &ignores);
+
+        MatchDetectionsToGroundTruth(
+            detection_instances,
+            detection_sorted_indices,
+            ground_truth_instances,
+            ground_truth_sorted_indices,
+            ignores,
+            image_category_ious[i][c],
+            iou_thresholds,
+            area_ranges[a],
+            &results_all
+                [c * num_area_ranges * num_images + a * num_images + i]);
+      }
+    }
+  }
+
+  return results_all;
+}
+
+// Convert a python list to a vector
+template <typename T>
+std::vector<T> list_to_vec(const py::list& l) {
+  std::vector<T> v(py::len(l));
+  for (int i = 0; i < (int)py::len(l); ++i) {
+    v[i] = l[i].cast<T>();
+  }
+  return v;
+}
+
+// Helper function to Accumulate()
+// Considers the evaluation results applicable to a particular category, area
+// range, and max_detections parameter setting, which begin at
+// evaluations[evaluation_index].  Extracts a sorted list of length n of all
+// applicable detection instances concatenated across all images in the dataset,
+// which are represented by the outputs evaluation_indices, detection_scores,
+// image_detection_indices, and detection_sorted_indices--all of which are
+// length n. evaluation_indices[i] stores the applicable index into
+// evaluations[] for instance i, which has detection score detection_score[i],
+// and is the image_detection_indices[i]'th of the list of detections
+// for the image containing i.  detection_sorted_indices[] defines a sorted
+// permutation of the 3 other outputs
+int BuildSortedDetectionList(
+    const std::vector<ImageEvaluation>& evaluations,
+    const int64_t evaluation_index,
+    const int64_t num_images,
+    const int max_detections,
+    std::vector<uint64_t>* evaluation_indices,
+    std::vector<double>* detection_scores,
+    std::vector<uint64_t>* detection_sorted_indices,
+    std::vector<uint64_t>* image_detection_indices) {
+  assert(evaluations.size() >= evaluation_index + num_images);
+
+  // Extract a list of object instances of the applicable category, area
+  // range, and max detections requirements such that they can be sorted
+  image_detection_indices->clear();
+  evaluation_indices->clear();
+  detection_scores->clear();
+  image_detection_indices->reserve(num_images * max_detections);
+  evaluation_indices->reserve(num_images * max_detections);
+  detection_scores->reserve(num_images * max_detections);
+  int num_valid_ground_truth = 0;
+  for (auto i = 0; i < num_images; ++i) {
+    const ImageEvaluation& evaluation = evaluations[evaluation_index + i];
+
+    for (int d = 0;
+         d < (int)evaluation.detection_scores.size() && d < max_detections;
+         ++d) { // detected instances
+      evaluation_indices->push_back(evaluation_index + i);
+      image_detection_indices->push_back(d);
+      detection_scores->push_back(evaluation.detection_scores[d]);
+    }
+    for (auto ground_truth_ignore : evaluation.ground_truth_ignores) {
+      if (!ground_truth_ignore) {
+        ++num_valid_ground_truth;
+      }
+    }
+  }
+
+  // Sort detections by decreasing score, using stable sort to match
+  // python implementation
+  detection_sorted_indices->resize(detection_scores->size());
+  std::iota(
+      detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
+  std::stable_sort(
+      detection_sorted_indices->begin(),
+      detection_sorted_indices->end(),
+      [&detection_scores](size_t j1, size_t j2) {
+        return (*detection_scores)[j1] > (*detection_scores)[j2];
+      });
+
+  return num_valid_ground_truth;
+}
+
+// Helper function to Accumulate()
+// Compute a precision recall curve given a sorted list of detected instances
+// encoded in evaluations, evaluation_indices, detection_scores,
+// detection_sorted_indices, image_detection_indices (see
+// BuildSortedDetectionList()). Using vectors precisions and recalls
+// and temporary storage, output the results into precisions_out, recalls_out,
+// and scores_out, which are large buffers containing many precion/recall curves
+// for all possible parameter settings, with precisions_out_index and
+// recalls_out_index defining the applicable indices to store results.
+void ComputePrecisionRecallCurve(
+    const int64_t precisions_out_index,
+    const int64_t precisions_out_stride,
+    const int64_t recalls_out_index,
+    const std::vector<double>& recall_thresholds,
+    const int iou_threshold_index,
+    const int num_iou_thresholds,
+    const int num_valid_ground_truth,
+    const std::vector<ImageEvaluation>& evaluations,
+    const std::vector<uint64_t>& evaluation_indices,
+    const std::vector<double>& detection_scores,
+    const std::vector<uint64_t>& detection_sorted_indices,
+    const std::vector<uint64_t>& image_detection_indices,
+    std::vector<double>* precisions,
+    std::vector<double>* recalls,
+    std::vector<double>* precisions_out,
+    std::vector<double>* scores_out,
+    std::vector<double>* recalls_out) {
+  assert(recalls_out->size() > recalls_out_index);
+
+  // Compute precision/recall for each instance in the sorted list of detections
+  int64_t true_positives_sum = 0, false_positives_sum = 0;
+  precisions->clear();
+  recalls->clear();
+  precisions->reserve(detection_sorted_indices.size());
+  recalls->reserve(detection_sorted_indices.size());
+  assert(!evaluations.empty() || detection_sorted_indices.empty());
+  for (auto detection_sorted_index : detection_sorted_indices) {
+    const ImageEvaluation& evaluation =
+        evaluations[evaluation_indices[detection_sorted_index]];
+    const auto num_detections =
+        evaluation.detection_matches.size() / num_iou_thresholds;
+    const auto detection_index = iou_threshold_index * num_detections +
+        image_detection_indices[detection_sorted_index];
+    assert(evaluation.detection_matches.size() > detection_index);
+    assert(evaluation.detection_ignores.size() > detection_index);
+    const int64_t detection_match =
+        evaluation.detection_matches[detection_index];
+    const bool detection_ignores =
+        evaluation.detection_ignores[detection_index];
+    const auto true_positive = detection_match > 0 && !detection_ignores;
+    const auto false_positive = detection_match == 0 && !detection_ignores;
+    if (true_positive) {
+      ++true_positives_sum;
+    }
+    if (false_positive) {
+      ++false_positives_sum;
+    }
+
+    const double recall =
+        static_cast<double>(true_positives_sum) / num_valid_ground_truth;
+    recalls->push_back(recall);
+    const int64_t num_valid_detections =
+        true_positives_sum + false_positives_sum;
+    const double precision = num_valid_detections > 0
+        ? static_cast<double>(true_positives_sum) / num_valid_detections
+        : 0.0;
+    precisions->push_back(precision);
+  }
+
+  (*recalls_out)[recalls_out_index] = !recalls->empty() ? recalls->back() : 0;
+
+  for (int64_t i = static_cast<int64_t>(precisions->size()) - 1; i > 0; --i) {
+    if ((*precisions)[i] > (*precisions)[i - 1]) {
+      (*precisions)[i - 1] = (*precisions)[i];
+    }
+  }
+
+  // Sample the per instance precision/recall list at each recall threshold
+  for (size_t r = 0; r < recall_thresholds.size(); ++r) {
+    // first index in recalls >= recall_thresholds[r]
+    std::vector<double>::iterator low = std::lower_bound(
+        recalls->begin(), recalls->end(), recall_thresholds[r]);
+    size_t precisions_index = low - recalls->begin();
+
+    const auto results_ind = precisions_out_index + r * precisions_out_stride;
+    assert(results_ind < precisions_out->size());
+    assert(results_ind < scores_out->size());
+    if (precisions_index < precisions->size()) {
+      (*precisions_out)[results_ind] = (*precisions)[precisions_index];
+      (*scores_out)[results_ind] =
+          detection_scores[detection_sorted_indices[precisions_index]];
+    } else {
+      (*precisions_out)[results_ind] = 0;
+      (*scores_out)[results_ind] = 0;
+    }
+  }
+}
+py::dict Accumulate(
+    const py::object& params,
+    const std::vector<ImageEvaluation>& evaluations) {
+  const std::vector<double> recall_thresholds =
+      list_to_vec<double>(params.attr("recThrs"));
+  const std::vector<int> max_detections =
+      list_to_vec<int>(params.attr("maxDets"));
+  const int num_iou_thresholds = py::len(params.attr("iouThrs"));
+  const int num_recall_thresholds = py::len(params.attr("recThrs"));
+  const int num_categories = params.attr("useCats").cast<int>() == 1
+      ? py::len(params.attr("catIds"))
+      : 1;
+  const int num_area_ranges = py::len(params.attr("areaRng"));
+  const int num_max_detections = py::len(params.attr("maxDets"));
+  const int num_images = py::len(params.attr("imgIds"));
+
+  std::vector<double> precisions_out(
+      num_iou_thresholds * num_recall_thresholds * num_categories *
+          num_area_ranges * num_max_detections,
+      -1);
+  std::vector<double> recalls_out(
+      num_iou_thresholds * num_categories * num_area_ranges *
+          num_max_detections,
+      -1);
+  std::vector<double> scores_out(
+      num_iou_thresholds * num_recall_thresholds * num_categories *
+          num_area_ranges * num_max_detections,
+      -1);
+
+  // Consider the list of all detected instances in the entire dataset in one
+  // large list.  evaluation_indices, detection_scores,
+  // image_detection_indices, and detection_sorted_indices all have the same
+  // length as this list, such that each entry corresponds to one detected
+  // instance
+  std::vector<uint64_t> evaluation_indices; // indices into evaluations[]
+  std::vector<double> detection_scores; // detection scores of each instance
+  std::vector<uint64_t> detection_sorted_indices; // sorted indices of all
+                                                  // instances in the dataset
+  std::vector<uint64_t>
+      image_detection_indices; // indices into the list of detected instances in
+                               // the same image as each instance
+  std::vector<double> precisions, recalls;
+
+  for (auto c = 0; c < num_categories; ++c) {
+    for (auto a = 0; a < num_area_ranges; ++a) {
+      for (auto m = 0; m < num_max_detections; ++m) {
+        // The COCO PythonAPI assumes evaluations[] (the return value of
+        // COCOeval::EvaluateImages() is one long list storing results for each
+        // combination of category, area range, and image id, with categories in
+        // the outermost loop and images in the innermost loop.
+        const int64_t evaluations_index =
+            c * num_area_ranges * num_images + a * num_images;
+        int num_valid_ground_truth = BuildSortedDetectionList(
+            evaluations,
+            evaluations_index,
+            num_images,
+            max_detections[m],
+            &evaluation_indices,
+            &detection_scores,
+            &detection_sorted_indices,
+            &image_detection_indices);
+
+        if (num_valid_ground_truth == 0) {
+          continue;
+        }
+
+        for (auto t = 0; t < num_iou_thresholds; ++t) {
+          // recalls_out is a flattened vectors representing a
+          // num_iou_thresholds X num_categories X num_area_ranges X
+          // num_max_detections matrix
+          const int64_t recalls_out_index =
+              t * num_categories * num_area_ranges * num_max_detections +
+              c * num_area_ranges * num_max_detections +
+              a * num_max_detections + m;
+
+          // precisions_out and scores_out are flattened vectors
+          // representing a num_iou_thresholds X num_recall_thresholds X
+          // num_categories X num_area_ranges X num_max_detections matrix
+          const int64_t precisions_out_stride =
+              num_categories * num_area_ranges * num_max_detections;
+          const int64_t precisions_out_index = t * num_recall_thresholds *
+                  num_categories * num_area_ranges * num_max_detections +
+              c * num_area_ranges * num_max_detections +
+              a * num_max_detections + m;
+
+          ComputePrecisionRecallCurve(
+              precisions_out_index,
+              precisions_out_stride,
+              recalls_out_index,
+              recall_thresholds,
+              t,
+              num_iou_thresholds,
+              num_valid_ground_truth,
+              evaluations,
+              evaluation_indices,
+              detection_scores,
+              detection_sorted_indices,
+              image_detection_indices,
+              &precisions,
+              &recalls,
+              &precisions_out,
+              &scores_out,
+              &recalls_out);
+        }
+      }
+    }
+  }
+
+  time_t rawtime;
+  struct tm local_time;
+  std::array<char, 200> buffer;
+  time(&rawtime);
+#ifdef _WIN32
+  localtime_s(&local_time, &rawtime);
+#else
+  localtime_r(&rawtime, &local_time);
+#endif
+  strftime(
+      buffer.data(), 200, "%Y-%m-%d %H:%num_max_detections:%S", &local_time);
+  return py::dict(
+      "params"_a = params,
+      "counts"_a = std::vector<int64_t>(
+          {num_iou_thresholds,
+           num_recall_thresholds,
+           num_categories,
+           num_area_ranges,
+           num_max_detections}),
+      "date"_a = buffer,
+      "precision"_a = precisions_out,
+      "recall"_a = recalls_out,
+      "scores"_a = scores_out);
+}
+
+} // namespace COCOeval
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/cocoeval/cocoeval.h b/detectron2/layers/csrc/cocoeval/cocoeval.h
new file mode 100644
index 0000000000000000000000000000000000000000..db246e49a026b7cd989b305f4d3d98100be3c912
--- /dev/null
+++ b/detectron2/layers/csrc/cocoeval/cocoeval.h
@@ -0,0 +1,88 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+#include <vector>
+
+namespace py = pybind11;
+
+namespace detectron2 {
+
+namespace COCOeval {
+
+// Annotation data for a single object instance in an image
+struct InstanceAnnotation {
+  InstanceAnnotation(
+      uint64_t id,
+      double score,
+      double area,
+      bool is_crowd,
+      bool ignore)
+      : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
+  uint64_t id;
+  double score = 0.;
+  double area = 0.;
+  bool is_crowd = false;
+  bool ignore = false;
+};
+
+// Stores intermediate results for evaluating detection results for a single
+// image that has D detected instances and G ground truth instances. This stores
+// matches between detected and ground truth instances
+struct ImageEvaluation {
+  // For each of the D detected instances, the id of the matched ground truth
+  // instance, or 0 if unmatched
+  std::vector<uint64_t> detection_matches;
+
+  // The detection score of each of the D detected instances
+  std::vector<double> detection_scores;
+
+  // Marks whether or not each of G instances was ignored from evaluation (e.g.,
+  // because it's outside area_range)
+  std::vector<bool> ground_truth_ignores;
+
+  // Marks whether or not each of D instances was ignored from evaluation (e.g.,
+  // because it's outside aRng)
+  std::vector<bool> detection_ignores;
+};
+
+template <class T>
+using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;
+
+// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg().  For each
+// combination of image, category, area range settings, and IOU thresholds to
+// evaluate, it matches detected instances to ground truth instances and stores
+// the results into a vector of ImageEvaluation results, which will be
+// interpreted by the COCOeval::Accumulate() function to produce precion-recall
+// curves.  The parameters of nested vectors have the following semantics:
+//   image_category_ious[i][c][d][g] is the intersection over union of the d'th
+//     detected instance and g'th ground truth instance of
+//     category category_ids[c] in image image_ids[i]
+//   image_category_ground_truth_instances[i][c] is a vector of ground truth
+//     instances in image image_ids[i] of category category_ids[c]
+//   image_category_detection_instances[i][c] is a vector of detected
+//     instances in image image_ids[i] of category category_ids[c]
+std::vector<ImageEvaluation> EvaluateImages(
+    const std::vector<std::array<double, 2>>& area_ranges, // vector of 2-tuples
+    int max_detections,
+    const std::vector<double>& iou_thresholds,
+    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_ground_truth_instances,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_detection_instances);
+
+// C++ implementation of COCOeval.accumulate(), which generates precision
+// recall curves for each set of category, IOU threshold, detection area range,
+// and max number of detections parameters.  It is assumed that the parameter
+// evaluations is the return value of the functon COCOeval::EvaluateImages(),
+// which was called with the same parameter settings params
+py::dict Accumulate(
+    const py::object& params,
+    const std::vector<ImageEvaluation>& evalutations);
+
+} // namespace COCOeval
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/cuda_version.cu b/detectron2/layers/csrc/cuda_version.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b74fddab798485a0a9b14b028289f7ec35044836
--- /dev/null
+++ b/detectron2/layers/csrc/cuda_version.cu
@@ -0,0 +1,26 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+
+#include <cuda_runtime_api.h>
+
+namespace detectron2 {
+int get_cudart_version() {
+// Not a ROCM platform: Either HIP is not used, or
+// it is used, but platform is not ROCM (i.e. it is CUDA)
+#if !defined(__HIP_PLATFORM_AMD__)
+  return CUDART_VERSION;
+#else
+  int version = 0;
+
+#if HIP_VERSION_MAJOR != 0
+  // Create a convention similar to that of CUDA, as assumed by other
+  // parts of the code.
+
+  version = HIP_VERSION_MINOR;
+  version += (HIP_VERSION_MAJOR * 100);
+#else
+  hipRuntimeGetVersion(&version);
+#endif
+  return version;
+#endif
+}
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/deformable/deform_conv.h b/detectron2/layers/csrc/deformable/deform_conv.h
new file mode 100644
index 0000000000000000000000000000000000000000..965c1bfd47b58f9802d1c3fd69a5962517b2da61
--- /dev/null
+++ b/detectron2/layers/csrc/deformable/deform_conv.h
@@ -0,0 +1,377 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+int deform_conv_forward_cuda(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor offset,
+    at::Tensor output,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step);
+
+int deform_conv_backward_input_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradInput,
+    at::Tensor gradOffset,
+    at::Tensor weight,
+    at::Tensor columns,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step);
+
+int deform_conv_backward_parameters_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    float scale,
+    int im2col_step);
+
+void modulated_deform_conv_cuda_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor output,
+    at::Tensor columns,
+    int kernel_h,
+    int kernel_w,
+    const int stride_h,
+    const int stride_w,
+    const int pad_h,
+    const int pad_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int group,
+    const int deformable_group,
+    const bool with_bias);
+
+void modulated_deform_conv_cuda_backward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor columns,
+    at::Tensor grad_input,
+    at::Tensor grad_weight,
+    at::Tensor grad_bias,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask,
+    at::Tensor grad_output,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_h,
+    int pad_w,
+    int dilation_h,
+    int dilation_w,
+    int group,
+    int deformable_group,
+    const bool with_bias);
+
+#endif
+
+inline int deform_conv_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor offset,
+    at::Tensor output,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  if (input.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+    TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return deform_conv_forward_cuda(
+        input,
+        weight,
+        offset,
+        output,
+        columns,
+        ones,
+        kW,
+        kH,
+        dW,
+        dH,
+        padW,
+        padH,
+        dilationW,
+        dilationH,
+        group,
+        deformable_group,
+        im2col_step);
+#else
+    AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+  }
+  AT_ERROR("This operator is not implemented on CPU");
+}
+
+inline int deform_conv_backward_input(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradInput,
+    at::Tensor gradOffset,
+    at::Tensor weight,
+    at::Tensor columns,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  if (gradOutput.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+    TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
+    TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return deform_conv_backward_input_cuda(
+        input,
+        offset,
+        gradOutput,
+        gradInput,
+        gradOffset,
+        weight,
+        columns,
+        kW,
+        kH,
+        dW,
+        dH,
+        padW,
+        padH,
+        dilationW,
+        dilationH,
+        group,
+        deformable_group,
+        im2col_step);
+#else
+    AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+  }
+  AT_ERROR("This operator is not implemented on CPU");
+}
+
+inline int deform_conv_backward_filter(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    float scale,
+    int im2col_step) {
+  if (gradOutput.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+    TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return deform_conv_backward_parameters_cuda(
+        input,
+        offset,
+        gradOutput,
+        gradWeight,
+        columns,
+        ones,
+        kW,
+        kH,
+        dW,
+        dH,
+        padW,
+        padH,
+        dilationW,
+        dilationH,
+        group,
+        deformable_group,
+        scale,
+        im2col_step);
+#else
+    AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+  }
+  AT_ERROR("This operator is not implemented on CPU");
+}
+
+inline void modulated_deform_conv_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor output,
+    at::Tensor columns,
+    int kernel_h,
+    int kernel_w,
+    const int stride_h,
+    const int stride_w,
+    const int pad_h,
+    const int pad_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int group,
+    const int deformable_group,
+    const bool with_bias) {
+  if (input.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+    TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
+    TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return modulated_deform_conv_cuda_forward(
+        input,
+        weight,
+        bias,
+        ones,
+        offset,
+        mask,
+        output,
+        columns,
+        kernel_h,
+        kernel_w,
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        dilation_h,
+        dilation_w,
+        group,
+        deformable_group,
+        with_bias);
+#else
+    AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+  }
+  AT_ERROR("This operator is not implemented on CPU");
+}
+
+inline void modulated_deform_conv_backward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor columns,
+    at::Tensor grad_input,
+    at::Tensor grad_weight,
+    at::Tensor grad_bias,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask,
+    at::Tensor grad_output,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_h,
+    int pad_w,
+    int dilation_h,
+    int dilation_w,
+    int group,
+    int deformable_group,
+    const bool with_bias) {
+  if (grad_output.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+    TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
+    TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
+    TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return modulated_deform_conv_cuda_backward(
+        input,
+        weight,
+        bias,
+        ones,
+        offset,
+        mask,
+        columns,
+        grad_input,
+        grad_weight,
+        grad_bias,
+        grad_offset,
+        grad_mask,
+        grad_output,
+        kernel_h,
+        kernel_w,
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        dilation_h,
+        dilation_w,
+        group,
+        deformable_group,
+        with_bias);
+#else
+    AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+  }
+  AT_ERROR("This operator is not implemented on CPU");
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/deformable/deform_conv_cuda.cu b/detectron2/layers/csrc/deformable/deform_conv_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2072bb856ec40b61c3826cead2fb7bb7c971a089
--- /dev/null
+++ b/detectron2/layers/csrc/deformable/deform_conv_cuda.cu
@@ -0,0 +1,1223 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+
+// modified from
+// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
+// Original license: Apache 2.0
+
+// modify from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
+// Original license: Apache 2.0
+
+#include <torch/types.h>
+
+#include "deform_conv.h"
+
+#include <cmath>
+#include <vector>
+
+namespace detectron2 {
+
+void deformable_im2col(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor data_col);
+
+void deformable_col2im(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_im);
+
+void deformable_col2im_coord(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_offset);
+
+void modulated_deformable_im2col_cuda(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kenerl_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor data_col);
+
+void modulated_deformable_col2im_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kenerl_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_im);
+
+void modulated_deformable_col2im_coord_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kenerl_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask);
+
+void shape_check(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor* gradOutput,
+    at::Tensor weight,
+    int kH,
+    int kW,
+    int dH,
+    int dW,
+    int padH,
+    int padW,
+    int dilationH,
+    int dilationW,
+    int group,
+    int deformable_group) {
+  TORCH_CHECK(
+      weight.ndimension() == 4,
+      "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+      "but got: %s",
+      weight.ndimension());
+
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  TORCH_CHECK(
+      kW > 0 && kH > 0,
+      "kernel size should be greater than zero, but got kH: %d kW: %d",
+      kH,
+      kW);
+
+  TORCH_CHECK(
+      (weight.size(2) == kH && weight.size(3) == kW),
+      "kernel size should be consistent with weight, ",
+      "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d",
+      kH,
+      kW,
+      weight.size(2),
+      weight.size(3));
+
+  TORCH_CHECK(
+      dW > 0 && dH > 0,
+      "stride should be greater than zero, but got dH: %d dW: %d",
+      dH,
+      dW);
+
+  TORCH_CHECK(
+      dilationW > 0 && dilationH > 0,
+      "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
+      dilationH,
+      dilationW);
+
+  int ndim = input.ndimension();
+  int dimf = 0;
+  int dimh = 1;
+  int dimw = 2;
+
+  if (ndim == 4) {
+    dimf++;
+    dimh++;
+    dimw++;
+  }
+
+  TORCH_CHECK(
+      ndim == 3 || ndim == 4,
+      "3D or 4D input tensor expected but got: %s",
+      ndim);
+
+  long nInputPlane = weight.size(1) * group;
+  long inputHeight = input.size(dimh);
+  long inputWidth = input.size(dimw);
+  long nOutputPlane = weight.size(0);
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+  TORCH_CHECK(
+      nInputPlane % deformable_group == 0,
+      "input channels must divide deformable group size");
+
+  if (outputWidth < 1 || outputHeight < 1)
+    AT_ERROR(
+        "Given input size: (%ld x %ld x %ld). "
+        "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        nOutputPlane,
+        outputHeight,
+        outputWidth);
+
+  TORCH_CHECK(
+      input.size(1) == nInputPlane,
+      "invalid number of input planes, expected: %d, but got: %d",
+      nInputPlane,
+      input.size(1));
+
+  TORCH_CHECK(
+      (inputHeight + 2 * padH >= kH && inputWidth + 2 * padW >= kW),
+      "input image is smaller than kernel");
+
+  TORCH_CHECK(
+      (offset.size(2) == outputHeight && offset.size(3) == outputWidth),
+      "invalid spatial size of offset, expected height: %d width: %d, but "
+      "got height: %d width: %d",
+      outputHeight,
+      outputWidth,
+      offset.size(2),
+      offset.size(3));
+
+  TORCH_CHECK(
+      (offset.size(1) == deformable_group * 2 * kH * kW),
+      "invalid number of channels of offset");
+
+  if (gradOutput != NULL) {
+    TORCH_CHECK(
+        gradOutput->size(dimf) == nOutputPlane,
+        "invalid number of gradOutput planes, expected: %d, but got: %d",
+        nOutputPlane,
+        gradOutput->size(dimf));
+
+    TORCH_CHECK(
+        (gradOutput->size(dimh) == outputHeight &&
+         gradOutput->size(dimw) == outputWidth),
+        "invalid size of gradOutput, expected height: %d width: %d , but "
+        "got height: %d width: %d",
+        outputHeight,
+        outputWidth,
+        gradOutput->size(dimh),
+        gradOutput->size(dimw));
+  }
+}
+
+int deform_conv_forward_cuda(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor offset,
+    at::Tensor output,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  // todo: resize columns to include im2col: done
+  // todo: add im2col_step as input
+  // todo: add new output buffer and transpose it to output (or directly
+  // transpose output) todo: possibly change data indexing because of
+  // parallel_imgs
+
+  shape_check(
+      input,
+      offset,
+      NULL,
+      weight,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      dilationH,
+      dilationW,
+      group,
+      deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  weight = weight.contiguous();
+
+  int batch = 1;
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input.unsqueeze_(0);
+    offset.unsqueeze_(0);
+  }
+
+  // todo: assert batchsize dividable by im2col_step
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+  output = output.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       nOutputPlane,
+       outputHeight,
+       outputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
+    ones = at::ones({outputHeight, outputWidth}, input.options());
+  }
+
+  input = input.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       nInputPlane,
+       inputHeight,
+       inputWidth});
+  offset = offset.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       deformable_group * 2 * kH * kW,
+       outputHeight,
+       outputWidth});
+
+  at::Tensor output_buffer = at::zeros(
+      {batchSize / im2col_step,
+       nOutputPlane,
+       im2col_step * outputHeight,
+       outputWidth},
+      output.options());
+
+  output_buffer = output_buffer.view(
+      {output_buffer.size(0),
+       group,
+       output_buffer.size(1) / group,
+       output_buffer.size(2),
+       output_buffer.size(3)});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    deformable_im2col(
+        input[elt],
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        columns);
+
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view(
+        {group,
+         weight.size(0) / group,
+         weight.size(1),
+         weight.size(2),
+         weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      output_buffer[elt][g] = output_buffer[elt][g]
+                                  .flatten(1)
+                                  .addmm_(weight[g].flatten(1), columns[g])
+                                  .view_as(output_buffer[elt][g]);
+    }
+  }
+
+  output_buffer = output_buffer.view(
+      {output_buffer.size(0),
+       output_buffer.size(1) * output_buffer.size(2),
+       output_buffer.size(3),
+       output_buffer.size(4)});
+
+  output_buffer = output_buffer.view(
+      {batchSize / im2col_step,
+       nOutputPlane,
+       im2col_step,
+       outputHeight,
+       outputWidth});
+  output_buffer.transpose_(1, 2);
+  output.copy_(output_buffer);
+  output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    output = output.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+  }
+
+  return 1;
+}
+
+int deform_conv_backward_input_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradInput,
+    at::Tensor gradOffset,
+    at::Tensor weight,
+    at::Tensor columns,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  shape_check(
+      input,
+      offset,
+      &gradOutput,
+      weight,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      dilationH,
+      dilationW,
+      group,
+      deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+  weight = weight.contiguous();
+
+  int batch = 1;
+
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input = input.view({1, input.size(0), input.size(1), input.size(2)});
+    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  // change order of grad output
+  gradOutput = gradOutput.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       nOutputPlane,
+       outputHeight,
+       outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  gradInput = gradInput.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       nInputPlane,
+       inputHeight,
+       inputWidth});
+  input = input.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       nInputPlane,
+       inputHeight,
+       inputWidth});
+  gradOffset = gradOffset.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       deformable_group * 2 * kH * kW,
+       outputHeight,
+       outputWidth});
+  offset = offset.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       deformable_group * 2 * kH * kW,
+       outputHeight,
+       outputWidth});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    // divide into groups
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view(
+        {group,
+         weight.size(0) / group,
+         weight.size(1),
+         weight.size(2),
+         weight.size(3)});
+    gradOutput = gradOutput.view(
+        {gradOutput.size(0),
+         group,
+         gradOutput.size(1) / group,
+         gradOutput.size(2),
+         gradOutput.size(3),
+         gradOutput.size(4)});
+
+    for (int g = 0; g < group; g++) {
+      columns[g] = columns[g].addmm_(
+          weight[g].flatten(1).transpose(0, 1),
+          gradOutput[elt][g].flatten(1),
+          0.0f,
+          1.0f);
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    gradOutput = gradOutput.view(
+        {gradOutput.size(0),
+         gradOutput.size(1) * gradOutput.size(2),
+         gradOutput.size(3),
+         gradOutput.size(4),
+         gradOutput.size(5)});
+
+    deformable_col2im_coord(
+        columns,
+        input[elt],
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        gradOffset[elt]);
+
+    deformable_col2im(
+        columns,
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        gradInput[elt]);
+  }
+
+  gradOutput.transpose_(1, 2);
+  gradOutput =
+      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  gradOffset = gradOffset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+    gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
+    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+    gradOffset =
+        gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
+  }
+
+  return 1;
+}
+
+int deform_conv_backward_parameters_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    float scale,
+    int im2col_step) {
+  // todo: transpose and reshape outGrad
+  // todo: reshape columns
+  // todo: add im2col_step as input
+
+  shape_check(
+      input,
+      offset,
+      &gradOutput,
+      gradWeight,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      dilationH,
+      dilationW,
+      group,
+      deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+
+  int batch = 1;
+
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input = input.view(
+        at::IntList({1, input.size(0), input.size(1), input.size(2)}));
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = gradWeight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  gradOutput = gradOutput.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       nOutputPlane,
+       outputHeight,
+       outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
+  gradOutputBuffer = gradOutputBuffer.view(
+      {batchSize / im2col_step,
+       nOutputPlane,
+       im2col_step,
+       outputHeight,
+       outputWidth});
+  gradOutputBuffer.copy_(gradOutput);
+  // gradOutput is not contiguous, so we do reshape (instead of view) next
+  gradOutputBuffer = gradOutputBuffer.reshape(
+      {batchSize / im2col_step,
+       nOutputPlane,
+       im2col_step * outputHeight,
+       outputWidth});
+
+  gradOutput.transpose_(1, 2);
+  gradOutput =
+      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  input = input.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       nInputPlane,
+       inputHeight,
+       inputWidth});
+  offset = offset.view(
+      {batchSize / im2col_step,
+       im2col_step,
+       deformable_group * 2 * kH * kW,
+       outputHeight,
+       outputWidth});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    deformable_im2col(
+        input[elt],
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        columns);
+
+    // divide into group
+    gradOutputBuffer = gradOutputBuffer.view(
+        {gradOutputBuffer.size(0),
+         group,
+         gradOutputBuffer.size(1) / group,
+         gradOutputBuffer.size(2),
+         gradOutputBuffer.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    gradWeight = gradWeight.view(
+        {group,
+         gradWeight.size(0) / group,
+         gradWeight.size(1),
+         gradWeight.size(2),
+         gradWeight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      gradWeight[g] = gradWeight[g]
+                          .flatten(1)
+                          .addmm_(
+                              gradOutputBuffer[elt][g].flatten(1),
+                              columns[g].transpose(1, 0),
+                              1.0,
+                              scale)
+                          .view_as(gradWeight[g]);
+    }
+    gradOutputBuffer = gradOutputBuffer.view(
+        {gradOutputBuffer.size(0),
+         gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
+         gradOutputBuffer.size(3),
+         gradOutputBuffer.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    gradWeight = gradWeight.view(
+        {gradWeight.size(0) * gradWeight.size(1),
+         gradWeight.size(2),
+         gradWeight.size(3),
+         gradWeight.size(4)});
+  }
+
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+  }
+
+  return 1;
+}
+
+void modulated_deform_conv_cuda_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor output,
+    at::Tensor columns,
+    int kernel_h,
+    int kernel_w,
+    const int stride_h,
+    const int stride_w,
+    const int pad_h,
+    const int pad_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int group,
+    const int deformable_group,
+    const bool with_bias) {
+  shape_check(
+      input,
+      offset,
+      NULL,
+      weight,
+      kernel_h,
+      kernel_w,
+      stride_h,
+      stride_w,
+      pad_h,
+      pad_w,
+      dilation_h,
+      dilation_w,
+      group,
+      deformable_group);
+
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_out = weight.size(0);
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+
+  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+    AT_ERROR(
+        "Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
+        kernel_h_,
+        kernel_w,
+        kernel_h_,
+        kernel_w_);
+  if (channels != channels_kernel * group)
+    AT_ERROR(
+        "Input shape and kernel channels wont match: (%d vs %d).",
+        channels,
+        channels_kernel * group);
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  // mask shape check
+  TORCH_CHECK(
+      (mask.size(2) == height_out && mask.size(3) == width_out),
+      "invalid spatial size of mask, expected height: %d width: %d, but "
+      "got height: %d width: %d",
+      height_out,
+      width_out,
+      mask.size(2),
+      mask.size(3));
+
+  TORCH_CHECK(
+      (mask.size(1) == deformable_group * kernel_h * kernel_w),
+      "invalid number of channels of mask");
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  // resize output
+  output = output.view({batch, channels_out, height_out, width_out}).zero_();
+  // resize temporary columns
+  columns = at::zeros(
+      {channels * kernel_h * kernel_w, 1 * height_out * width_out},
+      input.options());
+
+  output = output.view(
+      {output.size(0),
+       group,
+       output.size(1) / group,
+       output.size(2),
+       output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    modulated_deformable_im2col_cuda(
+        input[b],
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        columns);
+
+    // divide into group
+    weight = weight.view(
+        {group,
+         weight.size(0) / group,
+         weight.size(1),
+         weight.size(2),
+         weight.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+
+    for (int g = 0; g < group; g++) {
+      output[b][g] = output[b][g]
+                         .flatten(1)
+                         .addmm_(weight[g].flatten(1), columns[g])
+                         .view_as(output[b][g]);
+    }
+
+    weight = weight.view(
+        {weight.size(0) * weight.size(1),
+         weight.size(2),
+         weight.size(3),
+         weight.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+  }
+
+  output = output.view(
+      {output.size(0),
+       output.size(1) * output.size(2),
+       output.size(3),
+       output.size(4)});
+
+  if (with_bias) {
+    output += bias.view({1, bias.size(0), 1, 1});
+  }
+}
+
+void modulated_deform_conv_cuda_backward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor columns,
+    at::Tensor grad_input,
+    at::Tensor grad_weight,
+    at::Tensor grad_bias,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask,
+    at::Tensor grad_output,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_h,
+    int pad_w,
+    int dilation_h,
+    int dilation_w,
+    int group,
+    int deformable_group,
+    const bool with_bias) {
+  shape_check(
+      input,
+      offset,
+      &grad_output,
+      weight,
+      kernel_h,
+      kernel_w,
+      stride_h,
+      stride_w,
+      pad_h,
+      pad_w,
+      dilation_h,
+      dilation_w,
+      group,
+      deformable_group);
+
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+    AT_ERROR(
+        "Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
+        kernel_h_,
+        kernel_w,
+        kernel_h_,
+        kernel_w_);
+  if (channels != channels_kernel * group)
+    AT_ERROR(
+        "Input shape and kernel channels wont match: (%d vs %d).",
+        channels,
+        channels_kernel * group);
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  // mask shape check
+  TORCH_CHECK(
+      (mask.size(2) == height_out && mask.size(3) == width_out),
+      "invalid spatial size of mask, expected height: %d width: %d, but "
+      "got height: %d width: %d",
+      height_out,
+      width_out,
+      mask.size(2),
+      mask.size(3));
+
+  TORCH_CHECK(
+      (mask.size(1) == deformable_group * kernel_h * kernel_w),
+      "invalid number of channels of mask");
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  grad_input = grad_input.view({batch, channels, height, width});
+  columns = at::zeros(
+      {channels * kernel_h * kernel_w, height_out * width_out},
+      input.options());
+
+  grad_output = grad_output.view(
+      {grad_output.size(0),
+       group,
+       grad_output.size(1) / group,
+       grad_output.size(2),
+       grad_output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    // divide int group
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view(
+        {group,
+         weight.size(0) / group,
+         weight.size(1),
+         weight.size(2),
+         weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      columns[g].addmm_(
+          weight[g].flatten(1).transpose(0, 1),
+          grad_output[b][g].flatten(1),
+          0.0f,
+          1.0f);
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view(
+        {weight.size(0) * weight.size(1),
+         weight.size(2),
+         weight.size(3),
+         weight.size(4)});
+
+    // gradient w.r.t. input coordinate data
+    modulated_deformable_col2im_coord_cuda(
+        columns,
+        input[b],
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        grad_offset[b],
+        grad_mask[b]);
+    // gradient w.r.t. input data
+    modulated_deformable_col2im_cuda(
+        columns,
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        grad_input[b]);
+
+    // gradient w.r.t. weight, dWeight should accumulate across the batch and
+    // group
+    modulated_deformable_im2col_cuda(
+        input[b],
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        columns);
+
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    grad_weight = grad_weight.view(
+        {group,
+         grad_weight.size(0) / group,
+         grad_weight.size(1),
+         grad_weight.size(2),
+         grad_weight.size(3)});
+    if (with_bias)
+      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
+
+    for (int g = 0; g < group; g++) {
+      grad_weight[g] =
+          grad_weight[g]
+              .flatten(1)
+              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
+              .view_as(grad_weight[g]);
+      if (with_bias) {
+        grad_bias[g] =
+            grad_bias[g]
+                .view({-1, 1})
+                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
+                .view(-1);
+      }
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    grad_weight = grad_weight.view(
+        {grad_weight.size(0) * grad_weight.size(1),
+         grad_weight.size(2),
+         grad_weight.size(3),
+         grad_weight.size(4)});
+    if (with_bias)
+      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
+  }
+  grad_output = grad_output.view(
+      {grad_output.size(0) * grad_output.size(1),
+       grad_output.size(2),
+       grad_output.size(3),
+       grad_output.size(4)});
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu b/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f299c7add116685e9c87a187a85ea63f9f808867
--- /dev/null
+++ b/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
@@ -0,0 +1,1288 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+
+// modified from
+// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+// Original license: Apache 2.0
+// clang-format off
+
+// modify from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer *****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer *********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <THC/THCAtomics.cuh>
+
+using namespace at;
+
+#define CUDA_KERNEL_LOOP(i, n)                                 \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+
+namespace {
+
+const int CUDA_NUM_THREADS = 1024;
+const int kMaxGridNum = 65535;
+
+inline int GET_BLOCKS(const int N) {
+  return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
+}
+
+}
+
+template <typename scalar_t>
+__device__ scalar_t deformable_im2col_bilinear(
+    const scalar_t* bottom_data,
+    const int data_width,
+    const int height,
+    const int width,
+    scalar_t h,
+    scalar_t w) {
+  int h_low = floor(h);
+  int w_low = floor(w);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  scalar_t lh = h - h_low;
+  scalar_t lw = w - w_low;
+  scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+    v1 = bottom_data[h_low * data_width + w_low];
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+    v2 = bottom_data[h_low * data_width + w_high];
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+    v3 = bottom_data[h_high * data_width + w_low];
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+    v4 = bottom_data[h_high * data_width + w_high];
+
+  scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+  scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <typename scalar_t>
+__device__ scalar_t get_gradient_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int h,
+    const int w,
+    const int height,
+    const int width) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+  if (h == argmax_h_low && w == argmax_w_low)
+    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+  if (h == argmax_h_low && w == argmax_w_high)
+    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+  if (h == argmax_h_high && w == argmax_w_low)
+    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+  if (h == argmax_h_high && w == argmax_w_high)
+    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+  return weight;
+}
+
+template <typename scalar_t>
+__device__ scalar_t get_coordinate_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int height,
+    const int width,
+    const scalar_t* im_data,
+    const int data_width,
+    const int bp_dir) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+
+  if (bp_dir == 0) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += -1 * (argmax_w - argmax_w_low) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_w - argmax_w_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  } else if (bp_dir == 1) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+
+  return weight;
+}
+
+template <typename scalar_t>
+__global__ void deformable_im2col_gpu_kernel(
+    const int n,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // index index of output matrix
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col) % batch_size;
+    const int c_im = (index / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+    scalar_t* data_col_ptr = data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    // const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
+    // height + h_in) * width + w_in;
+    const scalar_t* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+        const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+        scalar_t val = static_cast<scalar_t>(0);
+        const scalar_t h_im = h_in + i * dilation_h + offset_h;
+        const scalar_t w_im = w_in + j * dilation_w + offset_w;
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
+          // const scalar_t map_h = i * dilation_h + offset_h;
+          // const scalar_t map_w = j * dilation_w + offset_w;
+          // const int cur_height = height - h_in;
+          // const int cur_width = width - w_in;
+          // val = deformable_im2col_bilinear(data_im_ptr, width, cur_height,
+          // cur_width, map_h, map_w);
+          val = deformable_im2col_bilinear(
+              data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val;
+        data_col_ptr += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>
+__global__ void deformable_col2im_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_im) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const int j = (index / width_col / height_col / batch_size) % kernel_w;
+    const int i =
+        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+    const int c =
+        index / width_col / height_col / batch_size / kernel_w / kernel_h;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / channel_per_deformable_group;
+
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int b = (index / width_col / height_col) % batch_size;
+    int w_in = w_out * stride_w - pad_w;
+    int h_in = h_out * stride_h - pad_h;
+
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const int data_offset_h_ptr =
+        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+    const int data_offset_w_ptr =
+        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+    const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+    const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+    const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+    const scalar_t cur_top_grad = data_col[index];
+    const int cur_h = (int)cur_inv_h_data;
+    const int cur_w = (int)cur_inv_w_data;
+    for (int dy = -2; dy <= 2; dy++) {
+      for (int dx = -2; dx <= 2; dx++) {
+        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+          int cur_bottom_grad_pos =
+              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+          scalar_t weight = get_gradient_weight(
+              cur_inv_h_data,
+              cur_inv_w_data,
+              cur_h + dy,
+              cur_w + dx,
+              height,
+              width);
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+        }
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>
+__global__ void deformable_col2im_coord_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int offset_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_offset) {
+  CUDA_KERNEL_LOOP(index, n) {
+    scalar_t val = 0;
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int col_step = kernel_h * kernel_w;
+    int cnt = 0;
+    const scalar_t* data_col_ptr = data_col +
+        deformable_group_index * channel_per_deformable_group * batch_size *
+            width_col * height_col;
+    const scalar_t* data_im_ptr = data_im +
+        (b * deformable_group + deformable_group_index) *
+            channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+         col_c += col_step) {
+      const int col_pos =
+          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      const int bp_dir = offset_c % 2;
+
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i =
+          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr =
+          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr =
+          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+           w_out);
+      const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+      const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+      scalar_t inv_h = h_in + i * dilation_h + offset_h;
+      scalar_t inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
+        inv_h = inv_w = -2;
+      }
+      const scalar_t weight = get_coordinate_weight(
+          inv_h,
+          inv_w,
+          height,
+          width,
+          data_im_ptr + cnt * height * width,
+          width,
+          bp_dir);
+      val += weight * data_col_ptr[col_pos];
+      cnt += 1;
+    }
+
+    grad_offset[index] = val;
+  }
+}
+
+
+namespace detectron2 {
+
+void deformable_im2col(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor data_col) {
+  // num_axes should be smaller than block size
+  // todo: check parallel_imgs is correctly passed in
+  int height_col =
+      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col =
+      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = channels * height_col * width_col * parallel_imgs;
+  int channel_per_deformable_group = channels / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_im.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+
+        deformable_im2col_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_im_,
+            data_offset_,
+            height,
+            width,
+            ksize_h,
+            ksize_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            parallel_imgs,
+            channels,
+            deformable_group,
+            height_col,
+            width_col,
+            data_col_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
+  }
+}
+
+
+void deformable_col2im(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_im) {
+  // todo: make sure parallel_imgs is passed in correctly
+  int height_col =
+      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col =
+      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels =
+      channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
+  int channel_per_deformable_group = channels / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
+        const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t* grad_im_ = grad_im.data_ptr<scalar_t>();
+
+        deformable_col2im_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_col_,
+            data_offset_,
+            channels,
+            height,
+            width,
+            ksize_h,
+            ksize_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            parallel_imgs,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_im_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
+  }
+}
+
+
+void deformable_col2im_coord(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_offset) {
+  int height_col =
+      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col =
+      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
+      deformable_group * parallel_imgs;
+  int channel_per_deformable_group =
+      channels * ksize_h * ksize_w / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
+        const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t* grad_offset_ = grad_offset.data_ptr<scalar_t>();
+
+        deformable_col2im_coord_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_col_,
+            data_im_,
+            data_offset_,
+            channels,
+            height,
+            width,
+            ksize_h,
+            ksize_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            parallel_imgs,
+            2 * ksize_h * ksize_w * deformable_group,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_offset_);
+      }));
+}
+
+} // namespace detectron2
+
+
+template <typename scalar_t>
+__device__ scalar_t dmcn_im2col_bilinear(
+    const scalar_t* bottom_data,
+    const int data_width,
+    const int height,
+    const int width,
+    scalar_t h,
+    scalar_t w) {
+  int h_low = floor(h);
+  int w_low = floor(w);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  scalar_t lh = h - h_low;
+  scalar_t lw = w - w_low;
+  scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+    v1 = bottom_data[h_low * data_width + w_low];
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+    v2 = bottom_data[h_low * data_width + w_high];
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+    v3 = bottom_data[h_high * data_width + w_low];
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+    v4 = bottom_data[h_high * data_width + w_high];
+
+  scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+  scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_gradient_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int h,
+    const int w,
+    const int height,
+    const int width) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+  if (h == argmax_h_low && w == argmax_w_low)
+    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+  if (h == argmax_h_low && w == argmax_w_high)
+    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+  if (h == argmax_h_high && w == argmax_w_low)
+    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+  if (h == argmax_h_high && w == argmax_w_high)
+    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+  return weight;
+}
+
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_coordinate_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int height,
+    const int width,
+    const scalar_t* im_data,
+    const int data_width,
+    const int bp_dir) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+
+  if (bp_dir == 0) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += -1 * (argmax_w - argmax_w_low) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_w - argmax_w_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  } else if (bp_dir == 1) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+
+  return weight;
+}
+
+template <typename scalar_t>
+__global__ void modulated_deformable_im2col_gpu_kernel(
+    const int n,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const scalar_t* data_mask,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // index index of output matrix
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col) % batch_size;
+    const int c_im = (index / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+
+    scalar_t* data_col_ptr = data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    // const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
+    // height + h_in) * width + w_in;
+    const scalar_t* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    const scalar_t* data_mask_ptr = data_mask +
+        (b_col * deformable_group + deformable_group_index) * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const int data_mask_hw_ptr =
+            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+        const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+        const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+        const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+        scalar_t val = static_cast<scalar_t>(0);
+        const scalar_t h_im = h_in + i * dilation_h + offset_h;
+        const scalar_t w_im = w_in + j * dilation_w + offset_w;
+        // if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
+          // const float map_h = i * dilation_h + offset_h;
+          // const float map_w = j * dilation_w + offset_w;
+          // const int cur_height = height - h_in;
+          // const int cur_width = width - w_in;
+          // val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height,
+          // cur_width, map_h, map_w);
+          val = dmcn_im2col_bilinear(
+              data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val * mask;
+        data_col_ptr += batch_size * height_col * width_col;
+        // data_col_ptr += height_col * width_col;
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_offset,
+    const scalar_t* data_mask,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_im) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const int j = (index / width_col / height_col / batch_size) % kernel_w;
+    const int i =
+        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+    const int c =
+        index / width_col / height_col / batch_size / kernel_w / kernel_h;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / channel_per_deformable_group;
+
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int b = (index / width_col / height_col) % batch_size;
+    int w_in = w_out * stride_w - pad_w;
+    int h_in = h_out * stride_h - pad_h;
+
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const scalar_t* data_mask_ptr = data_mask +
+        (b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
+            height_col * width_col;
+    const int data_offset_h_ptr =
+        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+    const int data_offset_w_ptr =
+        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+    const int data_mask_hw_ptr =
+        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
+    const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+    const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+    const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+    const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+    const scalar_t cur_top_grad = data_col[index] * mask;
+    const int cur_h = (int)cur_inv_h_data;
+    const int cur_w = (int)cur_inv_w_data;
+    for (int dy = -2; dy <= 2; dy++) {
+      for (int dx = -2; dx <= 2; dx++) {
+        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+          int cur_bottom_grad_pos =
+              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+          scalar_t weight = dmcn_get_gradient_weight(
+              cur_inv_h_data,
+              cur_inv_w_data,
+              cur_h + dy,
+              cur_w + dx,
+              height,
+              width);
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+        }
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_coord_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const scalar_t* data_mask,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int offset_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_offset,
+    scalar_t* grad_mask) {
+  CUDA_KERNEL_LOOP(index, n) {
+    scalar_t val = 0, mval = 0;
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int col_step = kernel_h * kernel_w;
+    int cnt = 0;
+    const scalar_t* data_col_ptr = data_col +
+        deformable_group_index * channel_per_deformable_group * batch_size *
+            width_col * height_col;
+    const scalar_t* data_im_ptr = data_im +
+        (b * deformable_group + deformable_group_index) *
+            channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const scalar_t* data_mask_ptr = data_mask +
+        (b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
+            height_col * width_col;
+
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+         col_c += col_step) {
+      const int col_pos =
+          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      const int bp_dir = offset_c % 2;
+
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i =
+          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr =
+          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr =
+          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+           w_out);
+      const int data_mask_hw_ptr =
+          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
+      const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+      const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+      const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+      scalar_t inv_h = h_in + i * dilation_h + offset_h;
+      scalar_t inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
+        inv_h = inv_w = -2;
+      } else {
+        mval += data_col_ptr[col_pos] *
+            dmcn_im2col_bilinear(
+                    data_im_ptr + cnt * height * width,
+                    width,
+                    height,
+                    width,
+                    inv_h,
+                    inv_w);
+      }
+      const scalar_t weight = dmcn_get_coordinate_weight(
+          inv_h,
+          inv_w,
+          height,
+          width,
+          data_im_ptr + cnt * height * width,
+          width,
+          bp_dir);
+      val += weight * data_col_ptr[col_pos] * mask;
+      cnt += 1;
+    }
+    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
+    grad_offset[index] = val;
+    if (offset_c % 2 == 0)
+      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
+      // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
+      // height_col + h) * width_col + w], mask_req, mval);
+      grad_mask
+          [(((b * deformable_group + deformable_group_index) * kernel_h *
+                 kernel_w +
+             offset_c / 2) *
+                height_col +
+            h) *
+               width_col +
+           w] = mval;
+  }
+}
+
+
+namespace detectron2 {
+
+void modulated_deformable_im2col_cuda(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kenerl_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor data_col) {
+  // num_axes should be smaller than block size
+  const int channel_per_deformable_group = channels / deformable_group;
+  const int num_kernels = channels * batch_size * height_col * width_col;
+
+  at::cuda::CUDAGuard device_guard(data_im.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+
+        modulated_deformable_im2col_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_im_,
+            data_offset_,
+            data_mask_,
+            height_im,
+            width_im,
+            kernel_h,
+            kenerl_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            batch_size,
+            channels,
+            deformable_group,
+            height_col,
+            width_col,
+            data_col_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf(
+        "error in modulated_deformable_im2col_cuda: %s\n",
+        cudaGetErrorString(err));
+  }
+}
+
+void modulated_deformable_col2im_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_im) {
+  const int channel_per_deformable_group = channels / deformable_group;
+  const int num_kernels =
+      channels * kernel_h * kernel_w * batch_size * height_col * width_col;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
+        const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t* grad_im_ = grad_im.data_ptr<scalar_t>();
+
+        modulated_deformable_col2im_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_col_,
+            data_offset_,
+            data_mask_,
+            channels,
+            height_im,
+            width_im,
+            kernel_h,
+            kernel_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            batch_size,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_im_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf(
+        "error in modulated_deformable_col2im_cuda: %s\n",
+        cudaGetErrorString(err));
+  }
+}
+
+void modulated_deformable_col2im_coord_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask) {
+  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
+      kernel_w * deformable_group;
+  const int channel_per_deformable_group =
+      channels * kernel_h * kernel_w / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
+        const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t* grad_offset_ = grad_offset.data_ptr<scalar_t>();
+        scalar_t* grad_mask_ = grad_mask.data_ptr<scalar_t>();
+
+        modulated_deformable_col2im_coord_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_col_,
+            data_im_,
+            data_offset_,
+            data_mask_,
+            channels,
+            height_im,
+            width_im,
+            kernel_h,
+            kernel_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            batch_size,
+            2 * kernel_h * kernel_w * deformable_group,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_offset_,
+            grad_mask_);
+      }));
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf(
+        "error in modulated_deformable_col2im_coord_cuda: %s\n",
+        cudaGetErrorString(err));
+  }
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/nms_rotated/nms_rotated.h b/detectron2/layers/csrc/nms_rotated/nms_rotated.h
new file mode 100644
index 0000000000000000000000000000000000000000..12aca388e47b12dafd20999f2991a9d42f4b904b
--- /dev/null
+++ b/detectron2/layers/csrc/nms_rotated/nms_rotated.h
@@ -0,0 +1,39 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor nms_rotated_cpu(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const double iou_threshold);
+
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+at::Tensor nms_rotated_cuda(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const double iou_threshold);
+#endif
+
+// Interface for Python
+// inline is needed to prevent multiple function definitions when this header is
+// included by different cpps
+inline at::Tensor nms_rotated(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const double iou_threshold) {
+  assert(dets.device().is_cuda() == scores.device().is_cuda());
+  if (dets.device().is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+    return nms_rotated_cuda(
+        dets.contiguous(), scores.contiguous(), iou_threshold);
+#else
+    AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+  }
+
+  return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold);
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp b/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d7556e645b604aa83d86cc702b783fd8ecedffcc
--- /dev/null
+++ b/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
@@ -0,0 +1,75 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include "../box_iou_rotated/box_iou_rotated_utils.h"
+#include "nms_rotated.h"
+
+namespace detectron2 {
+
+template <typename scalar_t>
+at::Tensor nms_rotated_cpu_kernel(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const double iou_threshold) {
+  // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
+  // however, the code in this function is much shorter because
+  // we delegate the IoU computation for rotated boxes to
+  // the single_box_iou_rotated function in box_iou_rotated_utils.h
+  AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor");
+  AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor");
+  AT_ASSERTM(
+      dets.scalar_type() == scores.scalar_type(),
+      "dets should have the same type as scores");
+
+  if (dets.numel() == 0) {
+    return at::empty({0}, dets.options().dtype(at::kLong));
+  }
+
+  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+
+  auto ndets = dets.size(0);
+  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
+  at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
+
+  auto suppressed = suppressed_t.data_ptr<uint8_t>();
+  auto keep = keep_t.data_ptr<int64_t>();
+  auto order = order_t.data_ptr<int64_t>();
+
+  int64_t num_to_keep = 0;
+
+  for (int64_t _i = 0; _i < ndets; _i++) {
+    auto i = order[_i];
+    if (suppressed[i] == 1) {
+      continue;
+    }
+
+    keep[num_to_keep++] = i;
+
+    for (int64_t _j = _i + 1; _j < ndets; _j++) {
+      auto j = order[_j];
+      if (suppressed[j] == 1) {
+        continue;
+      }
+
+      auto ovr = single_box_iou_rotated<scalar_t>(
+          dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>());
+      if (ovr >= iou_threshold) {
+        suppressed[j] = 1;
+      }
+    }
+  }
+  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
+}
+
+at::Tensor nms_rotated_cpu(
+    // input must be contiguous
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const double iou_threshold) {
+  auto result = at::empty({0}, dets.options());
+
+  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
+    result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
+  });
+  return result;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu b/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2a3db5c62e7a2da52ccf5bac980653c943d630fd
--- /dev/null
+++ b/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
@@ -0,0 +1,145 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#ifdef WITH_CUDA
+#include "../box_iou_rotated/box_iou_rotated_utils.h"
+#endif
+// TODO avoid this when pytorch supports "same directory" hipification
+#ifdef WITH_HIP
+#include "box_iou_rotated/box_iou_rotated_utils.h"
+#endif
+
+using namespace detectron2;
+
+namespace {
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+}
+
+template <typename T>
+__global__ void nms_rotated_cuda_kernel(
+    const int n_boxes,
+    const double iou_threshold,
+    const T* dev_boxes,
+    unsigned long long* dev_mask) {
+  // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel
+
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  const int row_size =
+      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  // Compared to nms_cuda_kernel, where each box is represented with 4 values
+  // (x1, y1, x2, y2), each rotated box is represented with 5 values
+  // (x_center, y_center, width, height, angle_degrees) here.
+  __shared__ T block_boxes[threadsPerBlock * 5];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 5 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+    block_boxes[threadIdx.x * 5 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+    block_boxes[threadIdx.x * 5 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+    block_boxes[threadIdx.x * 5 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+    block_boxes[threadIdx.x * 5 + 4] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const T* cur_box = dev_boxes + cur_box_idx * 5;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      // Instead of devIoU used by original horizontal nms, here
+      // we use the single_box_iou_rotated function from box_iou_rotated_utils.h
+      if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5) >
+          iou_threshold) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+namespace detectron2 {
+
+at::Tensor nms_rotated_cuda(
+    // input must be contiguous
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    double iou_threshold) {
+  // using scalar_t = float;
+  AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
+  AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
+  at::cuda::CUDAGuard device_guard(dets.device());
+
+  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+  auto dets_sorted = dets.index_select(0, order_t);
+
+  auto dets_num = dets.size(0);
+
+  const int col_blocks =
+      at::cuda::ATenCeilDiv(static_cast<int>(dets_num), threadsPerBlock);
+
+  at::Tensor mask =
+      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
+
+  dim3 blocks(col_blocks, col_blocks);
+  dim3 threads(threadsPerBlock);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES(
+      dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
+        nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+            dets_num,
+            iou_threshold,
+            dets_sorted.data_ptr<scalar_t>(),
+            (unsigned long long*)mask.data_ptr<int64_t>());
+      });
+
+  at::Tensor mask_cpu = mask.to(at::kCPU);
+  unsigned long long* mask_host =
+      (unsigned long long*)mask_cpu.data_ptr<int64_t>();
+
+  std::vector<unsigned long long> remv(col_blocks);
+  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+  at::Tensor keep =
+      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
+  int64_t* keep_out = keep.data_ptr<int64_t>();
+
+  int num_to_keep = 0;
+  for (int i = 0; i < dets_num; i++) {
+    int nblock = i / threadsPerBlock;
+    int inblock = i % threadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      keep_out[num_to_keep++] = i;
+      unsigned long long* p = mask_host + i * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+    }
+  }
+
+  AT_CUDA_CHECK(cudaGetLastError());
+  return order_t.index(
+      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
+           .to(order_t.device(), keep.scalar_type())});
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/vision.cpp b/detectron2/layers/csrc/vision.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c9a2cd4f20e6f58be1c5783d67c64232dd59b560
--- /dev/null
+++ b/detectron2/layers/csrc/vision.cpp
@@ -0,0 +1,117 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+
+#include <torch/extension.h>
+#include "ROIAlignRotated/ROIAlignRotated.h"
+#include "box_iou_rotated/box_iou_rotated.h"
+#include "cocoeval/cocoeval.h"
+#include "deformable/deform_conv.h"
+#include "nms_rotated/nms_rotated.h"
+
+namespace detectron2 {
+
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+extern int get_cudart_version();
+#endif
+
+std::string get_cuda_version() {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+  std::ostringstream oss;
+
+#if defined(WITH_CUDA)
+  oss << "CUDA ";
+#else
+  oss << "HIP ";
+#endif
+
+  // copied from
+  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
+  auto printCudaStyleVersion = [&](int v) {
+    oss << (v / 1000) << "." << (v / 10 % 100);
+    if (v % 10 != 0) {
+      oss << "." << (v % 10);
+    }
+  };
+  printCudaStyleVersion(get_cudart_version());
+  return oss.str();
+#else // neither CUDA nor HIP
+  return std::string("not available");
+#endif
+}
+
+bool has_cuda() {
+#if defined(WITH_CUDA)
+  return true;
+#else
+  return false;
+#endif
+}
+
+// similar to
+// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
+std::string get_compiler_version() {
+  std::ostringstream ss;
+#if defined(__GNUC__)
+#ifndef __clang__
+
+#if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8))
+#error "GCC >= 4.9 is required!"
+#endif
+
+  { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
+#endif
+#endif
+
+#if defined(__clang_major__)
+  {
+    ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
+       << __clang_patchlevel__;
+  }
+#endif
+
+#if defined(_MSC_VER)
+  { ss << "MSVC " << _MSC_FULL_VER; }
+#endif
+  return ss.str();
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
+  m.def("get_cuda_version", &get_cuda_version, "get_cuda_version");
+  m.def("has_cuda", &has_cuda, "has_cuda");
+
+  m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
+  m.def(
+      "deform_conv_backward_input",
+      &deform_conv_backward_input,
+      "deform_conv_backward_input");
+  m.def(
+      "deform_conv_backward_filter",
+      &deform_conv_backward_filter,
+      "deform_conv_backward_filter");
+  m.def(
+      "modulated_deform_conv_forward",
+      &modulated_deform_conv_forward,
+      "modulated_deform_conv_forward");
+  m.def(
+      "modulated_deform_conv_backward",
+      &modulated_deform_conv_backward,
+      "modulated_deform_conv_backward");
+
+  m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
+  m.def(
+      "COCOevalEvaluateImages",
+      &COCOeval::EvaluateImages,
+      "COCOeval::EvaluateImages");
+  pybind11::class_<COCOeval::InstanceAnnotation>(m, "InstanceAnnotation")
+      .def(pybind11::init<uint64_t, double, double, bool, bool>());
+  pybind11::class_<COCOeval::ImageEvaluation>(m, "ImageEvaluation")
+      .def(pybind11::init<>());
+}
+
+TORCH_LIBRARY(detectron2, m) {
+  m.def("nms_rotated", &nms_rotated);
+  m.def("box_iou_rotated", &box_iou_rotated);
+  m.def("roi_align_rotated_forward", &ROIAlignRotated_forward);
+  m.def("roi_align_rotated_backward", &ROIAlignRotated_backward);
+}
+} // namespace detectron2
diff --git a/detectron2/layers/deform_conv.py b/detectron2/layers/deform_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..dffb720c2a8d10d9273752dbdd291a3714f91338
--- /dev/null
+++ b/detectron2/layers/deform_conv.py
@@ -0,0 +1,514 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import math
+from functools import lru_cache
+import torch
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+from torchvision.ops import deform_conv2d
+
+from detectron2.utils.develop import create_dummy_class, create_dummy_func
+
+from .wrappers import _NewEmptyTensorOp
+
+
+class _DeformConv(Function):
+    @staticmethod
+    def forward(
+        ctx,
+        input,
+        offset,
+        weight,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+        im2col_step=64,
+    ):
+        if input is not None and input.dim() != 4:
+            raise ValueError(
+                "Expected 4D tensor as input, got {}D tensor instead.".format(input.dim())
+            )
+        ctx.stride = _pair(stride)
+        ctx.padding = _pair(padding)
+        ctx.dilation = _pair(dilation)
+        ctx.groups = groups
+        ctx.deformable_groups = deformable_groups
+        ctx.im2col_step = im2col_step
+
+        ctx.save_for_backward(input, offset, weight)
+
+        output = input.new_empty(
+            _DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride)
+        )
+
+        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones
+
+        if not input.is_cuda:
+            # TODO: let torchvision support full features of our deformconv.
+            if deformable_groups != 1:
+                raise NotImplementedError(
+                    "Deformable Conv with deformable_groups != 1 is not supported on CPUs!"
+                )
+            return deform_conv2d(
+                input, offset, weight, stride=stride, padding=padding, dilation=dilation
+            )
+        else:
+            cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+            assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+            _C.deform_conv_forward(
+                input,
+                weight,
+                offset,
+                output,
+                ctx.bufs_[0],
+                ctx.bufs_[1],
+                weight.size(3),
+                weight.size(2),
+                ctx.stride[1],
+                ctx.stride[0],
+                ctx.padding[1],
+                ctx.padding[0],
+                ctx.dilation[1],
+                ctx.dilation[0],
+                ctx.groups,
+                ctx.deformable_groups,
+                cur_im2col_step,
+            )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        input, offset, weight = ctx.saved_tensors
+
+        grad_input = grad_offset = grad_weight = None
+
+        if not grad_output.is_cuda:
+            raise NotImplementedError("Deformable Conv is not supported on CPUs!")
+        else:
+            cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+            assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
+                grad_input = torch.zeros_like(input)
+                grad_offset = torch.zeros_like(offset)
+                _C.deform_conv_backward_input(
+                    input,
+                    offset,
+                    grad_output,
+                    grad_input,
+                    grad_offset,
+                    weight,
+                    ctx.bufs_[0],
+                    weight.size(3),
+                    weight.size(2),
+                    ctx.stride[1],
+                    ctx.stride[0],
+                    ctx.padding[1],
+                    ctx.padding[0],
+                    ctx.dilation[1],
+                    ctx.dilation[0],
+                    ctx.groups,
+                    ctx.deformable_groups,
+                    cur_im2col_step,
+                )
+
+            if ctx.needs_input_grad[2]:
+                grad_weight = torch.zeros_like(weight)
+                _C.deform_conv_backward_filter(
+                    input,
+                    offset,
+                    grad_output,
+                    grad_weight,
+                    ctx.bufs_[0],
+                    ctx.bufs_[1],
+                    weight.size(3),
+                    weight.size(2),
+                    ctx.stride[1],
+                    ctx.stride[0],
+                    ctx.padding[1],
+                    ctx.padding[0],
+                    ctx.dilation[1],
+                    ctx.dilation[0],
+                    ctx.groups,
+                    ctx.deformable_groups,
+                    1,
+                    cur_im2col_step,
+                )
+
+        return grad_input, grad_offset, grad_weight, None, None, None, None, None, None
+
+    @staticmethod
+    def _output_size(input, weight, padding, dilation, stride):
+        channels = weight.size(0)
+        output_size = (input.size(0), channels)
+        for d in range(input.dim() - 2):
+            in_size = input.size(d + 2)
+            pad = padding[d]
+            kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
+            stride_ = stride[d]
+            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,)
+        if not all(map(lambda s: s > 0, output_size)):
+            raise ValueError(
+                "convolution input is too small (output would be {})".format(
+                    "x".join(map(str, output_size))
+                )
+            )
+        return output_size
+
+    @staticmethod
+    @lru_cache(maxsize=128)
+    def _cal_im2col_step(input_size, default_size):
+        """
+        Calculate proper im2col step size, which should be divisible by input_size and not larger
+        than prefer_size. Meanwhile the step size should be as large as possible to be more
+        efficient. So we choose the largest one among all divisors of input_size which are smaller
+        than prefer_size.
+        :param input_size: input batch size .
+        :param default_size: default preferred im2col step size.
+        :return: the largest proper step size.
+        """
+        if input_size <= default_size:
+            return input_size
+        best_step = 1
+        for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)):
+            if input_size % step == 0:
+                if input_size // step <= default_size:
+                    return input_size // step
+                best_step = step
+
+        return best_step
+
+
+class _ModulatedDeformConv(Function):
+    @staticmethod
+    def forward(
+        ctx,
+        input,
+        offset,
+        mask,
+        weight,
+        bias=None,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+    ):
+        ctx.stride = stride
+        ctx.padding = padding
+        ctx.dilation = dilation
+        ctx.groups = groups
+        ctx.deformable_groups = deformable_groups
+        ctx.with_bias = bias is not None
+        if not ctx.with_bias:
+            bias = input.new_empty(1)  # fake tensor
+        if not input.is_cuda:
+            raise NotImplementedError("Deformable Conv is not supported on CPUs!")
+        if (
+            weight.requires_grad
+            or mask.requires_grad
+            or offset.requires_grad
+            or input.requires_grad
+        ):
+            ctx.save_for_backward(input, offset, mask, weight, bias)
+        output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight))
+        ctx._bufs = [input.new_empty(0), input.new_empty(0)]
+        _C.modulated_deform_conv_forward(
+            input,
+            weight,
+            bias,
+            ctx._bufs[0],
+            offset,
+            mask,
+            output,
+            ctx._bufs[1],
+            weight.shape[2],
+            weight.shape[3],
+            ctx.stride,
+            ctx.stride,
+            ctx.padding,
+            ctx.padding,
+            ctx.dilation,
+            ctx.dilation,
+            ctx.groups,
+            ctx.deformable_groups,
+            ctx.with_bias,
+        )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        if not grad_output.is_cuda:
+            raise NotImplementedError("Deformable Conv is not supported on CPUs!")
+        input, offset, mask, weight, bias = ctx.saved_tensors
+        grad_input = torch.zeros_like(input)
+        grad_offset = torch.zeros_like(offset)
+        grad_mask = torch.zeros_like(mask)
+        grad_weight = torch.zeros_like(weight)
+        grad_bias = torch.zeros_like(bias)
+        _C.modulated_deform_conv_backward(
+            input,
+            weight,
+            bias,
+            ctx._bufs[0],
+            offset,
+            mask,
+            ctx._bufs[1],
+            grad_input,
+            grad_weight,
+            grad_bias,
+            grad_offset,
+            grad_mask,
+            grad_output,
+            weight.shape[2],
+            weight.shape[3],
+            ctx.stride,
+            ctx.stride,
+            ctx.padding,
+            ctx.padding,
+            ctx.dilation,
+            ctx.dilation,
+            ctx.groups,
+            ctx.deformable_groups,
+            ctx.with_bias,
+        )
+        if not ctx.with_bias:
+            grad_bias = None
+
+        return (
+            grad_input,
+            grad_offset,
+            grad_mask,
+            grad_weight,
+            grad_bias,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+    @staticmethod
+    def _infer_shape(ctx, input, weight):
+        n = input.size(0)
+        channels_out = weight.size(0)
+        height, width = input.shape[2:4]
+        kernel_h, kernel_w = weight.shape[2:4]
+        height_out = (
+            height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)
+        ) // ctx.stride + 1
+        width_out = (
+            width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)
+        ) // ctx.stride + 1
+        return n, channels_out, height_out, width_out
+
+
+deform_conv = _DeformConv.apply
+modulated_deform_conv = _ModulatedDeformConv.apply
+
+
+class DeformConv(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+        bias=False,
+        norm=None,
+        activation=None,
+    ):
+        """
+        Deformable convolution from :paper:`deformconv`.
+
+        Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+        Args:
+            deformable_groups (int): number of groups used in deformable convolution.
+            norm (nn.Module, optional): a normalization layer
+            activation (callable(Tensor) -> Tensor): a callable activation function
+        """
+        super(DeformConv, self).__init__()
+
+        assert not bias
+        assert in_channels % groups == 0, "in_channels {} cannot be divisible by groups {}".format(
+            in_channels, groups
+        )
+        assert (
+            out_channels % groups == 0
+        ), "out_channels {} cannot be divisible by groups {}".format(out_channels, groups)
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+        self.deformable_groups = deformable_groups
+        self.norm = norm
+        self.activation = activation
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)
+        )
+        self.bias = None
+
+        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+
+    def forward(self, x, offset):
+        if x.numel() == 0:
+            # When input is empty, we want to return a empty tensor with "correct" shape,
+            # So that the following operations will not panic
+            # if they check for the shape of the tensor.
+            # This computes the height and width of the output tensor
+            output_shape = [
+                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                for i, p, di, k, s in zip(
+                    x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                )
+            ]
+            output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
+            return _NewEmptyTensorOp.apply(x, output_shape)
+
+        x = deform_conv(
+            x,
+            offset,
+            self.weight,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.deformable_groups,
+        )
+        if self.norm is not None:
+            x = self.norm(x)
+        if self.activation is not None:
+            x = self.activation(x)
+        return x
+
+    def extra_repr(self):
+        tmpstr = "in_channels=" + str(self.in_channels)
+        tmpstr += ", out_channels=" + str(self.out_channels)
+        tmpstr += ", kernel_size=" + str(self.kernel_size)
+        tmpstr += ", stride=" + str(self.stride)
+        tmpstr += ", padding=" + str(self.padding)
+        tmpstr += ", dilation=" + str(self.dilation)
+        tmpstr += ", groups=" + str(self.groups)
+        tmpstr += ", deformable_groups=" + str(self.deformable_groups)
+        tmpstr += ", bias=False"
+        return tmpstr
+
+
+class ModulatedDeformConv(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+        bias=True,
+        norm=None,
+        activation=None,
+    ):
+        """
+        Modulated deformable convolution from :paper:`deformconv2`.
+
+        Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+        Args:
+            deformable_groups (int): number of groups used in deformable convolution.
+            norm (nn.Module, optional): a normalization layer
+            activation (callable(Tensor) -> Tensor): a callable activation function
+        """
+        super(ModulatedDeformConv, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.groups = groups
+        self.deformable_groups = deformable_groups
+        self.with_bias = bias
+        self.norm = norm
+        self.activation = activation
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+        )
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.bias = None
+
+        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0)
+
+    def forward(self, x, offset, mask):
+        if x.numel() == 0:
+            output_shape = [
+                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                for i, p, di, k, s in zip(
+                    x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                )
+            ]
+            output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
+            return _NewEmptyTensorOp.apply(x, output_shape)
+
+        x = modulated_deform_conv(
+            x,
+            offset,
+            mask,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.deformable_groups,
+        )
+        if self.norm is not None:
+            x = self.norm(x)
+        if self.activation is not None:
+            x = self.activation(x)
+        return x
+
+    def extra_repr(self):
+        tmpstr = "in_channels=" + str(self.in_channels)
+        tmpstr += ", out_channels=" + str(self.out_channels)
+        tmpstr += ", kernel_size=" + str(self.kernel_size)
+        tmpstr += ", stride=" + str(self.stride)
+        tmpstr += ", padding=" + str(self.padding)
+        tmpstr += ", dilation=" + str(self.dilation)
+        tmpstr += ", groups=" + str(self.groups)
+        tmpstr += ", deformable_groups=" + str(self.deformable_groups)
+        tmpstr += ", bias=" + str(self.with_bias)
+        return tmpstr
+
+
+try:
+    from detectron2 import _C
+except ImportError:
+    # TODO: register ops natively so there is no need to import _C.
+    _msg = "detectron2 is not compiled successfully, please build following the instructions!"
+    _args = ("detectron2._C", _msg)
+    DeformConv = create_dummy_class("DeformConv", *_args)
+    ModulatedDeformConv = create_dummy_class("ModulatedDeformConv", *_args)
+    deform_conv = create_dummy_func("deform_conv", *_args)
+    modulated_deform_conv = create_dummy_func("modulated_deform_conv", *_args)
diff --git a/detectron2/layers/losses.py b/detectron2/layers/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..850a852a2f0986d4d1ce89a526d96db42c76e44f
--- /dev/null
+++ b/detectron2/layers/losses.py
@@ -0,0 +1,133 @@
+import math
+import torch
+
+
+def diou_loss(
+    boxes1: torch.Tensor,
+    boxes2: torch.Tensor,
+    reduction: str = "none",
+    eps: float = 1e-7,
+) -> torch.Tensor:
+    """
+    Distance Intersection over Union Loss (Zhaohui Zheng et. al)
+    https://arxiv.org/abs/1911.08287
+    Args:
+        boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
+        reduction: 'none' | 'mean' | 'sum'
+                 'none': No reduction will be applied to the output.
+                 'mean': The output will be averaged.
+                 'sum': The output will be summed.
+        eps (float): small number to prevent division by zero
+    """
+
+    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+    # TODO: use torch._assert_async() when pytorch 1.8 support is dropped
+    assert (x2 >= x1).all(), "bad box: x1 larger than x2"
+    assert (y2 >= y1).all(), "bad box: y1 larger than y2"
+
+    # Intersection keypoints
+    xkis1 = torch.max(x1, x1g)
+    ykis1 = torch.max(y1, y1g)
+    xkis2 = torch.min(x2, x2g)
+    ykis2 = torch.min(y2, y2g)
+
+    intsct = torch.zeros_like(x1)
+    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
+    intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
+    union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
+    iou = intsct / union
+
+    # smallest enclosing box
+    xc1 = torch.min(x1, x1g)
+    yc1 = torch.min(y1, y1g)
+    xc2 = torch.max(x2, x2g)
+    yc2 = torch.max(y2, y2g)
+    diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
+
+    # centers of boxes
+    x_p = (x2 + x1) / 2
+    y_p = (y2 + y1) / 2
+    x_g = (x1g + x2g) / 2
+    y_g = (y1g + y2g) / 2
+    distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
+
+    # Eqn. (7)
+    loss = 1 - iou + (distance / diag_len)
+    if reduction == "mean":
+        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+    elif reduction == "sum":
+        loss = loss.sum()
+
+    return loss
+
+
+def ciou_loss(
+    boxes1: torch.Tensor,
+    boxes2: torch.Tensor,
+    reduction: str = "none",
+    eps: float = 1e-7,
+) -> torch.Tensor:
+    """
+    Complete Intersection over Union Loss (Zhaohui Zheng et. al)
+    https://arxiv.org/abs/1911.08287
+    Args:
+        boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
+        reduction: 'none' | 'mean' | 'sum'
+                 'none': No reduction will be applied to the output.
+                 'mean': The output will be averaged.
+                 'sum': The output will be summed.
+        eps (float): small number to prevent division by zero
+    """
+
+    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+    # TODO: use torch._assert_async() when pytorch 1.8 support is dropped
+    assert (x2 >= x1).all(), "bad box: x1 larger than x2"
+    assert (y2 >= y1).all(), "bad box: y1 larger than y2"
+
+    # Intersection keypoints
+    xkis1 = torch.max(x1, x1g)
+    ykis1 = torch.max(y1, y1g)
+    xkis2 = torch.min(x2, x2g)
+    ykis2 = torch.min(y2, y2g)
+
+    intsct = torch.zeros_like(x1)
+    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
+    intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
+    union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
+    iou = intsct / union
+
+    # smallest enclosing box
+    xc1 = torch.min(x1, x1g)
+    yc1 = torch.min(y1, y1g)
+    xc2 = torch.max(x2, x2g)
+    yc2 = torch.max(y2, y2g)
+    diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
+
+    # centers of boxes
+    x_p = (x2 + x1) / 2
+    y_p = (y2 + y1) / 2
+    x_g = (x1g + x2g) / 2
+    y_g = (y1g + y2g) / 2
+    distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
+
+    # width and height of boxes
+    w_pred = x2 - x1
+    h_pred = y2 - y1
+    w_gt = x2g - x1g
+    h_gt = y2g - y1g
+    v = (4 / (math.pi**2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2)
+    with torch.no_grad():
+        alpha = v / (1 - iou + v + eps)
+
+    # Eqn. (10)
+    loss = 1 - iou + (distance / diag_len) + alpha * v
+    if reduction == "mean":
+        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+    elif reduction == "sum":
+        loss = loss.sum()
+
+    return loss
diff --git a/detectron2/layers/mask_ops.py b/detectron2/layers/mask_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..990d04abbb120e40fe07a21d024dfead471bc998
--- /dev/null
+++ b/detectron2/layers/mask_ops.py
@@ -0,0 +1,275 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from typing import Tuple
+import torch
+from PIL import Image
+from torch.nn import functional as F
+
+__all__ = ["paste_masks_in_image"]
+
+
+BYTES_PER_FLOAT = 4
+# TODO: This memory limit may be too much or too little. It would be better to
+# determine it based on available resources.
+GPU_MEM_LIMIT = 1024**3  # 1 GB memory limit
+
+
+def _do_paste_mask(masks, boxes, img_h: int, img_w: int, skip_empty: bool = True):
+    """
+    Args:
+        masks: N, 1, H, W
+        boxes: N, 4
+        img_h, img_w (int):
+        skip_empty (bool): only paste masks within the region that
+            tightly bound all boxes, and returns the results this region only.
+            An important optimization for CPU.
+
+    Returns:
+        if skip_empty == False, a mask of shape (N, img_h, img_w)
+        if skip_empty == True, a mask of shape (N, h', w'), and the slice
+            object for the corresponding region.
+    """
+    # On GPU, paste all masks together (up to chunk size)
+    # by using the entire image to sample the masks
+    # Compared to pasting them one by one,
+    # this has more operations but is faster on COCO-scale dataset.
+    device = masks.device
+
+    if skip_empty and not torch.jit.is_scripting():
+        x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to(
+            dtype=torch.int32
+        )
+        x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
+        y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
+    else:
+        x0_int, y0_int = 0, 0
+        x1_int, y1_int = img_w, img_h
+    x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1
+
+    N = masks.shape[0]
+
+    img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5
+    img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5
+    img_y = (img_y - y0) / (y1 - y0) * 2 - 1
+    img_x = (img_x - x0) / (x1 - x0) * 2 - 1
+    # img_x, img_y have shapes (N, w), (N, h)
+
+    gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
+    gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
+    grid = torch.stack([gx, gy], dim=3)
+
+    if not torch.jit.is_scripting():
+        if not masks.dtype.is_floating_point:
+            masks = masks.float()
+    img_masks = F.grid_sample(masks, grid.to(masks.dtype), align_corners=False)
+
+    if skip_empty and not torch.jit.is_scripting():
+        return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
+    else:
+        return img_masks[:, 0], ()
+
+
+# Annotate boxes as Tensor (but not Boxes) in order to use scripting
+@torch.jit.script_if_tracing
+def paste_masks_in_image(
+    masks: torch.Tensor, boxes: torch.Tensor, image_shape: Tuple[int, int], threshold: float = 0.5
+):
+    """
+    Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image.
+    The location, height, and width for pasting each mask is determined by their
+    corresponding bounding boxes in boxes.
+
+    Note:
+        This is a complicated but more accurate implementation. In actual deployment, it is
+        often enough to use a faster but less accurate implementation.
+        See :func:`paste_mask_in_image_old` in this file for an alternative implementation.
+
+    Args:
+        masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of
+            detected object instances in the image and Hmask, Wmask are the mask width and mask
+            height of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1].
+        boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4).
+            boxes[i] and masks[i] correspond to the same object instance.
+        image_shape (tuple): height, width
+        threshold (float): A threshold in [0, 1] for converting the (soft) masks to
+            binary masks.
+
+    Returns:
+        img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
+        number of detected object instances and Himage, Wimage are the image width
+        and height. img_masks[i] is a binary mask for object instance i.
+    """
+
+    assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported"
+    N = len(masks)
+    if N == 0:
+        return masks.new_empty((0,) + image_shape, dtype=torch.uint8)
+    if not isinstance(boxes, torch.Tensor):
+        boxes = boxes.tensor
+    device = boxes.device
+    assert len(boxes) == N, boxes.shape
+
+    img_h, img_w = image_shape
+
+    # The actual implementation split the input into chunks,
+    # and paste them chunk by chunk.
+    if device.type == "cpu" or torch.jit.is_scripting():
+        # CPU is most efficient when they are pasted one by one with skip_empty=True
+        # so that it performs minimal number of operations.
+        num_chunks = N
+    else:
+        # GPU benefits from parallelism for larger chunks, but may have memory issue
+        # int(img_h) because shape may be tensors in tracing
+        num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
+        assert (
+            num_chunks <= N
+        ), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it"
+    chunks = torch.chunk(torch.arange(N, device=device), num_chunks)
+
+    img_masks = torch.zeros(
+        N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8
+    )
+    for inds in chunks:
+        masks_chunk, spatial_inds = _do_paste_mask(
+            masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu"
+        )
+
+        if threshold >= 0:
+            masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
+        else:
+            # for visualization and debugging
+            masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
+
+        if torch.jit.is_scripting():  # Scripting does not use the optimized codepath
+            img_masks[inds] = masks_chunk
+        else:
+            img_masks[(inds,) + spatial_inds] = masks_chunk
+    return img_masks
+
+
+# The below are the original paste function (from Detectron1) which has
+# larger quantization error.
+# It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample.
+
+
+def paste_mask_in_image_old(mask, box, img_h, img_w, threshold):
+    """
+    Paste a single mask in an image.
+    This is a per-box implementation of :func:`paste_masks_in_image`.
+    This function has larger quantization error due to incorrect pixel
+    modeling and is not used any more.
+
+    Args:
+        mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single
+            object instance. Values are in [0, 1].
+        box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners
+            of the object instance.
+        img_h, img_w (int): Image height and width.
+        threshold (float): Mask binarization threshold in [0, 1].
+
+    Returns:
+        im_mask (Tensor):
+            The resized and binarized object mask pasted into the original
+            image plane (a tensor of shape (img_h, img_w)).
+    """
+    # Conversion from continuous box coordinates to discrete pixel coordinates
+    # via truncation (cast to int32). This determines which pixels to paste the
+    # mask onto.
+    box = box.to(dtype=torch.int32)  # Continuous to discrete coordinate conversion
+    # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to
+    # a discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1
+    # pixels (not x1 - x0 pixels).
+    samples_w = box[2] - box[0] + 1  # Number of pixel samples, *not* geometric width
+    samples_h = box[3] - box[1] + 1  # Number of pixel samples, *not* geometric height
+
+    # Resample the mask from it's original grid to the new samples_w x samples_h grid
+    mask = Image.fromarray(mask.cpu().numpy())
+    mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR)
+    mask = np.array(mask, copy=False)
+
+    if threshold >= 0:
+        mask = np.array(mask > threshold, dtype=np.uint8)
+        mask = torch.from_numpy(mask)
+    else:
+        # for visualization and debugging, we also
+        # allow it to return an unmodified mask
+        mask = torch.from_numpy(mask * 255).to(torch.uint8)
+
+    im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8)
+    x_0 = max(box[0], 0)
+    x_1 = min(box[2] + 1, img_w)
+    y_0 = max(box[1], 0)
+    y_1 = min(box[3] + 1, img_h)
+
+    im_mask[y_0:y_1, x_0:x_1] = mask[
+        (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
+    ]
+    return im_mask
+
+
+# Our pixel modeling requires extrapolation for any continuous
+# coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks,
+# we would like this extrapolation to be an interpolation between boundary values and zero,
+# instead of using absolute zero or boundary values.
+# Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this:
+# masks, scale = pad_masks(masks[:, 0, :, :], 1)
+# boxes = scale_boxes(boxes.tensor, scale)
+
+
+def pad_masks(masks, padding):
+    """
+    Args:
+        masks (tensor): A tensor of shape (B, M, M) representing B masks.
+        padding (int): Number of cells to pad on all sides.
+
+    Returns:
+        The padded masks and the scale factor of the padding size / original size.
+    """
+    B = masks.shape[0]
+    M = masks.shape[-1]
+    pad2 = 2 * padding
+    scale = float(M + pad2) / M
+    padded_masks = masks.new_zeros((B, M + pad2, M + pad2))
+    padded_masks[:, padding:-padding, padding:-padding] = masks
+    return padded_masks, scale
+
+
+def scale_boxes(boxes, scale):
+    """
+    Args:
+        boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4
+            coords representing the corners x0, y0, x1, y1,
+        scale (float): The box scaling factor.
+
+    Returns:
+        Scaled boxes.
+    """
+    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
+    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
+    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
+    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
+
+    w_half *= scale
+    h_half *= scale
+
+    scaled_boxes = torch.zeros_like(boxes)
+    scaled_boxes[:, 0] = x_c - w_half
+    scaled_boxes[:, 2] = x_c + w_half
+    scaled_boxes[:, 1] = y_c - h_half
+    scaled_boxes[:, 3] = y_c + h_half
+    return scaled_boxes
+
+
+@torch.jit.script_if_tracing
+def _paste_masks_tensor_shape(
+    masks: torch.Tensor,
+    boxes: torch.Tensor,
+    image_shape: Tuple[torch.Tensor, torch.Tensor],
+    threshold: float = 0.5,
+):
+    """
+    A wrapper of paste_masks_in_image where image_shape is Tensor.
+    During tracing, shapes might be tensors instead of ints. The Tensor->int
+    conversion should be scripted rather than traced.
+    """
+    return paste_masks_in_image(masks, boxes, (int(image_shape[0]), int(image_shape[1])), threshold)
diff --git a/detectron2/layers/nms.py b/detectron2/layers/nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..1019e7f4c8c58f2def34a019e4c3a0573c5f69bb
--- /dev/null
+++ b/detectron2/layers/nms.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import torch
+from torchvision.ops import boxes as box_ops
+from torchvision.ops import nms  # noqa . for compatibility
+
+
+def batched_nms(
+    boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
+):
+    """
+    Same as torchvision.ops.boxes.batched_nms, but with float().
+    """
+    assert boxes.shape[-1] == 4
+    # Note: Torchvision already has a strategy (https://github.com/pytorch/vision/issues/1311)
+    # to decide whether to use coordinate trick or for loop to implement batched_nms. So we
+    # just call it directly.
+    # Fp16 does not have enough range for batched NMS, so adding float().
+    return box_ops.batched_nms(boxes.float(), scores, idxs, iou_threshold)
+
+
+# Note: this function (nms_rotated) might be moved into
+# torchvision/ops/boxes.py in the future
+def nms_rotated(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float):
+    """
+    Performs non-maximum suppression (NMS) on the rotated boxes according
+    to their intersection-over-union (IoU).
+
+    Rotated NMS iteratively removes lower scoring rotated boxes which have an
+    IoU greater than iou_threshold with another (higher scoring) rotated box.
+
+    Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as
+    RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they
+    can be representing completely different objects in certain tasks, e.g., OCR.
+
+    As for the question of whether rotated-NMS should treat them as faraway boxes
+    even though their IOU is 1, it depends on the application and/or ground truth annotation.
+
+    As an extreme example, consider a single character v and the square box around it.
+
+    If the angle is 0 degree, the object (text) would be read as 'v';
+
+    If the angle is 90 degrees, the object (text) would become '>';
+
+    If the angle is 180 degrees, the object (text) would become '^';
+
+    If the angle is 270/-90 degrees, the object (text) would become '<'
+
+    All of these cases have IoU of 1 to each other, and rotated NMS that only
+    uses IoU as criterion would only keep one of them with the highest score -
+    which, practically, still makes sense in most cases because typically
+    only one of theses orientations is the correct one. Also, it does not matter
+    as much if the box is only used to classify the object (instead of transcribing
+    them with a sequential OCR recognition model) later.
+
+    On the other hand, when we use IoU to filter proposals that are close to the
+    ground truth during training, we should definitely take the angle into account if
+    we know the ground truth is labeled with the strictly correct orientation (as in,
+    upside-down words are annotated with -180 degrees even though they can be covered
+    with a 0/90/-90 degree box, etc.)
+
+    The way the original dataset is annotated also matters. For example, if the dataset
+    is a 4-point polygon dataset that does not enforce ordering of vertices/orientation,
+    we can estimate a minimum rotated bounding box to this polygon, but there's no way
+    we can tell the correct angle with 100% confidence (as shown above, there could be 4 different
+    rotated boxes, with angles differed by 90 degrees to each other, covering the exactly
+    same region). In that case we have to just use IoU to determine the box
+    proximity (as many detection benchmarks (even for text) do) unless there're other
+    assumptions we can make (like width is always larger than height, or the object is not
+    rotated by more than 90 degrees CCW/CW, etc.)
+
+    In summary, not considering angles in rotated NMS seems to be a good option for now,
+    but we should be aware of its implications.
+
+    Args:
+        boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in
+           (x_center, y_center, width, height, angle_degrees) format.
+        scores (Tensor[N]): Scores for each one of the rotated boxes
+        iou_threshold (float): Discards all overlapping rotated boxes with IoU < iou_threshold
+
+    Returns:
+        keep (Tensor): int64 tensor with the indices of the elements that have been kept
+        by Rotated NMS, sorted in decreasing order of scores
+    """
+    return torch.ops.detectron2.nms_rotated(boxes, scores, iou_threshold)
+
+
+# Note: this function (batched_nms_rotated) might be moved into
+# torchvision/ops/boxes.py in the future
+
+
+@torch.jit.script_if_tracing
+def batched_nms_rotated(
+    boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
+):
+    """
+    Performs non-maximum suppression in a batched fashion.
+
+    Each index value correspond to a category, and NMS
+    will not be applied between elements of different categories.
+
+    Args:
+        boxes (Tensor[N, 5]):
+           boxes where NMS will be performed. They
+           are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format
+        scores (Tensor[N]):
+           scores for each one of the boxes
+        idxs (Tensor[N]):
+           indices of the categories for each one of the boxes.
+        iou_threshold (float):
+           discards all overlapping boxes
+           with IoU < iou_threshold
+
+    Returns:
+        Tensor:
+            int64 tensor with the indices of the elements that have been kept
+            by NMS, sorted in decreasing order of scores
+    """
+    assert boxes.shape[-1] == 5
+
+    if boxes.numel() == 0:
+        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
+    boxes = boxes.float()  # fp16 does not have enough range for batched NMS
+    # Strategy: in order to perform NMS independently per class,
+    # we add an offset to all the boxes. The offset is dependent
+    # only on the class idx, and is large enough so that boxes
+    # from different classes do not overlap
+
+    # Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate,
+    # which won't handle negative coordinates correctly.
+    # Here by using min_coordinate we can make sure the negative coordinates are
+    # correctly handled.
+    max_coordinate = (
+        torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2
+    ).max()
+    min_coordinate = (
+        torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2
+    ).min()
+    offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1)
+    boxes_for_nms = boxes.clone()  # avoid modifying the original values in boxes
+    boxes_for_nms[:, :2] += offsets[:, None]
+    keep = nms_rotated(boxes_for_nms, scores, iou_threshold)
+    return keep
diff --git a/detectron2/layers/roi_align.py b/detectron2/layers/roi_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..163462e1f194e1e4100da92d76d9516f7cc22e35
--- /dev/null
+++ b/detectron2/layers/roi_align.py
@@ -0,0 +1,74 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from torch import nn
+from torchvision.ops import roi_align
+
+
+# NOTE: torchvision's RoIAlign has a different default aligned=False
+class ROIAlign(nn.Module):
+    def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True):
+        """
+        Args:
+            output_size (tuple): h, w
+            spatial_scale (float): scale the input boxes by this number
+            sampling_ratio (int): number of inputs samples to take for each output
+                sample. 0 to take samples densely.
+            aligned (bool): if False, use the legacy implementation in
+                Detectron. If True, align the results more perfectly.
+
+        Note:
+            The meaning of aligned=True:
+
+            Given a continuous coordinate c, its two neighboring pixel indices (in our
+            pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
+            c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
+            from the underlying signal at continuous coordinates 0.5 and 1.5). But the original
+            roi_align (aligned=False) does not subtract the 0.5 when computing neighboring
+            pixel indices and therefore it uses pixels with a slightly incorrect alignment
+            (relative to our pixel model) when performing bilinear interpolation.
+
+            With `aligned=True`,
+            we first appropriately scale the ROI and then shift it by -0.5
+            prior to calling roi_align. This produces the correct neighbors; see
+            detectron2/tests/test_roi_align.py for verification.
+
+            The difference does not make a difference to the model's performance if
+            ROIAlign is used together with conv layers.
+        """
+        super().__init__()
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+        self.sampling_ratio = sampling_ratio
+        self.aligned = aligned
+
+        from torchvision import __version__
+
+        version = tuple(int(x) for x in __version__.split(".")[:2])
+        # https://github.com/pytorch/vision/pull/2438
+        assert version >= (0, 7), "Require torchvision >= 0.7"
+
+    def forward(self, input, rois):
+        """
+        Args:
+            input: NCHW images
+            rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
+        """
+        assert rois.dim() == 2 and rois.size(1) == 5
+        if input.is_quantized:
+            input = input.dequantize()
+        return roi_align(
+            input,
+            rois.to(dtype=input.dtype),
+            self.output_size,
+            self.spatial_scale,
+            self.sampling_ratio,
+            self.aligned,
+        )
+
+    def __repr__(self):
+        tmpstr = self.__class__.__name__ + "("
+        tmpstr += "output_size=" + str(self.output_size)
+        tmpstr += ", spatial_scale=" + str(self.spatial_scale)
+        tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
+        tmpstr += ", aligned=" + str(self.aligned)
+        tmpstr += ")"
+        return tmpstr
diff --git a/detectron2/layers/roi_align_rotated.py b/detectron2/layers/roi_align_rotated.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a523992e7c736262ad5a158f209aae7875f6f0b
--- /dev/null
+++ b/detectron2/layers/roi_align_rotated.py
@@ -0,0 +1,100 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import torch
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+
+class _ROIAlignRotated(Function):
+    @staticmethod
+    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
+        ctx.save_for_backward(roi)
+        ctx.output_size = _pair(output_size)
+        ctx.spatial_scale = spatial_scale
+        ctx.sampling_ratio = sampling_ratio
+        ctx.input_shape = input.size()
+        output = torch.ops.detectron2.roi_align_rotated_forward(
+            input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio
+        )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        (rois,) = ctx.saved_tensors
+        output_size = ctx.output_size
+        spatial_scale = ctx.spatial_scale
+        sampling_ratio = ctx.sampling_ratio
+        bs, ch, h, w = ctx.input_shape
+        grad_input = torch.ops.detectron2.roi_align_rotated_backward(
+            grad_output,
+            rois,
+            spatial_scale,
+            output_size[0],
+            output_size[1],
+            bs,
+            ch,
+            h,
+            w,
+            sampling_ratio,
+        )
+        return grad_input, None, None, None, None, None
+
+
+roi_align_rotated = _ROIAlignRotated.apply
+
+
+class ROIAlignRotated(nn.Module):
+    def __init__(self, output_size, spatial_scale, sampling_ratio):
+        """
+        Args:
+            output_size (tuple): h, w
+            spatial_scale (float): scale the input boxes by this number
+            sampling_ratio (int): number of inputs samples to take for each output
+                sample. 0 to take samples densely.
+
+        Note:
+            ROIAlignRotated supports continuous coordinate by default:
+            Given a continuous coordinate c, its two neighboring pixel indices (in our
+            pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
+            c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
+            from the underlying signal at continuous coordinates 0.5 and 1.5).
+        """
+        super(ROIAlignRotated, self).__init__()
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+        self.sampling_ratio = sampling_ratio
+
+    def forward(self, input, rois):
+        """
+        Args:
+            input: NCHW images
+            rois: Bx6 boxes. First column is the index into N.
+                The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees).
+        """
+        assert rois.dim() == 2 and rois.size(1) == 6
+        orig_dtype = input.dtype
+        if orig_dtype == torch.float16:
+            input = input.float()
+            rois = rois.float()
+        output_size = _pair(self.output_size)
+
+        # Scripting for Autograd is currently unsupported.
+        # This is a quick fix without having to rewrite code on the C++ side
+        if torch.jit.is_scripting() or torch.jit.is_tracing():
+            return torch.ops.detectron2.roi_align_rotated_forward(
+                input, rois, self.spatial_scale, output_size[0], output_size[1], self.sampling_ratio
+            ).to(dtype=orig_dtype)
+
+        return roi_align_rotated(
+            input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
+        ).to(dtype=orig_dtype)
+
+    def __repr__(self):
+        tmpstr = self.__class__.__name__ + "("
+        tmpstr += "output_size=" + str(self.output_size)
+        tmpstr += ", spatial_scale=" + str(self.spatial_scale)
+        tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
+        tmpstr += ")"
+        return tmpstr
diff --git a/detectron2/layers/rotated_boxes.py b/detectron2/layers/rotated_boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..03f73b3bb99275931a887ad9b2d8c0ac9f412bf3
--- /dev/null
+++ b/detectron2/layers/rotated_boxes.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from __future__ import absolute_import, division, print_function, unicode_literals
+import torch
+
+
+def pairwise_iou_rotated(boxes1, boxes2):
+    """
+    Return intersection-over-union (Jaccard index) of boxes.
+
+    Both sets of boxes are expected to be in
+    (x_center, y_center, width, height, angle) format.
+
+    Arguments:
+        boxes1 (Tensor[N, 5])
+        boxes2 (Tensor[M, 5])
+
+    Returns:
+        iou (Tensor[N, M]): the NxM matrix containing the pairwise
+            IoU values for every element in boxes1 and boxes2
+    """
+    return torch.ops.detectron2.box_iou_rotated(boxes1, boxes2)
diff --git a/detectron2/layers/shape_spec.py b/detectron2/layers/shape_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dac3c59b96576710656abebe9b5eac25868abbb
--- /dev/null
+++ b/detectron2/layers/shape_spec.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class ShapeSpec:
+    """
+    A simple structure that contains basic shape specification about a tensor.
+    It is often used as the auxiliary inputs/outputs of models,
+    to complement the lack of shape inference ability among pytorch modules.
+    """
+
+    channels: Optional[int] = None
+    height: Optional[int] = None
+    width: Optional[int] = None
+    stride: Optional[int] = None
diff --git a/detectron2/layers/wrappers.py b/detectron2/layers/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb3cb38b9de0d936bc3774b85eec7375f739add2
--- /dev/null
+++ b/detectron2/layers/wrappers.py
@@ -0,0 +1,162 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+Wrappers around on some nn functions, mainly to support empty tensors.
+
+Ideally, add support directly in PyTorch to empty tensors in those functions.
+
+These can be removed once https://github.com/pytorch/pytorch/issues/12013
+is implemented
+"""
+
+import warnings
+from typing import List, Optional
+import torch
+from torch.nn import functional as F
+
+from detectron2.utils.env import TORCH_VERSION
+
+
+def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor:
+    """
+    Turn a list of integer scalars or integer Tensor scalars into a vector,
+    in a way that's both traceable and scriptable.
+
+    In tracing, `x` should be a list of scalar Tensor, so the output can trace to the inputs.
+    In scripting or eager, `x` should be a list of int.
+    """
+    if torch.jit.is_scripting():
+        return torch.as_tensor(x, device=device)
+    if torch.jit.is_tracing():
+        assert all(
+            [isinstance(t, torch.Tensor) for t in x]
+        ), "Shape should be tensor during tracing!"
+        # as_tensor should not be used in tracing because it records a constant
+        ret = torch.stack(x)
+        if ret.device != device:  # avoid recording a hard-coded device if not necessary
+            ret = ret.to(device=device)
+        return ret
+    return torch.as_tensor(x, device=device)
+
+
+def check_if_dynamo_compiling():
+    if TORCH_VERSION >= (1, 14):
+        from torch._dynamo import is_compiling
+
+        return is_compiling()
+    else:
+        return False
+
+
+def cat(tensors: List[torch.Tensor], dim: int = 0):
+    """
+    Efficient version of torch.cat that avoids a copy if there is only a single element in a list
+    """
+    assert isinstance(tensors, (list, tuple))
+    if len(tensors) == 1:
+        return tensors[0]
+    return torch.cat(tensors, dim)
+
+
+def empty_input_loss_func_wrapper(loss_func):
+    def wrapped_loss_func(input, target, *, reduction="mean", **kwargs):
+        """
+        Same as `loss_func`, but returns 0 (instead of nan) for empty inputs.
+        """
+        if target.numel() == 0 and reduction == "mean":
+            return input.sum() * 0.0  # connect the gradient
+        return loss_func(input, target, reduction=reduction, **kwargs)
+
+    return wrapped_loss_func
+
+
+cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy)
+
+
+class _NewEmptyTensorOp(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, new_shape):
+        ctx.shape = x.shape
+        return x.new_empty(new_shape)
+
+    @staticmethod
+    def backward(ctx, grad):
+        shape = ctx.shape
+        return _NewEmptyTensorOp.apply(grad, shape), None
+
+
+class Conv2d(torch.nn.Conv2d):
+    """
+    A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:
+
+        Args:
+            norm (nn.Module, optional): a normalization layer
+            activation (callable(Tensor) -> Tensor): a callable activation function
+
+        It assumes that norm layer is used before activation.
+        """
+        norm = kwargs.pop("norm", None)
+        activation = kwargs.pop("activation", None)
+        super().__init__(*args, **kwargs)
+
+        self.norm = norm
+        self.activation = activation
+
+    def forward(self, x):
+        # torchscript does not support SyncBatchNorm yet
+        # https://github.com/pytorch/pytorch/issues/40507
+        # and we skip these codes in torchscript since:
+        # 1. currently we only support torchscript in evaluation mode
+        # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or
+        # later version, `Conv2d` in these PyTorch versions has already supported empty inputs.
+        if not torch.jit.is_scripting():
+            # Dynamo doesn't support context managers yet
+            is_dynamo_compiling = check_if_dynamo_compiling()
+            if not is_dynamo_compiling:
+                with warnings.catch_warnings(record=True):
+                    if x.numel() == 0 and self.training:
+                        # https://github.com/pytorch/pytorch/issues/12013
+                        assert not isinstance(
+                            self.norm, torch.nn.SyncBatchNorm
+                        ), "SyncBatchNorm does not support empty inputs!"
+
+        x = F.conv2d(
+            x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
+        )
+        if self.norm is not None:
+            x = self.norm(x)
+        if self.activation is not None:
+            x = self.activation(x)
+        return x
+
+
+ConvTranspose2d = torch.nn.ConvTranspose2d
+BatchNorm2d = torch.nn.BatchNorm2d
+interpolate = F.interpolate
+Linear = torch.nn.Linear
+
+
+def nonzero_tuple(x):
+    """
+    A 'as_tuple=True' version of torch.nonzero to support torchscript.
+    because of https://github.com/pytorch/pytorch/issues/38718
+    """
+    if torch.jit.is_scripting():
+        if x.dim() == 0:
+            return x.unsqueeze(0).nonzero().unbind(1)
+        return x.nonzero().unbind(1)
+    else:
+        return x.nonzero(as_tuple=True)
+
+
+@torch.jit.script_if_tracing
+def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor:
+    """
+    Tracing friendly way to cast tensor to another tensor's device. Device will be treated
+    as constant during tracing, scripting the casting process as whole can workaround this issue.
+    """
+    return src.to(dst.device)
diff --git a/detectron2/model_zoo/__init__.py b/detectron2/model_zoo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6204208198d813728cf6419e8eef4a733f20c18f
--- /dev/null
+++ b/detectron2/model_zoo/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+Model Zoo API for Detectron2: a collection of functions to create common model architectures
+listed in `MODEL_ZOO.md <https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md>`_,
+and optionally load their pre-trained weights.
+"""
+
+from .model_zoo import get, get_config_file, get_checkpoint_url, get_config
+
+__all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"]
diff --git a/detectron2/model_zoo/model_zoo.py b/detectron2/model_zoo/model_zoo.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e80ffd48f2469287ed091527f72e39766136469
--- /dev/null
+++ b/detectron2/model_zoo/model_zoo.py
@@ -0,0 +1,213 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import os
+from typing import Optional
+import pkg_resources
+import torch
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
+from detectron2.modeling import build_model
+
+
+class _ModelZooUrls:
+    """
+    Mapping from names to officially released Detectron2 pre-trained models.
+    """
+
+    S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
+
+    # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl
+    CONFIG_PATH_TO_URL_SUFFIX = {
+        # COCO Detection with Faster R-CNN
+        "COCO-Detection/faster_rcnn_R_50_C4_1x": "137257644/model_final_721ade.pkl",
+        "COCO-Detection/faster_rcnn_R_50_DC5_1x": "137847829/model_final_51d356.pkl",
+        "COCO-Detection/faster_rcnn_R_50_FPN_1x": "137257794/model_final_b275ba.pkl",
+        "COCO-Detection/faster_rcnn_R_50_C4_3x": "137849393/model_final_f97cb7.pkl",
+        "COCO-Detection/faster_rcnn_R_50_DC5_3x": "137849425/model_final_68d202.pkl",
+        "COCO-Detection/faster_rcnn_R_50_FPN_3x": "137849458/model_final_280758.pkl",
+        "COCO-Detection/faster_rcnn_R_101_C4_3x": "138204752/model_final_298dad.pkl",
+        "COCO-Detection/faster_rcnn_R_101_DC5_3x": "138204841/model_final_3e0943.pkl",
+        "COCO-Detection/faster_rcnn_R_101_FPN_3x": "137851257/model_final_f6e8b1.pkl",
+        "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x": "139173657/model_final_68b088.pkl",
+        # COCO Detection with RetinaNet
+        "COCO-Detection/retinanet_R_50_FPN_1x": "190397773/model_final_bfca0b.pkl",
+        "COCO-Detection/retinanet_R_50_FPN_3x": "190397829/model_final_5bd44e.pkl",
+        "COCO-Detection/retinanet_R_101_FPN_3x": "190397697/model_final_971ab9.pkl",
+        # COCO Detection with RPN and Fast R-CNN
+        "COCO-Detection/rpn_R_50_C4_1x": "137258005/model_final_450694.pkl",
+        "COCO-Detection/rpn_R_50_FPN_1x": "137258492/model_final_02ce48.pkl",
+        "COCO-Detection/fast_rcnn_R_50_FPN_1x": "137635226/model_final_e5f7ce.pkl",
+        # COCO Instance Segmentation Baselines with Mask R-CNN
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x": "137259246/model_final_9243eb.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x": "137260150/model_final_4f86c3.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "137260431/model_final_a54504.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x": "137849525/model_final_4ce675.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x": "137849551/model_final_84107b.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x": "137849600/model_final_f10217.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x": "138363239/model_final_a2914c.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x": "138363294/model_final_0464b7.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x": "138205316/model_final_a3ec72.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x": "139653917/model_final_2d9806.pkl",  # noqa
+        # New baselines using Large-Scale Jitter and Longer Training Schedule
+        "new_baselines/mask_rcnn_R_50_FPN_100ep_LSJ": "42047764/model_final_bb69de.pkl",
+        "new_baselines/mask_rcnn_R_50_FPN_200ep_LSJ": "42047638/model_final_89a8d3.pkl",
+        "new_baselines/mask_rcnn_R_50_FPN_400ep_LSJ": "42019571/model_final_14d201.pkl",
+        "new_baselines/mask_rcnn_R_101_FPN_100ep_LSJ": "42025812/model_final_4f7b58.pkl",
+        "new_baselines/mask_rcnn_R_101_FPN_200ep_LSJ": "42131867/model_final_0bb7ae.pkl",
+        "new_baselines/mask_rcnn_R_101_FPN_400ep_LSJ": "42073830/model_final_f96b26.pkl",
+        "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_100ep_LSJ": "42047771/model_final_b7fbab.pkl",  # noqa
+        "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_200ep_LSJ": "42132721/model_final_5d87c1.pkl",  # noqa
+        "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_400ep_LSJ": "42025447/model_final_f1362d.pkl",  # noqa
+        "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_100ep_LSJ": "42047784/model_final_6ba57e.pkl",  # noqa
+        "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_200ep_LSJ": "42047642/model_final_27b9c1.pkl",  # noqa
+        "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ": "42045954/model_final_ef3a80.pkl",  # noqa
+        # COCO Person Keypoint Detection Baselines with Keypoint R-CNN
+        "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x": "137261548/model_final_04e291.pkl",
+        "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x": "137849621/model_final_a6e10b.pkl",
+        "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x": "138363331/model_final_997cc7.pkl",
+        "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x": "139686956/model_final_5ad38f.pkl",
+        # COCO Panoptic Segmentation Baselines with Panoptic FPN
+        "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x": "139514544/model_final_dbfeb4.pkl",
+        "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x": "139514569/model_final_c10459.pkl",
+        "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x": "139514519/model_final_cafdb1.pkl",
+        # LVIS Instance Segmentation Baselines with Mask R-CNN
+        "LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "144219072/model_final_571f7c.pkl",  # noqa
+        "LVISv0.5-InstanceSegmentation/mask_rcnn_R_101_FPN_1x": "144219035/model_final_824ab5.pkl",  # noqa
+        "LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x": "144219108/model_final_5e3439.pkl",  # noqa
+        # Cityscapes & Pascal VOC Baselines
+        "Cityscapes/mask_rcnn_R_50_FPN": "142423278/model_final_af9cf5.pkl",
+        "PascalVOC-Detection/faster_rcnn_R_50_C4": "142202221/model_final_b1acc2.pkl",
+        # Other Settings
+        "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5": "138602867/model_final_65c703.pkl",
+        "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5": "144998336/model_final_821d0b.pkl",
+        "Misc/cascade_mask_rcnn_R_50_FPN_1x": "138602847/model_final_e9d89b.pkl",
+        "Misc/cascade_mask_rcnn_R_50_FPN_3x": "144998488/model_final_480dd8.pkl",
+        "Misc/mask_rcnn_R_50_FPN_3x_syncbn": "169527823/model_final_3b3c51.pkl",
+        "Misc/mask_rcnn_R_50_FPN_3x_gn": "138602888/model_final_dc5d9e.pkl",
+        "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn": "138602908/model_final_01ca85.pkl",
+        "Misc/scratch_mask_rcnn_R_50_FPN_9x_gn": "183808979/model_final_da7b4c.pkl",
+        "Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn": "184226666/model_final_5ce33e.pkl",
+        "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x": "139797668/model_final_be35db.pkl",
+        "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv": "18131413/model_0039999_e76410.pkl",  # noqa
+        # D1 Comparisons
+        "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x": "137781054/model_final_7ab50c.pkl",  # noqa
+        "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x": "137781281/model_final_62ca52.pkl",  # noqa
+        "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x": "137781195/model_final_cce136.pkl",
+    }
+
+    @staticmethod
+    def query(config_path: str) -> Optional[str]:
+        """
+        Args:
+            config_path: relative config filename
+        """
+        name = config_path.replace(".yaml", "").replace(".py", "")
+        if name in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX:
+            suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[name]
+            return _ModelZooUrls.S3_PREFIX + name + "/" + suffix
+        return None
+
+
+def get_checkpoint_url(config_path):
+    """
+    Returns the URL to the model trained using the given config
+
+    Args:
+        config_path (str): config file name relative to detectron2's "configs/"
+            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
+
+    Returns:
+        str: a URL to the model
+    """
+    url = _ModelZooUrls.query(config_path)
+    if url is None:
+        raise RuntimeError("Pretrained model for {} is not available!".format(config_path))
+    return url
+
+
+def get_config_file(config_path):
+    """
+    Returns path to a builtin config file.
+
+    Args:
+        config_path (str): config file name relative to detectron2's "configs/"
+            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
+
+    Returns:
+        str: the real path to the config file.
+    """
+    cfg_file = pkg_resources.resource_filename(
+        "detectron2.model_zoo", os.path.join("configs", config_path)
+    )
+    if not os.path.exists(cfg_file):
+        raise RuntimeError("{} not available in Model Zoo!".format(config_path))
+    return cfg_file
+
+
+def get_config(config_path, trained: bool = False):
+    """
+    Returns a config object for a model in model zoo.
+
+    Args:
+        config_path (str): config file name relative to detectron2's "configs/"
+            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
+        trained (bool): If True, will set ``MODEL.WEIGHTS`` to trained model zoo weights.
+            If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used
+            instead; this will typically (though not always) initialize a subset of weights using
+            an ImageNet pre-trained model, while randomly initializing the other weights.
+
+    Returns:
+        CfgNode or omegaconf.DictConfig: a config object
+    """
+    cfg_file = get_config_file(config_path)
+    if cfg_file.endswith(".yaml"):
+        cfg = get_cfg()
+        cfg.merge_from_file(cfg_file)
+        if trained:
+            cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path)
+        return cfg
+    elif cfg_file.endswith(".py"):
+        cfg = LazyConfig.load(cfg_file)
+        if trained:
+            url = get_checkpoint_url(config_path)
+            if "train" in cfg and "init_checkpoint" in cfg.train:
+                cfg.train.init_checkpoint = url
+            else:
+                raise NotImplementedError
+        return cfg
+
+
+def get(config_path, trained: bool = False, device: Optional[str] = None):
+    """
+    Get a model specified by relative path under Detectron2's official ``configs/`` directory.
+
+    Args:
+        config_path (str): config file name relative to detectron2's "configs/"
+            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
+        trained (bool): see :func:`get_config`.
+        device (str or None): overwrite the device in config, if given.
+
+    Returns:
+        nn.Module: a detectron2 model. Will be in training mode.
+
+    Example:
+    ::
+        from detectron2 import model_zoo
+        model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
+    """
+    cfg = get_config(config_path, trained)
+    if device is None and not torch.cuda.is_available():
+        device = "cpu"
+    if device is not None and isinstance(cfg, CfgNode):
+        cfg.MODEL.DEVICE = device
+
+    if isinstance(cfg, CfgNode):
+        model = build_model(cfg)
+        DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
+    else:
+        model = instantiate(cfg.model)
+        if device is not None:
+            model = model.to(device)
+        if "train" in cfg and "init_checkpoint" in cfg.train:
+            DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
+    return model
diff --git a/detectron2/modeling/__init__.py b/detectron2/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d949e222b5e94bef7deac65dadf21dd0e466c5d
--- /dev/null
+++ b/detectron2/modeling/__init__.py
@@ -0,0 +1,64 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from detectron2.layers import ShapeSpec
+
+from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY
+from .backbone import (
+    BACKBONE_REGISTRY,
+    FPN,
+    Backbone,
+    ResNet,
+    ResNetBlockBase,
+    build_backbone,
+    build_resnet_backbone,
+    make_stage,
+    ViT,
+    SimpleFeaturePyramid,
+    get_vit_lr_decay_rate,
+    MViT,
+    SwinTransformer,
+)
+from .meta_arch import (
+    META_ARCH_REGISTRY,
+    SEM_SEG_HEADS_REGISTRY,
+    GeneralizedRCNN,
+    PanopticFPN,
+    ProposalNetwork,
+    RetinaNet,
+    SemanticSegmentor,
+    build_model,
+    build_sem_seg_head,
+    FCOS,
+)
+from .postprocessing import detector_postprocess
+from .proposal_generator import (
+    PROPOSAL_GENERATOR_REGISTRY,
+    build_proposal_generator,
+    RPN_HEAD_REGISTRY,
+    build_rpn_head,
+)
+from .roi_heads import (
+    ROI_BOX_HEAD_REGISTRY,
+    ROI_HEADS_REGISTRY,
+    ROI_KEYPOINT_HEAD_REGISTRY,
+    ROI_MASK_HEAD_REGISTRY,
+    ROIHeads,
+    StandardROIHeads,
+    BaseMaskRCNNHead,
+    BaseKeypointRCNNHead,
+    FastRCNNOutputLayers,
+    build_box_head,
+    build_keypoint_head,
+    build_mask_head,
+    build_roi_heads,
+)
+from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA
+from .mmdet_wrapper import MMDetBackbone, MMDetDetector
+
+_EXCLUDE = {"ShapeSpec"}
+__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]
+
+
+from detectron2.utils.env import fixup_module_metadata
+
+fixup_module_metadata(__name__, globals(), __all__)
+del fixup_module_metadata
diff --git a/detectron2/modeling/__pycache__/__init__.cpython-310.pyc b/detectron2/modeling/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ed02f68ceb00e45378d9b8b13a3c42ff359e154
Binary files /dev/null and b/detectron2/modeling/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/modeling/__pycache__/__init__.cpython-39.pyc b/detectron2/modeling/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57d9a762366f59c8c2551d2674a1a850cebc9638
Binary files /dev/null and b/detectron2/modeling/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/modeling/__pycache__/anchor_generator.cpython-310.pyc b/detectron2/modeling/__pycache__/anchor_generator.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7adddf973a5db71649b97f33b2ca2a6c5f411d17
Binary files /dev/null and b/detectron2/modeling/__pycache__/anchor_generator.cpython-310.pyc differ
diff --git a/detectron2/modeling/__pycache__/anchor_generator.cpython-39.pyc b/detectron2/modeling/__pycache__/anchor_generator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9334faf6781388c4df4d9a13414dcb50b875a110
Binary files /dev/null and b/detectron2/modeling/__pycache__/anchor_generator.cpython-39.pyc differ
diff --git a/detectron2/modeling/__pycache__/box_regression.cpython-310.pyc b/detectron2/modeling/__pycache__/box_regression.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e650c0c2abeaf42859b9416a1ba2a0d85a54937
Binary files /dev/null and b/detectron2/modeling/__pycache__/box_regression.cpython-310.pyc differ
diff --git a/detectron2/modeling/__pycache__/box_regression.cpython-39.pyc b/detectron2/modeling/__pycache__/box_regression.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0dca03ffbb265e48eb9d5965500039cf56dac63
Binary files /dev/null and b/detectron2/modeling/__pycache__/box_regression.cpython-39.pyc differ
diff --git a/detectron2/modeling/__pycache__/matcher.cpython-310.pyc b/detectron2/modeling/__pycache__/matcher.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..519765b49d5439927e83a52819246152c432a596
Binary files /dev/null and b/detectron2/modeling/__pycache__/matcher.cpython-310.pyc differ
diff --git a/detectron2/modeling/__pycache__/matcher.cpython-39.pyc b/detectron2/modeling/__pycache__/matcher.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a11e2aee273a24828762697912514a7add831ae
Binary files /dev/null and b/detectron2/modeling/__pycache__/matcher.cpython-39.pyc differ
diff --git a/detectron2/modeling/__pycache__/mmdet_wrapper.cpython-310.pyc b/detectron2/modeling/__pycache__/mmdet_wrapper.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fe8a6df787e6e3421ebd2e9de856d05b7d14fb5
Binary files /dev/null and b/detectron2/modeling/__pycache__/mmdet_wrapper.cpython-310.pyc differ
diff --git a/detectron2/modeling/__pycache__/mmdet_wrapper.cpython-39.pyc b/detectron2/modeling/__pycache__/mmdet_wrapper.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fc62281ad1ecfcde18e92f27975a0a5a6b3f9d7
Binary files /dev/null and b/detectron2/modeling/__pycache__/mmdet_wrapper.cpython-39.pyc differ
diff --git a/detectron2/modeling/__pycache__/poolers.cpython-310.pyc b/detectron2/modeling/__pycache__/poolers.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec0aaa5fbfb836f14a7da6d64f04d6805cb3509a
Binary files /dev/null and b/detectron2/modeling/__pycache__/poolers.cpython-310.pyc differ
diff --git a/detectron2/modeling/__pycache__/poolers.cpython-39.pyc b/detectron2/modeling/__pycache__/poolers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43424109177a71ac562cc00680f241cf19f8cbee
Binary files /dev/null and b/detectron2/modeling/__pycache__/poolers.cpython-39.pyc differ
diff --git a/detectron2/modeling/__pycache__/postprocessing.cpython-310.pyc b/detectron2/modeling/__pycache__/postprocessing.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1656ce80403549b64f65c334bddd94e60e703392
Binary files /dev/null and b/detectron2/modeling/__pycache__/postprocessing.cpython-310.pyc differ
diff --git a/detectron2/modeling/__pycache__/postprocessing.cpython-39.pyc b/detectron2/modeling/__pycache__/postprocessing.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1f0a14bbadec2eeaf571be606aa2ad3d0dedd15
Binary files /dev/null and b/detectron2/modeling/__pycache__/postprocessing.cpython-39.pyc differ
diff --git a/detectron2/modeling/__pycache__/sampling.cpython-310.pyc b/detectron2/modeling/__pycache__/sampling.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bfd81fba990ffe8f9af77a0149eeb1f1ae02fde
Binary files /dev/null and b/detectron2/modeling/__pycache__/sampling.cpython-310.pyc differ
diff --git a/detectron2/modeling/__pycache__/sampling.cpython-39.pyc b/detectron2/modeling/__pycache__/sampling.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..083a101749c61ccfbdb438203bdad1350311dc30
Binary files /dev/null and b/detectron2/modeling/__pycache__/sampling.cpython-39.pyc differ
diff --git a/detectron2/modeling/__pycache__/test_time_augmentation.cpython-310.pyc b/detectron2/modeling/__pycache__/test_time_augmentation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1eb08c7b6a075fb4f499ece875f7c9de0533b2d
Binary files /dev/null and b/detectron2/modeling/__pycache__/test_time_augmentation.cpython-310.pyc differ
diff --git a/detectron2/modeling/__pycache__/test_time_augmentation.cpython-39.pyc b/detectron2/modeling/__pycache__/test_time_augmentation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04b8e2f2118e26d3b62cc549e237b968b4f2e59d
Binary files /dev/null and b/detectron2/modeling/__pycache__/test_time_augmentation.cpython-39.pyc differ
diff --git a/detectron2/modeling/anchor_generator.py b/detectron2/modeling/anchor_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac94e72396ba61778c102133218bb5defe5b4413
--- /dev/null
+++ b/detectron2/modeling/anchor_generator.py
@@ -0,0 +1,386 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import collections
+import math
+from typing import List
+import torch
+from torch import nn
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec, move_device_like
+from detectron2.structures import Boxes, RotatedBoxes
+from detectron2.utils.registry import Registry
+
+ANCHOR_GENERATOR_REGISTRY = Registry("ANCHOR_GENERATOR")
+ANCHOR_GENERATOR_REGISTRY.__doc__ = """
+Registry for modules that creates object detection anchors for feature maps.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+class BufferList(nn.Module):
+    """
+    Similar to nn.ParameterList, but for buffers
+    """
+
+    def __init__(self, buffers):
+        super().__init__()
+        for i, buffer in enumerate(buffers):
+            # Use non-persistent buffer so the values are not saved in checkpoint
+            self.register_buffer(str(i), buffer, persistent=False)
+
+    def __len__(self):
+        return len(self._buffers)
+
+    def __iter__(self):
+        return iter(self._buffers.values())
+
+
+def _create_grid_offsets(
+    size: List[int], stride: int, offset: float, target_device_tensor: torch.Tensor
+):
+    grid_height, grid_width = size
+    shifts_x = move_device_like(
+        torch.arange(offset * stride, grid_width * stride, step=stride, dtype=torch.float32),
+        target_device_tensor,
+    )
+    shifts_y = move_device_like(
+        torch.arange(offset * stride, grid_height * stride, step=stride, dtype=torch.float32),
+        target_device_tensor,
+    )
+
+    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
+    shift_x = shift_x.reshape(-1)
+    shift_y = shift_y.reshape(-1)
+    return shift_x, shift_y
+
+
+def _broadcast_params(params, num_features, name):
+    """
+    If one size (or aspect ratio) is specified and there are multiple feature
+    maps, we "broadcast" anchors of that single size (or aspect ratio)
+    over all feature maps.
+
+    If params is list[float], or list[list[float]] with len(params) == 1, repeat
+    it num_features time.
+
+    Returns:
+        list[list[float]]: param for each feature
+    """
+    assert isinstance(
+        params, collections.abc.Sequence
+    ), f"{name} in anchor generator has to be a list! Got {params}."
+    assert len(params), f"{name} in anchor generator cannot be empty!"
+    if not isinstance(params[0], collections.abc.Sequence):  # params is list[float]
+        return [params] * num_features
+    if len(params) == 1:
+        return list(params) * num_features
+    assert len(params) == num_features, (
+        f"Got {name} of length {len(params)} in anchor generator, "
+        f"but the number of input features is {num_features}!"
+    )
+    return params
+
+
+@ANCHOR_GENERATOR_REGISTRY.register()
+class DefaultAnchorGenerator(nn.Module):
+    """
+    Compute anchors in the standard ways described in
+    "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks".
+    """
+
+    box_dim: torch.jit.Final[int] = 4
+    """
+    the dimension of each anchor box.
+    """
+
+    @configurable
+    def __init__(self, *, sizes, aspect_ratios, strides, offset=0.5):
+        """
+        This interface is experimental.
+
+        Args:
+            sizes (list[list[float]] or list[float]):
+                If ``sizes`` is list[list[float]], ``sizes[i]`` is the list of anchor sizes
+                (i.e. sqrt of anchor area) to use for the i-th feature map.
+                If ``sizes`` is list[float], ``sizes`` is used for all feature maps.
+                Anchor sizes are given in absolute lengths in units of
+                the input image; they do not dynamically scale if the input image size changes.
+            aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
+                (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
+            strides (list[int]): stride of each input feature.
+            offset (float): Relative offset between the center of the first anchor and the top-left
+                corner of the image. Value has to be in [0, 1).
+                Recommend to use 0.5, which means half stride.
+        """
+        super().__init__()
+
+        self.strides = strides
+        self.num_features = len(self.strides)
+        sizes = _broadcast_params(sizes, self.num_features, "sizes")
+        aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
+        self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios)
+
+        self.offset = offset
+        assert 0.0 <= self.offset < 1.0, self.offset
+
+    @classmethod
+    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
+        return {
+            "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
+            "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
+            "strides": [x.stride for x in input_shape],
+            "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
+        }
+
+    def _calculate_anchors(self, sizes, aspect_ratios):
+        cell_anchors = [
+            self.generate_cell_anchors(s, a).float() for s, a in zip(sizes, aspect_ratios)
+        ]
+        return BufferList(cell_anchors)
+
+    @property
+    @torch.jit.unused
+    def num_cell_anchors(self):
+        """
+        Alias of `num_anchors`.
+        """
+        return self.num_anchors
+
+    @property
+    @torch.jit.unused
+    def num_anchors(self):
+        """
+        Returns:
+            list[int]: Each int is the number of anchors at every pixel
+                location, on that feature map.
+                For example, if at every pixel we use anchors of 3 aspect
+                ratios and 5 sizes, the number of anchors is 15.
+                (See also ANCHOR_GENERATOR.SIZES and ANCHOR_GENERATOR.ASPECT_RATIOS in config)
+
+                In standard RPN models, `num_anchors` on every feature map is the same.
+        """
+        return [len(cell_anchors) for cell_anchors in self.cell_anchors]
+
+    def _grid_anchors(self, grid_sizes: List[List[int]]):
+        """
+        Returns:
+            list[Tensor]: #featuremap tensors, each is (#locations x #cell_anchors) x 4
+        """
+        anchors = []
+        # buffers() not supported by torchscript. use named_buffers() instead
+        buffers: List[torch.Tensor] = [x[1] for x in self.cell_anchors.named_buffers()]
+        for size, stride, base_anchors in zip(grid_sizes, self.strides, buffers):
+            shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors)
+            shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
+
+            anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4))
+
+        return anchors
+
+    def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
+        """
+        Generate a tensor storing canonical anchor boxes, which are all anchor
+        boxes of different sizes and aspect_ratios centered at (0, 0).
+        We can later build the set of anchors for a full feature map by
+        shifting and tiling these tensors (see `meth:_grid_anchors`).
+
+        Args:
+            sizes (tuple[float]):
+            aspect_ratios (tuple[float]]):
+
+        Returns:
+            Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes
+                in XYXY format.
+        """
+
+        # This is different from the anchor generator defined in the original Faster R-CNN
+        # code or Detectron. They yield the same AP, however the old version defines cell
+        # anchors in a less natural way with a shift relative to the feature grid and
+        # quantization that results in slightly different sizes for different aspect ratios.
+        # See also https://github.com/facebookresearch/Detectron/issues/227
+
+        anchors = []
+        for size in sizes:
+            area = size**2.0
+            for aspect_ratio in aspect_ratios:
+                # s * s = w * h
+                # a = h / w
+                # ... some algebra ...
+                # w = sqrt(s * s / a)
+                # h = a * w
+                w = math.sqrt(area / aspect_ratio)
+                h = aspect_ratio * w
+                x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0
+                anchors.append([x0, y0, x1, y1])
+        return torch.tensor(anchors)
+
+    def forward(self, features: List[torch.Tensor]):
+        """
+        Args:
+            features (list[Tensor]): list of backbone feature maps on which to generate anchors.
+
+        Returns:
+            list[Boxes]: a list of Boxes containing all the anchors for each feature map
+                (i.e. the cell anchors repeated over all locations in the feature map).
+                The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
+                where Hi, Wi are resolution of the feature map divided by anchor stride.
+        """
+        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
+        anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
+        return [Boxes(x) for x in anchors_over_all_feature_maps]
+
+
+@ANCHOR_GENERATOR_REGISTRY.register()
+class RotatedAnchorGenerator(nn.Module):
+    """
+    Compute rotated anchors used by Rotated RPN (RRPN), described in
+    "Arbitrary-Oriented Scene Text Detection via Rotation Proposals".
+    """
+
+    box_dim: int = 5
+    """
+    the dimension of each anchor box.
+    """
+
+    @configurable
+    def __init__(self, *, sizes, aspect_ratios, strides, angles, offset=0.5):
+        """
+        This interface is experimental.
+
+        Args:
+            sizes (list[list[float]] or list[float]):
+                If sizes is list[list[float]], sizes[i] is the list of anchor sizes
+                (i.e. sqrt of anchor area) to use for the i-th feature map.
+                If sizes is list[float], the sizes are used for all feature maps.
+                Anchor sizes are given in absolute lengths in units of
+                the input image; they do not dynamically scale if the input image size changes.
+            aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
+                (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
+            strides (list[int]): stride of each input feature.
+            angles (list[list[float]] or list[float]): list of angles (in degrees CCW)
+                to use for anchors. Same "broadcast" rule for `sizes` applies.
+            offset (float): Relative offset between the center of the first anchor and the top-left
+                corner of the image. Value has to be in [0, 1).
+                Recommend to use 0.5, which means half stride.
+        """
+        super().__init__()
+
+        self.strides = strides
+        self.num_features = len(self.strides)
+        sizes = _broadcast_params(sizes, self.num_features, "sizes")
+        aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
+        angles = _broadcast_params(angles, self.num_features, "angles")
+        self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios, angles)
+
+        self.offset = offset
+        assert 0.0 <= self.offset < 1.0, self.offset
+
+    @classmethod
+    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
+        return {
+            "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
+            "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
+            "strides": [x.stride for x in input_shape],
+            "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
+            "angles": cfg.MODEL.ANCHOR_GENERATOR.ANGLES,
+        }
+
+    def _calculate_anchors(self, sizes, aspect_ratios, angles):
+        cell_anchors = [
+            self.generate_cell_anchors(size, aspect_ratio, angle).float()
+            for size, aspect_ratio, angle in zip(sizes, aspect_ratios, angles)
+        ]
+        return BufferList(cell_anchors)
+
+    @property
+    def num_cell_anchors(self):
+        """
+        Alias of `num_anchors`.
+        """
+        return self.num_anchors
+
+    @property
+    def num_anchors(self):
+        """
+        Returns:
+            list[int]: Each int is the number of anchors at every pixel
+                location, on that feature map.
+                For example, if at every pixel we use anchors of 3 aspect
+                ratios, 2 sizes and 5 angles, the number of anchors is 30.
+                (See also ANCHOR_GENERATOR.SIZES, ANCHOR_GENERATOR.ASPECT_RATIOS
+                and ANCHOR_GENERATOR.ANGLES in config)
+
+                In standard RRPN models, `num_anchors` on every feature map is the same.
+        """
+        return [len(cell_anchors) for cell_anchors in self.cell_anchors]
+
+    def _grid_anchors(self, grid_sizes):
+        anchors = []
+        for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
+            shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors)
+            zeros = torch.zeros_like(shift_x)
+            shifts = torch.stack((shift_x, shift_y, zeros, zeros, zeros), dim=1)
+
+            anchors.append((shifts.view(-1, 1, 5) + base_anchors.view(1, -1, 5)).reshape(-1, 5))
+
+        return anchors
+
+    def generate_cell_anchors(
+        self,
+        sizes=(32, 64, 128, 256, 512),
+        aspect_ratios=(0.5, 1, 2),
+        angles=(-90, -60, -30, 0, 30, 60, 90),
+    ):
+        """
+        Generate a tensor storing canonical anchor boxes, which are all anchor
+        boxes of different sizes, aspect_ratios, angles centered at (0, 0).
+        We can later build the set of anchors for a full feature map by
+        shifting and tiling these tensors (see `meth:_grid_anchors`).
+
+        Args:
+            sizes (tuple[float]):
+            aspect_ratios (tuple[float]]):
+            angles (tuple[float]]):
+
+        Returns:
+            Tensor of shape (len(sizes) * len(aspect_ratios) * len(angles), 5)
+                storing anchor boxes in (x_ctr, y_ctr, w, h, angle) format.
+        """
+        anchors = []
+        for size in sizes:
+            area = size**2.0
+            for aspect_ratio in aspect_ratios:
+                # s * s = w * h
+                # a = h / w
+                # ... some algebra ...
+                # w = sqrt(s * s / a)
+                # h = a * w
+                w = math.sqrt(area / aspect_ratio)
+                h = aspect_ratio * w
+                anchors.extend([0, 0, w, h, a] for a in angles)
+
+        return torch.tensor(anchors)
+
+    def forward(self, features):
+        """
+        Args:
+            features (list[Tensor]): list of backbone feature maps on which to generate anchors.
+
+        Returns:
+            list[RotatedBoxes]: a list of Boxes containing all the anchors for each feature map
+                (i.e. the cell anchors repeated over all locations in the feature map).
+                The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
+                where Hi, Wi are resolution of the feature map divided by anchor stride.
+        """
+        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
+        anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
+        return [RotatedBoxes(x) for x in anchors_over_all_feature_maps]
+
+
+def build_anchor_generator(cfg, input_shape):
+    """
+    Built an anchor generator from `cfg.MODEL.ANCHOR_GENERATOR.NAME`.
+    """
+    anchor_generator = cfg.MODEL.ANCHOR_GENERATOR.NAME
+    return ANCHOR_GENERATOR_REGISTRY.get(anchor_generator)(cfg, input_shape)
diff --git a/detectron2/modeling/backbone/__init__.py b/detectron2/modeling/backbone/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b3358a4061b143c78eba8e7bf81fe9f7ffac1aa
--- /dev/null
+++ b/detectron2/modeling/backbone/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .build import build_backbone, BACKBONE_REGISTRY  # noqa F401 isort:skip
+
+from .backbone import Backbone
+from .fpn import FPN
+from .regnet import RegNet
+from .resnet import (
+    BasicStem,
+    ResNet,
+    ResNetBlockBase,
+    build_resnet_backbone,
+    make_stage,
+    BottleneckBlock,
+)
+from .vit import ViT, SimpleFeaturePyramid, get_vit_lr_decay_rate
+from .mvit import MViT
+from .swin import SwinTransformer
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
+# TODO can expose more resnet blocks after careful consideration
diff --git a/detectron2/modeling/backbone/__pycache__/__init__.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e2b18b86c9470912276ee9367a214691c394e0f
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/__init__.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52473a216206804d726130f10b83c889f9775709
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/backbone.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/backbone.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59e12d94f2e3b9af19c63123d9d382839124b16a
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/backbone.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/backbone.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/backbone.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..816ded0ac7a6a8c422c97dd32240a399de669be4
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/backbone.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/build.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/build.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2917f8c6ba232addb81a6ffda883758007995082
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/build.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/build.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/build.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3fe8a2a69003e886401a187c23f1dcd7f345a6b7
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/build.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/fpn.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/fpn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..081f23178af1a2c2fb8be5be3362e118d53e04a9
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/fpn.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/fpn.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/fpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb3f0413cc7b3b4f5505174af6745019be2c12df
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/fpn.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/mvit.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/mvit.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..449a7ef26c67201d0f49bc7d4c2e55ee68df2ba4
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/mvit.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/mvit.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/mvit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b0bac3d6405b768a057d920a72487588251ac61
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/mvit.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/regnet.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/regnet.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10dab6440e35c517afbc2aea7e09f267eca17a7f
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/regnet.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/regnet.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/regnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2b1b63db45d6144f84349afa95af6f4ea79bb88
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/regnet.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/resnet.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/resnet.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2119acadb779cc2421aa371124bd4d223f6e50f3
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/resnet.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/resnet.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/resnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ddb6aad8bf1a7a99b3039aaf48a725c3fd915804
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/resnet.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/swin.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/swin.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82ea03f0b79eefaf17b30b7b9bed9c9438aafce8
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/swin.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/swin.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/swin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e0ac4d4450f9273f0987be6c3de953ecd3042e3
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/swin.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/utils.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3042d82794f7cdd8587944036c5e64d11ccfade2
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/utils.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/utils.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28de077e8f80e9c9513e3d186286d8628349ee10
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/utils.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/vit.cpython-310.pyc b/detectron2/modeling/backbone/__pycache__/vit.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f387255d58eb824c182cb894314803a89d6e841
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/vit.cpython-310.pyc differ
diff --git a/detectron2/modeling/backbone/__pycache__/vit.cpython-39.pyc b/detectron2/modeling/backbone/__pycache__/vit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b0c610450f471c3d2498e12a162b01f20ab85d8
Binary files /dev/null and b/detectron2/modeling/backbone/__pycache__/vit.cpython-39.pyc differ
diff --git a/detectron2/modeling/backbone/backbone.py b/detectron2/modeling/backbone/backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1c765a6b38542f66cae55216bba697a6626d128
--- /dev/null
+++ b/detectron2/modeling/backbone/backbone.py
@@ -0,0 +1,74 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from abc import ABCMeta, abstractmethod
+from typing import Dict
+import torch.nn as nn
+
+from detectron2.layers import ShapeSpec
+
+__all__ = ["Backbone"]
+
+
+class Backbone(nn.Module, metaclass=ABCMeta):
+    """
+    Abstract base class for network backbones.
+    """
+
+    def __init__(self):
+        """
+        The `__init__` method of any subclass can specify its own set of arguments.
+        """
+        super().__init__()
+
+    @abstractmethod
+    def forward(self):
+        """
+        Subclasses must override this method, but adhere to the same return type.
+
+        Returns:
+            dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
+        """
+        pass
+
+    @property
+    def size_divisibility(self) -> int:
+        """
+        Some backbones require the input height and width to be divisible by a
+        specific integer. This is typically true for encoder / decoder type networks
+        with lateral connection (e.g., FPN) for which feature maps need to match
+        dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
+        input size divisibility is required.
+        """
+        return 0
+
+    @property
+    def padding_constraints(self) -> Dict[str, int]:
+        """
+        This property is a generalization of size_divisibility. Some backbones and training
+        recipes require specific padding constraints, such as enforcing divisibility by a specific
+        integer (e.g., FPN) or padding to a square (e.g., ViTDet with large-scale jitter
+        in :paper:vitdet). `padding_constraints` contains these optional items like:
+        {
+            "size_divisibility": int,
+            "square_size": int,
+            # Future options are possible
+        }
+        `size_divisibility` will read from here if presented and `square_size` indicates the
+        square padding size if `square_size` > 0.
+
+        TODO: use type of Dict[str, int] to avoid torchscipt issues. The type of padding_constraints
+        could be generalized as TypedDict (Python 3.8+) to support more types in the future.
+        """
+        return {}
+
+    def output_shape(self):
+        """
+        Returns:
+            dict[str->ShapeSpec]
+        """
+        # this is a backward-compatible default
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
diff --git a/detectron2/modeling/backbone/build.py b/detectron2/modeling/backbone/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..af02141172bebe9a2a27a88c81673c2710b4d73f
--- /dev/null
+++ b/detectron2/modeling/backbone/build.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from detectron2.layers import ShapeSpec
+from detectron2.utils.registry import Registry
+
+from .backbone import Backbone
+
+BACKBONE_REGISTRY = Registry("BACKBONE")
+BACKBONE_REGISTRY.__doc__ = """
+Registry for backbones, which extract feature maps from images
+
+The registered object must be a callable that accepts two arguments:
+
+1. A :class:`detectron2.config.CfgNode`
+2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification.
+
+Registered object must return instance of :class:`Backbone`.
+"""
+
+
+def build_backbone(cfg, input_shape=None):
+    """
+    Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
+
+    Returns:
+        an instance of :class:`Backbone`
+    """
+    if input_shape is None:
+        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
+
+    backbone_name = cfg.MODEL.BACKBONE.NAME
+    backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape)
+    assert isinstance(backbone, Backbone)
+    return backbone
diff --git a/detectron2/modeling/backbone/fpn.py b/detectron2/modeling/backbone/fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..19d24e13f069ecb389edcdb4d9859506fe9e6f76
--- /dev/null
+++ b/detectron2/modeling/backbone/fpn.py
@@ -0,0 +1,268 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import math
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+
+from .backbone import Backbone
+from .build import BACKBONE_REGISTRY
+from .resnet import build_resnet_backbone
+
+__all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"]
+
+
+class FPN(Backbone):
+    """
+    This module implements :paper:`FPN`.
+    It creates pyramid features built on top of some input feature maps.
+    """
+
+    _fuse_type: torch.jit.Final[str]
+
+    def __init__(
+        self,
+        bottom_up,
+        in_features,
+        out_channels,
+        norm="",
+        top_block=None,
+        fuse_type="sum",
+        square_pad=0,
+    ):
+        """
+        Args:
+            bottom_up (Backbone): module representing the bottom up subnetwork.
+                Must be a subclass of :class:`Backbone`. The multi-scale feature
+                maps generated by the bottom up network, and listed in `in_features`,
+                are used to generate FPN levels.
+            in_features (list[str]): names of the input feature maps coming
+                from the backbone to which FPN is attached. For example, if the
+                backbone produces ["res2", "res3", "res4"], any *contiguous* sublist
+                of these may be used; order must be from high to low resolution.
+            out_channels (int): number of channels in the output feature maps.
+            norm (str): the normalization to use.
+            top_block (nn.Module or None): if provided, an extra operation will
+                be performed on the output of the last (smallest resolution)
+                FPN output, and the result will extend the result list. The top_block
+                further downsamples the feature map. It must have an attribute
+                "num_levels", meaning the number of extra FPN levels added by
+                this block, and "in_feature", which is a string representing
+                its input feature (e.g., p5).
+            fuse_type (str): types for fusing the top down features and the lateral
+                ones. It can be "sum" (default), which sums up element-wise; or "avg",
+                which takes the element-wise mean of the two.
+            square_pad (int): If > 0, require input images to be padded to specific square size.
+        """
+        super(FPN, self).__init__()
+        assert isinstance(bottom_up, Backbone)
+        assert in_features, in_features
+
+        # Feature map strides and channels from the bottom up network (e.g. ResNet)
+        input_shapes = bottom_up.output_shape()
+        strides = [input_shapes[f].stride for f in in_features]
+        in_channels_per_feature = [input_shapes[f].channels for f in in_features]
+
+        _assert_strides_are_log2_contiguous(strides)
+        lateral_convs = []
+        output_convs = []
+
+        use_bias = norm == ""
+        for idx, in_channels in enumerate(in_channels_per_feature):
+            lateral_norm = get_norm(norm, out_channels)
+            output_norm = get_norm(norm, out_channels)
+
+            lateral_conv = Conv2d(
+                in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm
+            )
+            output_conv = Conv2d(
+                out_channels,
+                out_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=use_bias,
+                norm=output_norm,
+            )
+            weight_init.c2_xavier_fill(lateral_conv)
+            weight_init.c2_xavier_fill(output_conv)
+            stage = int(math.log2(strides[idx]))
+            self.add_module("fpn_lateral{}".format(stage), lateral_conv)
+            self.add_module("fpn_output{}".format(stage), output_conv)
+
+            lateral_convs.append(lateral_conv)
+            output_convs.append(output_conv)
+        # Place convs into top-down order (from low to high resolution)
+        # to make the top-down computation in forward clearer.
+        self.lateral_convs = lateral_convs[::-1]
+        self.output_convs = output_convs[::-1]
+        self.top_block = top_block
+        self.in_features = tuple(in_features)
+        self.bottom_up = bottom_up
+        # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
+        self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
+        # top block output feature maps.
+        if self.top_block is not None:
+            for s in range(stage, stage + self.top_block.num_levels):
+                self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
+
+        self._out_features = list(self._out_feature_strides.keys())
+        self._out_feature_channels = {k: out_channels for k in self._out_features}
+        self._size_divisibility = strides[-1]
+        self._square_pad = square_pad
+        assert fuse_type in {"avg", "sum"}
+        self._fuse_type = fuse_type
+
+    @property
+    def size_divisibility(self):
+        return self._size_divisibility
+
+    @property
+    def padding_constraints(self):
+        return {"square_size": self._square_pad}
+
+    def forward(self, x):
+        """
+        Args:
+            input (dict[str->Tensor]): mapping feature map name (e.g., "res5") to
+                feature map tensor for each feature level in high to low resolution order.
+
+        Returns:
+            dict[str->Tensor]:
+                mapping from feature map name to FPN feature map tensor
+                in high to low resolution order. Returned feature names follow the FPN
+                paper convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
+                ["p2", "p3", ..., "p6"].
+        """
+        bottom_up_features = self.bottom_up(x)
+        results = []
+        prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]])
+        results.append(self.output_convs[0](prev_features))
+
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, (lateral_conv, output_conv) in enumerate(
+            zip(self.lateral_convs, self.output_convs)
+        ):
+            # Slicing of ModuleList is not supported https://github.com/pytorch/pytorch/issues/47336
+            # Therefore we loop over all modules but skip the first one
+            if idx > 0:
+                features = self.in_features[-idx - 1]
+                features = bottom_up_features[features]
+                top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest")
+                lateral_features = lateral_conv(features)
+                prev_features = lateral_features + top_down_features
+                if self._fuse_type == "avg":
+                    prev_features /= 2
+                results.insert(0, output_conv(prev_features))
+
+        if self.top_block is not None:
+            if self.top_block.in_feature in bottom_up_features:
+                top_block_in_feature = bottom_up_features[self.top_block.in_feature]
+            else:
+                top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
+            results.extend(self.top_block(top_block_in_feature))
+        assert len(self._out_features) == len(results)
+        return {f: res for f, res in zip(self._out_features, results)}
+
+    def output_shape(self):
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
+
+
+def _assert_strides_are_log2_contiguous(strides):
+    """
+    Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2".
+    """
+    for i, stride in enumerate(strides[1:], 1):
+        assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(
+            stride, strides[i - 1]
+        )
+
+
+class LastLevelMaxPool(nn.Module):
+    """
+    This module is used in the original FPN to generate a downsampled
+    P6 feature from P5.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.num_levels = 1
+        self.in_feature = "p5"
+
+    def forward(self, x):
+        return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
+
+
+class LastLevelP6P7(nn.Module):
+    """
+    This module is used in RetinaNet to generate extra layers, P6 and P7 from
+    C5 feature.
+    """
+
+    def __init__(self, in_channels, out_channels, in_feature="res5"):
+        super().__init__()
+        self.num_levels = 2
+        self.in_feature = in_feature
+        self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
+        self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
+        for module in [self.p6, self.p7]:
+            weight_init.c2_xavier_fill(module)
+
+    def forward(self, c5):
+        p6 = self.p6(c5)
+        p7 = self.p7(F.relu(p6))
+        return [p6, p7]
+
+
+@BACKBONE_REGISTRY.register()
+def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
+    """
+    Args:
+        cfg: a detectron2 CfgNode
+
+    Returns:
+        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+    """
+    bottom_up = build_resnet_backbone(cfg, input_shape)
+    in_features = cfg.MODEL.FPN.IN_FEATURES
+    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
+    backbone = FPN(
+        bottom_up=bottom_up,
+        in_features=in_features,
+        out_channels=out_channels,
+        norm=cfg.MODEL.FPN.NORM,
+        top_block=LastLevelMaxPool(),
+        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
+    )
+    return backbone
+
+
+@BACKBONE_REGISTRY.register()
+def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
+    """
+    Args:
+        cfg: a detectron2 CfgNode
+
+    Returns:
+        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+    """
+    bottom_up = build_resnet_backbone(cfg, input_shape)
+    in_features = cfg.MODEL.FPN.IN_FEATURES
+    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
+    in_channels_p6p7 = bottom_up.output_shape()["res5"].channels
+    backbone = FPN(
+        bottom_up=bottom_up,
+        in_features=in_features,
+        out_channels=out_channels,
+        norm=cfg.MODEL.FPN.NORM,
+        top_block=LastLevelP6P7(in_channels_p6p7, out_channels),
+        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
+    )
+    return backbone
diff --git a/detectron2/modeling/backbone/mvit.py b/detectron2/modeling/backbone/mvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..50667a8a836b933666761cc09d4175e64098c8aa
--- /dev/null
+++ b/detectron2/modeling/backbone/mvit.py
@@ -0,0 +1,448 @@
+import logging
+import numpy as np
+import torch
+import torch.nn as nn
+
+from .backbone import Backbone
+from .utils import (
+    PatchEmbed,
+    add_decomposed_rel_pos,
+    get_abs_pos,
+    window_partition,
+    window_unpartition,
+)
+
+logger = logging.getLogger(__name__)
+
+
+__all__ = ["MViT"]
+
+
+def attention_pool(x, pool, norm=None):
+    # (B, H, W, C) -> (B, C, H, W)
+    x = x.permute(0, 3, 1, 2)
+    x = pool(x)
+    # (B, C, H1, W1) -> (B, H1, W1, C)
+    x = x.permute(0, 2, 3, 1)
+    if norm:
+        x = norm(x)
+
+    return x
+
+
+class MultiScaleAttention(nn.Module):
+    """Multiscale Multi-head Attention block."""
+
+    def __init__(
+        self,
+        dim,
+        dim_out,
+        num_heads,
+        qkv_bias=True,
+        norm_layer=nn.LayerNorm,
+        pool_kernel=(3, 3),
+        stride_q=1,
+        stride_kv=1,
+        residual_pooling=True,
+        window_size=0,
+        use_rel_pos=False,
+        rel_pos_zero_init=True,
+        input_size=None,
+    ):
+        """
+        Args:
+            dim (int): Number of input channels.
+            dim_out (int): Number of output channels.
+            num_heads (int): Number of attention heads.
+            qkv_bias (bool:  If True, add a learnable bias to query, key, value.
+            norm_layer (nn.Module): Normalization layer.
+            pool_kernel (tuple): kernel size for qkv pooling layers.
+            stride_q (int): stride size for q pooling layer.
+            stride_kv (int): stride size for kv pooling layer.
+            residual_pooling (bool): If true, enable residual pooling.
+            use_rel_pos (bool): If True, add relative postional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            input_size (int or None): Input resolution.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim_out // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim_out * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim_out, dim_out)
+
+        # qkv pooling
+        pool_padding = [k // 2 for k in pool_kernel]
+        dim_conv = dim_out // num_heads
+        self.pool_q = nn.Conv2d(
+            dim_conv,
+            dim_conv,
+            pool_kernel,
+            stride=stride_q,
+            padding=pool_padding,
+            groups=dim_conv,
+            bias=False,
+        )
+        self.norm_q = norm_layer(dim_conv)
+        self.pool_k = nn.Conv2d(
+            dim_conv,
+            dim_conv,
+            pool_kernel,
+            stride=stride_kv,
+            padding=pool_padding,
+            groups=dim_conv,
+            bias=False,
+        )
+        self.norm_k = norm_layer(dim_conv)
+        self.pool_v = nn.Conv2d(
+            dim_conv,
+            dim_conv,
+            pool_kernel,
+            stride=stride_kv,
+            padding=pool_padding,
+            groups=dim_conv,
+            bias=False,
+        )
+        self.norm_v = norm_layer(dim_conv)
+
+        self.window_size = window_size
+        if window_size:
+            self.q_win_size = window_size // stride_q
+            self.kv_win_size = window_size // stride_kv
+        self.residual_pooling = residual_pooling
+
+        self.use_rel_pos = use_rel_pos
+        if self.use_rel_pos:
+            # initialize relative positional embeddings
+            assert input_size[0] == input_size[1]
+            size = input_size[0]
+            rel_dim = 2 * max(size // stride_q, size // stride_kv) - 1
+            self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim))
+
+            if not rel_pos_zero_init:
+                nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
+                nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
+
+    def forward(self, x):
+        B, H, W, _ = x.shape
+        # qkv with shape (3, B, nHead, H, W, C)
+        qkv = self.qkv(x).reshape(B, H, W, 3, self.num_heads, -1).permute(3, 0, 4, 1, 2, 5)
+        # q, k, v with shape (B * nHead, H, W, C)
+        q, k, v = qkv.reshape(3, B * self.num_heads, H, W, -1).unbind(0)
+
+        q = attention_pool(q, self.pool_q, self.norm_q)
+        k = attention_pool(k, self.pool_k, self.norm_k)
+        v = attention_pool(v, self.pool_v, self.norm_v)
+
+        ori_q = q
+        if self.window_size:
+            q, q_hw_pad = window_partition(q, self.q_win_size)
+            k, kv_hw_pad = window_partition(k, self.kv_win_size)
+            v, _ = window_partition(v, self.kv_win_size)
+            q_hw = (self.q_win_size, self.q_win_size)
+            kv_hw = (self.kv_win_size, self.kv_win_size)
+        else:
+            q_hw = q.shape[1:3]
+            kv_hw = k.shape[1:3]
+
+        q = q.view(q.shape[0], np.prod(q_hw), -1)
+        k = k.view(k.shape[0], np.prod(kv_hw), -1)
+        v = v.view(v.shape[0], np.prod(kv_hw), -1)
+
+        attn = (q * self.scale) @ k.transpose(-2, -1)
+
+        if self.use_rel_pos:
+            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, q_hw, kv_hw)
+
+        attn = attn.softmax(dim=-1)
+        x = attn @ v
+
+        x = x.view(x.shape[0], q_hw[0], q_hw[1], -1)
+
+        if self.window_size:
+            x = window_unpartition(x, self.q_win_size, q_hw_pad, ori_q.shape[1:3])
+
+        if self.residual_pooling:
+            x += ori_q
+
+        H, W = x.shape[1], x.shape[2]
+        x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
+        x = self.proj(x)
+
+        return x
+
+
+class MultiScaleBlock(nn.Module):
+    """Multiscale Transformer blocks"""
+
+    def __init__(
+        self,
+        dim,
+        dim_out,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        drop_path=0.0,
+        norm_layer=nn.LayerNorm,
+        act_layer=nn.GELU,
+        qkv_pool_kernel=(3, 3),
+        stride_q=1,
+        stride_kv=1,
+        residual_pooling=True,
+        window_size=0,
+        use_rel_pos=False,
+        rel_pos_zero_init=True,
+        input_size=None,
+    ):
+        """
+        Args:
+            dim (int): Number of input channels.
+            dim_out (int): Number of output channels.
+            num_heads (int): Number of attention heads in the MViT block.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            drop_path (float): Stochastic depth rate.
+            norm_layer (nn.Module): Normalization layer.
+            act_layer (nn.Module): Activation layer.
+            qkv_pool_kernel (tuple): kernel size for qkv pooling layers.
+            stride_q (int): stride size for q pooling layer.
+            stride_kv (int): stride size for kv pooling layer.
+            residual_pooling (bool): If true, enable residual pooling.
+            window_size (int): Window size for window attention blocks. If it equals 0, then not
+                use window attention.
+            use_rel_pos (bool): If True, add relative postional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            input_size (int or None): Input resolution.
+        """
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = MultiScaleAttention(
+            dim,
+            dim_out,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            norm_layer=norm_layer,
+            pool_kernel=qkv_pool_kernel,
+            stride_q=stride_q,
+            stride_kv=stride_kv,
+            residual_pooling=residual_pooling,
+            window_size=window_size,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            input_size=input_size,
+        )
+
+        from timm.models.layers import DropPath, Mlp
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim_out)
+        self.mlp = Mlp(
+            in_features=dim_out,
+            hidden_features=int(dim_out * mlp_ratio),
+            out_features=dim_out,
+            act_layer=act_layer,
+        )
+
+        if dim != dim_out:
+            self.proj = nn.Linear(dim, dim_out)
+
+        if stride_q > 1:
+            kernel_skip = stride_q + 1
+            padding_skip = int(kernel_skip // 2)
+            self.pool_skip = nn.MaxPool2d(kernel_skip, stride_q, padding_skip, ceil_mode=False)
+
+    def forward(self, x):
+        x_norm = self.norm1(x)
+        x_block = self.attn(x_norm)
+
+        if hasattr(self, "proj"):
+            x = self.proj(x_norm)
+        if hasattr(self, "pool_skip"):
+            x = attention_pool(x, self.pool_skip)
+
+        x = x + self.drop_path(x_block)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        return x
+
+
+class MViT(Backbone):
+    """
+    This module implements Multiscale Vision Transformer (MViT) backbone in :paper:'mvitv2'.
+    """
+
+    def __init__(
+        self,
+        img_size=224,
+        patch_kernel=(7, 7),
+        patch_stride=(4, 4),
+        patch_padding=(3, 3),
+        in_chans=3,
+        embed_dim=96,
+        depth=16,
+        num_heads=1,
+        last_block_indexes=(0, 2, 11, 15),
+        qkv_pool_kernel=(3, 3),
+        adaptive_kv_stride=4,
+        adaptive_window_size=56,
+        residual_pooling=True,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        drop_path_rate=0.0,
+        norm_layer=nn.LayerNorm,
+        act_layer=nn.GELU,
+        use_abs_pos=False,
+        use_rel_pos=True,
+        rel_pos_zero_init=True,
+        use_act_checkpoint=False,
+        pretrain_img_size=224,
+        pretrain_use_cls_token=True,
+        out_features=("scale2", "scale3", "scale4", "scale5"),
+    ):
+        """
+        Args:
+            img_size (int): Input image size.
+            patch_kernel (tuple): kernel size for patch embedding.
+            patch_stride (tuple): stride size for patch embedding.
+            patch_padding (tuple): padding size for patch embedding.
+            in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+            depth (int): Depth of MViT.
+            num_heads (int): Number of base attention heads in each MViT block.
+            last_block_indexes (tuple): Block indexes for last blocks in each stage.
+            qkv_pool_kernel (tuple): kernel size for qkv pooling layers.
+            adaptive_kv_stride (int): adaptive stride size for kv pooling.
+            adaptive_window_size (int): adaptive window size for window attention blocks.
+            residual_pooling (bool): If true, enable residual pooling.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            drop_path_rate (float): Stochastic depth rate.
+            norm_layer (nn.Module): Normalization layer.
+            act_layer (nn.Module): Activation layer.
+            use_abs_pos (bool): If True, use absolute positional embeddings.
+            use_rel_pos (bool): If True, add relative postional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            window_size (int): Window size for window attention blocks.
+            use_act_checkpoint (bool): If True, use activation checkpointing.
+            pretrain_img_size (int): input image size for pretraining models.
+            pretrain_use_cls_token (bool): If True, pretrainig models use class token.
+            out_features (tuple): name of the feature maps from each stage.
+        """
+        super().__init__()
+        self.pretrain_use_cls_token = pretrain_use_cls_token
+
+        self.patch_embed = PatchEmbed(
+            kernel_size=patch_kernel,
+            stride=patch_stride,
+            padding=patch_padding,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+
+        if use_abs_pos:
+            # Initialize absoluate positional embedding with pretrain image size.
+            num_patches = (pretrain_img_size // patch_stride[0]) * (
+                pretrain_img_size // patch_stride[1]
+            )
+            num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
+            self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
+        else:
+            self.pos_embed = None
+
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
+        dim_out = embed_dim
+        stride_kv = adaptive_kv_stride
+        window_size = adaptive_window_size
+        input_size = (img_size // patch_stride[0], img_size // patch_stride[1])
+        stage = 2
+        stride = patch_stride[0]
+        self._out_feature_strides = {}
+        self._out_feature_channels = {}
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            # Multiply stride_kv by 2 if it's the last block of stage2 and stage3.
+            if i == last_block_indexes[1] or i == last_block_indexes[2]:
+                stride_kv_ = stride_kv * 2
+            else:
+                stride_kv_ = stride_kv
+            # hybrid window attention: global attention in last three stages.
+            window_size_ = 0 if i in last_block_indexes[1:] else window_size
+            block = MultiScaleBlock(
+                dim=embed_dim,
+                dim_out=dim_out,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                qkv_pool_kernel=qkv_pool_kernel,
+                stride_q=2 if i - 1 in last_block_indexes else 1,
+                stride_kv=stride_kv_,
+                residual_pooling=residual_pooling,
+                window_size=window_size_,
+                use_rel_pos=use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                input_size=input_size,
+            )
+            if use_act_checkpoint:
+                # TODO: use torch.utils.checkpoint
+                from fairscale.nn.checkpoint import checkpoint_wrapper
+
+                block = checkpoint_wrapper(block)
+            self.blocks.append(block)
+
+            embed_dim = dim_out
+            if i in last_block_indexes:
+                name = f"scale{stage}"
+                if name in out_features:
+                    self._out_feature_channels[name] = dim_out
+                    self._out_feature_strides[name] = stride
+                    self.add_module(f"{name}_norm", norm_layer(dim_out))
+
+                dim_out *= 2
+                num_heads *= 2
+                stride_kv = max(stride_kv // 2, 1)
+                stride *= 2
+                stage += 1
+            if i - 1 in last_block_indexes:
+                window_size = window_size // 2
+                input_size = [s // 2 for s in input_size]
+
+        self._out_features = out_features
+        self._last_block_indexes = last_block_indexes
+
+        if self.pos_embed is not None:
+            nn.init.trunc_normal_(self.pos_embed, std=0.02)
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def forward(self, x):
+        x = self.patch_embed(x)
+
+        if self.pos_embed is not None:
+            x = x + get_abs_pos(self.pos_embed, self.pretrain_use_cls_token, x.shape[1:3])
+
+        outputs = {}
+        stage = 2
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i in self._last_block_indexes:
+                name = f"scale{stage}"
+                if name in self._out_features:
+                    x_out = getattr(self, f"{name}_norm")(x)
+                    outputs[name] = x_out.permute(0, 3, 1, 2)
+                stage += 1
+
+        return outputs
diff --git a/detectron2/modeling/backbone/regnet.py b/detectron2/modeling/backbone/regnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..3533d63385d1324cfc1559eae9576b3fa52585af
--- /dev/null
+++ b/detectron2/modeling/backbone/regnet.py
@@ -0,0 +1,452 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Implementation of RegNet models from :paper:`dds` and :paper:`scaling`.
+
+This code is adapted from https://github.com/facebookresearch/pycls with minimal modifications.
+Some code duplication exists between RegNet and ResNets (e.g., ResStem) in order to simplify
+model loading.
+"""
+
+import numpy as np
+from torch import nn
+
+from detectron2.layers import CNNBlockBase, ShapeSpec, get_norm
+
+from .backbone import Backbone
+
+__all__ = [
+    "AnyNet",
+    "RegNet",
+    "ResStem",
+    "SimpleStem",
+    "VanillaBlock",
+    "ResBasicBlock",
+    "ResBottleneckBlock",
+]
+
+
+def conv2d(w_in, w_out, k, *, stride=1, groups=1, bias=False):
+    """Helper for building a conv2d layer."""
+    assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
+    s, p, g, b = stride, (k - 1) // 2, groups, bias
+    return nn.Conv2d(w_in, w_out, k, stride=s, padding=p, groups=g, bias=b)
+
+
+def gap2d():
+    """Helper for building a global average pooling layer."""
+    return nn.AdaptiveAvgPool2d((1, 1))
+
+
+def pool2d(k, *, stride=1):
+    """Helper for building a pool2d layer."""
+    assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
+    return nn.MaxPool2d(k, stride=stride, padding=(k - 1) // 2)
+
+
+def init_weights(m):
+    """Performs ResNet-style weight initialization."""
+    if isinstance(m, nn.Conv2d):
+        # Note that there is no bias due to BN
+        fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        m.weight.data.normal_(mean=0.0, std=np.sqrt(2.0 / fan_out))
+    elif isinstance(m, nn.BatchNorm2d):
+        m.weight.data.fill_(1.0)
+        m.bias.data.zero_()
+    elif isinstance(m, nn.Linear):
+        m.weight.data.normal_(mean=0.0, std=0.01)
+        m.bias.data.zero_()
+
+
+class ResStem(CNNBlockBase):
+    """ResNet stem for ImageNet: 7x7, BN, AF, MaxPool."""
+
+    def __init__(self, w_in, w_out, norm, activation_class):
+        super().__init__(w_in, w_out, 4)
+        self.conv = conv2d(w_in, w_out, 7, stride=2)
+        self.bn = get_norm(norm, w_out)
+        self.af = activation_class()
+        self.pool = pool2d(3, stride=2)
+
+    def forward(self, x):
+        for layer in self.children():
+            x = layer(x)
+        return x
+
+
+class SimpleStem(CNNBlockBase):
+    """Simple stem for ImageNet: 3x3, BN, AF."""
+
+    def __init__(self, w_in, w_out, norm, activation_class):
+        super().__init__(w_in, w_out, 2)
+        self.conv = conv2d(w_in, w_out, 3, stride=2)
+        self.bn = get_norm(norm, w_out)
+        self.af = activation_class()
+
+    def forward(self, x):
+        for layer in self.children():
+            x = layer(x)
+        return x
+
+
+class SE(nn.Module):
+    """Squeeze-and-Excitation (SE) block: AvgPool, FC, Act, FC, Sigmoid."""
+
+    def __init__(self, w_in, w_se, activation_class):
+        super().__init__()
+        self.avg_pool = gap2d()
+        self.f_ex = nn.Sequential(
+            conv2d(w_in, w_se, 1, bias=True),
+            activation_class(),
+            conv2d(w_se, w_in, 1, bias=True),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, x):
+        return x * self.f_ex(self.avg_pool(x))
+
+
+class VanillaBlock(CNNBlockBase):
+    """Vanilla block: [3x3 conv, BN, Relu] x2."""
+
+    def __init__(self, w_in, w_out, stride, norm, activation_class, _params):
+        super().__init__(w_in, w_out, stride)
+        self.a = conv2d(w_in, w_out, 3, stride=stride)
+        self.a_bn = get_norm(norm, w_out)
+        self.a_af = activation_class()
+        self.b = conv2d(w_out, w_out, 3)
+        self.b_bn = get_norm(norm, w_out)
+        self.b_af = activation_class()
+
+    def forward(self, x):
+        for layer in self.children():
+            x = layer(x)
+        return x
+
+
+class BasicTransform(nn.Module):
+    """Basic transformation: [3x3 conv, BN, Relu] x2."""
+
+    def __init__(self, w_in, w_out, stride, norm, activation_class, _params):
+        super().__init__()
+        self.a = conv2d(w_in, w_out, 3, stride=stride)
+        self.a_bn = get_norm(norm, w_out)
+        self.a_af = activation_class()
+        self.b = conv2d(w_out, w_out, 3)
+        self.b_bn = get_norm(norm, w_out)
+        self.b_bn.final_bn = True
+
+    def forward(self, x):
+        for layer in self.children():
+            x = layer(x)
+        return x
+
+
+class ResBasicBlock(CNNBlockBase):
+    """Residual basic block: x + f(x), f = basic transform."""
+
+    def __init__(self, w_in, w_out, stride, norm, activation_class, params):
+        super().__init__(w_in, w_out, stride)
+        self.proj, self.bn = None, None
+        if (w_in != w_out) or (stride != 1):
+            self.proj = conv2d(w_in, w_out, 1, stride=stride)
+            self.bn = get_norm(norm, w_out)
+        self.f = BasicTransform(w_in, w_out, stride, norm, activation_class, params)
+        self.af = activation_class()
+
+    def forward(self, x):
+        x_p = self.bn(self.proj(x)) if self.proj else x
+        return self.af(x_p + self.f(x))
+
+
+class BottleneckTransform(nn.Module):
+    """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1."""
+
+    def __init__(self, w_in, w_out, stride, norm, activation_class, params):
+        super().__init__()
+        w_b = int(round(w_out * params["bot_mul"]))
+        w_se = int(round(w_in * params["se_r"]))
+        groups = w_b // params["group_w"]
+        self.a = conv2d(w_in, w_b, 1)
+        self.a_bn = get_norm(norm, w_b)
+        self.a_af = activation_class()
+        self.b = conv2d(w_b, w_b, 3, stride=stride, groups=groups)
+        self.b_bn = get_norm(norm, w_b)
+        self.b_af = activation_class()
+        self.se = SE(w_b, w_se, activation_class) if w_se else None
+        self.c = conv2d(w_b, w_out, 1)
+        self.c_bn = get_norm(norm, w_out)
+        self.c_bn.final_bn = True
+
+    def forward(self, x):
+        for layer in self.children():
+            x = layer(x)
+        return x
+
+
+class ResBottleneckBlock(CNNBlockBase):
+    """Residual bottleneck block: x + f(x), f = bottleneck transform."""
+
+    def __init__(self, w_in, w_out, stride, norm, activation_class, params):
+        super().__init__(w_in, w_out, stride)
+        self.proj, self.bn = None, None
+        if (w_in != w_out) or (stride != 1):
+            self.proj = conv2d(w_in, w_out, 1, stride=stride)
+            self.bn = get_norm(norm, w_out)
+        self.f = BottleneckTransform(w_in, w_out, stride, norm, activation_class, params)
+        self.af = activation_class()
+
+    def forward(self, x):
+        x_p = self.bn(self.proj(x)) if self.proj else x
+        return self.af(x_p + self.f(x))
+
+
+class AnyStage(nn.Module):
+    """AnyNet stage (sequence of blocks w/ the same output shape)."""
+
+    def __init__(self, w_in, w_out, stride, d, block_class, norm, activation_class, params):
+        super().__init__()
+        for i in range(d):
+            block = block_class(w_in, w_out, stride, norm, activation_class, params)
+            self.add_module("b{}".format(i + 1), block)
+            stride, w_in = 1, w_out
+
+    def forward(self, x):
+        for block in self.children():
+            x = block(x)
+        return x
+
+
+class AnyNet(Backbone):
+    """AnyNet model. See :paper:`dds`."""
+
+    def __init__(
+        self,
+        *,
+        stem_class,
+        stem_width,
+        block_class,
+        depths,
+        widths,
+        group_widths,
+        strides,
+        bottleneck_ratios,
+        se_ratio,
+        activation_class,
+        freeze_at=0,
+        norm="BN",
+        out_features=None,
+    ):
+        """
+        Args:
+            stem_class (callable): A callable taking 4 arguments (channels in, channels out,
+                normalization, callable returning an activation function) that returns another
+                callable implementing the stem module.
+            stem_width (int): The number of output channels that the stem produces.
+            block_class (callable): A callable taking 6 arguments (channels in, channels out,
+                stride, normalization, callable returning an activation function, a dict of
+                block-specific parameters) that returns another callable implementing the repeated
+                block module.
+            depths (list[int]): Number of blocks in each stage.
+            widths (list[int]): For each stage, the number of output channels of each block.
+            group_widths (list[int]): For each stage, the number of channels per group in group
+                convolution, if the block uses group convolution.
+            strides (list[int]): The stride that each network stage applies to its input.
+            bottleneck_ratios (list[float]): For each stage, the ratio of the number of bottleneck
+                channels to the number of block input channels (or, equivalently, output channels),
+                if the block uses a bottleneck.
+            se_ratio (float): The ratio of the number of channels used inside the squeeze-excitation
+                (SE) module to it number of input channels, if SE the block uses SE.
+            activation_class (callable): A callable taking no arguments that returns another
+                callable implementing an activation function.
+            freeze_at (int): The number of stages at the beginning to freeze.
+                see :meth:`freeze` for detailed explanation.
+            norm (str or callable): normalization for all conv layers.
+                See :func:`layers.get_norm` for supported format.
+            out_features (list[str]): name of the layers whose outputs should
+                be returned in forward. RegNet's use "stem" and "s1", "s2", etc for the stages after
+                the stem. If None, will return the output of the last layer.
+        """
+        super().__init__()
+        self.stem = stem_class(3, stem_width, norm, activation_class)
+
+        current_stride = self.stem.stride
+        self._out_feature_strides = {"stem": current_stride}
+        self._out_feature_channels = {"stem": self.stem.out_channels}
+        self.stages_and_names = []
+        prev_w = stem_width
+
+        for i, (d, w, s, b, g) in enumerate(
+            zip(depths, widths, strides, bottleneck_ratios, group_widths)
+        ):
+            params = {"bot_mul": b, "group_w": g, "se_r": se_ratio}
+            stage = AnyStage(prev_w, w, s, d, block_class, norm, activation_class, params)
+            name = "s{}".format(i + 1)
+            self.add_module(name, stage)
+            self.stages_and_names.append((stage, name))
+            self._out_feature_strides[name] = current_stride = int(
+                current_stride * np.prod([k.stride for k in stage.children()])
+            )
+            self._out_feature_channels[name] = list(stage.children())[-1].out_channels
+            prev_w = w
+
+        self.apply(init_weights)
+
+        if out_features is None:
+            out_features = [name]
+        self._out_features = out_features
+        assert len(self._out_features)
+        children = [x[0] for x in self.named_children()]
+        for out_feature in self._out_features:
+            assert out_feature in children, "Available children: {} does not include {}".format(
+                ", ".join(children), out_feature
+            )
+        self.freeze(freeze_at)
+
+    def forward(self, x):
+        """
+        Args:
+            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
+
+        Returns:
+            dict[str->Tensor]: names and the corresponding features
+        """
+        assert x.dim() == 4, f"Model takes an input of shape (N, C, H, W). Got {x.shape} instead!"
+        outputs = {}
+        x = self.stem(x)
+        if "stem" in self._out_features:
+            outputs["stem"] = x
+        for stage, name in self.stages_and_names:
+            x = stage(x)
+            if name in self._out_features:
+                outputs[name] = x
+        return outputs
+
+    def output_shape(self):
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
+
+    def freeze(self, freeze_at=0):
+        """
+        Freeze the first several stages of the model. Commonly used in fine-tuning.
+
+        Layers that produce the same feature map spatial size are defined as one
+        "stage" by :paper:`FPN`.
+
+        Args:
+            freeze_at (int): number of stages to freeze.
+                `1` means freezing the stem. `2` means freezing the stem and
+                one residual stage, etc.
+
+        Returns:
+            nn.Module: this model itself
+        """
+        if freeze_at >= 1:
+            self.stem.freeze()
+        for idx, (stage, _) in enumerate(self.stages_and_names, start=2):
+            if freeze_at >= idx:
+                for block in stage.children():
+                    block.freeze()
+        return self
+
+
+def adjust_block_compatibility(ws, bs, gs):
+    """Adjusts the compatibility of widths, bottlenecks, and groups."""
+    assert len(ws) == len(bs) == len(gs)
+    assert all(w > 0 and b > 0 and g > 0 for w, b, g in zip(ws, bs, gs))
+    vs = [int(max(1, w * b)) for w, b in zip(ws, bs)]
+    gs = [int(min(g, v)) for g, v in zip(gs, vs)]
+    ms = [np.lcm(g, b) if b > 1 else g for g, b in zip(gs, bs)]
+    vs = [max(m, int(round(v / m) * m)) for v, m in zip(vs, ms)]
+    ws = [int(v / b) for v, b in zip(vs, bs)]
+    assert all(w * b % g == 0 for w, b, g in zip(ws, bs, gs))
+    return ws, bs, gs
+
+
+def generate_regnet_parameters(w_a, w_0, w_m, d, q=8):
+    """Generates per stage widths and depths from RegNet parameters."""
+    assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0
+    # Generate continuous per-block ws
+    ws_cont = np.arange(d) * w_a + w_0
+    # Generate quantized per-block ws
+    ks = np.round(np.log(ws_cont / w_0) / np.log(w_m))
+    ws_all = w_0 * np.power(w_m, ks)
+    ws_all = np.round(np.divide(ws_all, q)).astype(int) * q
+    # Generate per stage ws and ds (assumes ws_all are sorted)
+    ws, ds = np.unique(ws_all, return_counts=True)
+    # Compute number of actual stages and total possible stages
+    num_stages, total_stages = len(ws), ks.max() + 1
+    # Convert numpy arrays to lists and return
+    ws, ds, ws_all, ws_cont = (x.tolist() for x in (ws, ds, ws_all, ws_cont))
+    return ws, ds, num_stages, total_stages, ws_all, ws_cont
+
+
+class RegNet(AnyNet):
+    """RegNet model. See :paper:`dds`."""
+
+    def __init__(
+        self,
+        *,
+        stem_class,
+        stem_width,
+        block_class,
+        depth,
+        w_a,
+        w_0,
+        w_m,
+        group_width,
+        stride=2,
+        bottleneck_ratio=1.0,
+        se_ratio=0.0,
+        activation_class=None,
+        freeze_at=0,
+        norm="BN",
+        out_features=None,
+    ):
+        """
+        Build a RegNet from the parameterization described in :paper:`dds` Section 3.3.
+
+        Args:
+            See :class:`AnyNet` for arguments that are not listed here.
+            depth (int): Total number of blocks in the RegNet.
+            w_a (float): Factor by which block width would increase prior to quantizing block widths
+                by stage. See :paper:`dds` Section 3.3.
+            w_0 (int): Initial block width. See :paper:`dds` Section 3.3.
+            w_m (float): Parameter controlling block width quantization.
+                See :paper:`dds` Section 3.3.
+            group_width (int): Number of channels per group in group convolution, if the block uses
+                group convolution.
+            bottleneck_ratio (float): The ratio of the number of bottleneck channels to the number
+                of block input channels (or, equivalently, output channels), if the block uses a
+                bottleneck.
+            stride (int): The stride that each network stage applies to its input.
+        """
+        ws, ds = generate_regnet_parameters(w_a, w_0, w_m, depth)[0:2]
+        ss = [stride for _ in ws]
+        bs = [bottleneck_ratio for _ in ws]
+        gs = [group_width for _ in ws]
+        ws, bs, gs = adjust_block_compatibility(ws, bs, gs)
+
+        def default_activation_class():
+            return nn.ReLU(inplace=True)
+
+        super().__init__(
+            stem_class=stem_class,
+            stem_width=stem_width,
+            block_class=block_class,
+            depths=ds,
+            widths=ws,
+            strides=ss,
+            group_widths=gs,
+            bottleneck_ratios=bs,
+            se_ratio=se_ratio,
+            activation_class=default_activation_class
+            if activation_class is None
+            else activation_class,
+            freeze_at=freeze_at,
+            norm=norm,
+            out_features=out_features,
+        )
diff --git a/detectron2/modeling/backbone/resnet.py b/detectron2/modeling/backbone/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b8e842c585a81b5345ade4ca1da62a4904a122a
--- /dev/null
+++ b/detectron2/modeling/backbone/resnet.py
@@ -0,0 +1,694 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.layers import (
+    CNNBlockBase,
+    Conv2d,
+    DeformConv,
+    ModulatedDeformConv,
+    ShapeSpec,
+    get_norm,
+)
+
+from .backbone import Backbone
+from .build import BACKBONE_REGISTRY
+
+__all__ = [
+    "ResNetBlockBase",
+    "BasicBlock",
+    "BottleneckBlock",
+    "DeformBottleneckBlock",
+    "BasicStem",
+    "ResNet",
+    "make_stage",
+    "build_resnet_backbone",
+]
+
+
+class BasicBlock(CNNBlockBase):
+    """
+    The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
+    with two 3x3 conv layers and a projection shortcut if needed.
+    """
+
+    def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
+        """
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            stride (int): Stride for the first conv.
+            norm (str or callable): normalization for all conv layers.
+                See :func:`layers.get_norm` for supported format.
+        """
+        super().__init__(in_channels, out_channels, stride)
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        self.conv1 = Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        self.conv2 = Conv2d(
+            out_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+        out = self.conv2(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+class BottleneckBlock(CNNBlockBase):
+    """
+    The standard bottleneck residual block used by ResNet-50, 101 and 152
+    defined in :paper:`ResNet`.  It contains 3 conv layers with kernels
+    1x1, 3x3, 1x1, and a projection shortcut if needed.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        *,
+        bottleneck_channels,
+        stride=1,
+        num_groups=1,
+        norm="BN",
+        stride_in_1x1=False,
+        dilation=1,
+    ):
+        """
+        Args:
+            bottleneck_channels (int): number of output channels for the 3x3
+                "bottleneck" conv layers.
+            num_groups (int): number of groups for the 3x3 conv layer.
+            norm (str or callable): normalization for all conv layers.
+                See :func:`layers.get_norm` for supported format.
+            stride_in_1x1 (bool): when stride>1, whether to put stride in the
+                first 1x1 convolution or the bottleneck 3x3 convolution.
+            dilation (int): the dilation rate of the 3x3 conv layer.
+        """
+        super().__init__(in_channels, out_channels, stride)
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        # The original MSRA ResNet models have stride in the first 1x1 conv
+        # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
+        # stride in the 3x3 conv
+        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
+
+        self.conv1 = Conv2d(
+            in_channels,
+            bottleneck_channels,
+            kernel_size=1,
+            stride=stride_1x1,
+            bias=False,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv2 = Conv2d(
+            bottleneck_channels,
+            bottleneck_channels,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            bias=False,
+            groups=num_groups,
+            dilation=dilation,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv3 = Conv2d(
+            bottleneck_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+        # Zero-initialize the last normalization in each residual branch,
+        # so that at the beginning, the residual branch starts with zeros,
+        # and each residual block behaves like an identity.
+        # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
+        # "For BN layers, the learnable scaling coefficient γ is initialized
+        # to be 1, except for each residual block's last BN
+        # where γ is initialized to be 0."
+
+        # nn.init.constant_(self.conv3.norm.weight, 0)
+        # TODO this somehow hurts performance when training GN models from scratch.
+        # Add it as an option when we need to use this code to train a backbone.
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+
+        out = self.conv2(out)
+        out = F.relu_(out)
+
+        out = self.conv3(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+class DeformBottleneckBlock(CNNBlockBase):
+    """
+    Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv <deformconv>`
+    in the 3x3 convolution.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        *,
+        bottleneck_channels,
+        stride=1,
+        num_groups=1,
+        norm="BN",
+        stride_in_1x1=False,
+        dilation=1,
+        deform_modulated=False,
+        deform_num_groups=1,
+    ):
+        super().__init__(in_channels, out_channels, stride)
+        self.deform_modulated = deform_modulated
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
+
+        self.conv1 = Conv2d(
+            in_channels,
+            bottleneck_channels,
+            kernel_size=1,
+            stride=stride_1x1,
+            bias=False,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        if deform_modulated:
+            deform_conv_op = ModulatedDeformConv
+            # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
+            offset_channels = 27
+        else:
+            deform_conv_op = DeformConv
+            offset_channels = 18
+
+        self.conv2_offset = Conv2d(
+            bottleneck_channels,
+            offset_channels * deform_num_groups,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            dilation=dilation,
+        )
+        self.conv2 = deform_conv_op(
+            bottleneck_channels,
+            bottleneck_channels,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            bias=False,
+            groups=num_groups,
+            dilation=dilation,
+            deformable_groups=deform_num_groups,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv3 = Conv2d(
+            bottleneck_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+        nn.init.constant_(self.conv2_offset.weight, 0)
+        nn.init.constant_(self.conv2_offset.bias, 0)
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+
+        if self.deform_modulated:
+            offset_mask = self.conv2_offset(out)
+            offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
+            offset = torch.cat((offset_x, offset_y), dim=1)
+            mask = mask.sigmoid()
+            out = self.conv2(out, offset, mask)
+        else:
+            offset = self.conv2_offset(out)
+            out = self.conv2(out, offset)
+        out = F.relu_(out)
+
+        out = self.conv3(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+class BasicStem(CNNBlockBase):
+    """
+    The standard ResNet stem (layers before the first residual block),
+    with a conv, relu and max_pool.
+    """
+
+    def __init__(self, in_channels=3, out_channels=64, norm="BN"):
+        """
+        Args:
+            norm (str or callable): norm after the first conv layer.
+                See :func:`layers.get_norm` for supported format.
+        """
+        super().__init__(in_channels, out_channels, 4)
+        self.in_channels = in_channels
+        self.conv1 = Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+        weight_init.c2_msra_fill(self.conv1)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu_(x)
+        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
+        return x
+
+
+class ResNet(Backbone):
+    """
+    Implement :paper:`ResNet`.
+    """
+
+    def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0):
+        """
+        Args:
+            stem (nn.Module): a stem module
+            stages (list[list[CNNBlockBase]]): several (typically 4) stages,
+                each contains multiple :class:`CNNBlockBase`.
+            num_classes (None or int): if None, will not perform classification.
+                Otherwise, will create a linear layer.
+            out_features (list[str]): name of the layers whose outputs should
+                be returned in forward. Can be anything in "stem", "linear", or "res2" ...
+                If None, will return the output of the last layer.
+            freeze_at (int): The number of stages at the beginning to freeze.
+                see :meth:`freeze` for detailed explanation.
+        """
+        super().__init__()
+        self.stem = stem
+        self.num_classes = num_classes
+
+        current_stride = self.stem.stride
+        self._out_feature_strides = {"stem": current_stride}
+        self._out_feature_channels = {"stem": self.stem.out_channels}
+
+        self.stage_names, self.stages = [], []
+
+        if out_features is not None:
+            # Avoid keeping unused layers in this module. They consume extra memory
+            # and may cause allreduce to fail
+            num_stages = max(
+                [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features]
+            )
+            stages = stages[:num_stages]
+        for i, blocks in enumerate(stages):
+            assert len(blocks) > 0, len(blocks)
+            for block in blocks:
+                assert isinstance(block, CNNBlockBase), block
+
+            name = "res" + str(i + 2)
+            stage = nn.Sequential(*blocks)
+
+            self.add_module(name, stage)
+            self.stage_names.append(name)
+            self.stages.append(stage)
+
+            self._out_feature_strides[name] = current_stride = int(
+                current_stride * np.prod([k.stride for k in blocks])
+            )
+            self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
+        self.stage_names = tuple(self.stage_names)  # Make it static for scripting
+
+        if num_classes is not None:
+            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+            self.linear = nn.Linear(curr_channels, num_classes)
+
+            # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
+            # "The 1000-way fully-connected layer is initialized by
+            # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
+            nn.init.normal_(self.linear.weight, std=0.01)
+            name = "linear"
+
+        if out_features is None:
+            out_features = [name]
+        self._out_features = out_features
+        assert len(self._out_features)
+        children = [x[0] for x in self.named_children()]
+        for out_feature in self._out_features:
+            assert out_feature in children, "Available children: {}".format(", ".join(children))
+        self.freeze(freeze_at)
+
+    def forward(self, x):
+        """
+        Args:
+            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
+
+        Returns:
+            dict[str->Tensor]: names and the corresponding features
+        """
+        assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!"
+        outputs = {}
+        x = self.stem(x)
+        if "stem" in self._out_features:
+            outputs["stem"] = x
+        for name, stage in zip(self.stage_names, self.stages):
+            x = stage(x)
+            if name in self._out_features:
+                outputs[name] = x
+        if self.num_classes is not None:
+            x = self.avgpool(x)
+            x = torch.flatten(x, 1)
+            x = self.linear(x)
+            if "linear" in self._out_features:
+                outputs["linear"] = x
+        return outputs
+
+    def output_shape(self):
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
+
+    def freeze(self, freeze_at=0):
+        """
+        Freeze the first several stages of the ResNet. Commonly used in
+        fine-tuning.
+
+        Layers that produce the same feature map spatial size are defined as one
+        "stage" by :paper:`FPN`.
+
+        Args:
+            freeze_at (int): number of stages to freeze.
+                `1` means freezing the stem. `2` means freezing the stem and
+                one residual stage, etc.
+
+        Returns:
+            nn.Module: this ResNet itself
+        """
+        if freeze_at >= 1:
+            self.stem.freeze()
+        for idx, stage in enumerate(self.stages, start=2):
+            if freeze_at >= idx:
+                for block in stage.children():
+                    block.freeze()
+        return self
+
+    @staticmethod
+    def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
+        """
+        Create a list of blocks of the same type that forms one ResNet stage.
+
+        Args:
+            block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
+                stage. A module of this type must not change spatial resolution of inputs unless its
+                stride != 1.
+            num_blocks (int): number of blocks in this stage
+            in_channels (int): input channels of the entire stage.
+            out_channels (int): output channels of **every block** in the stage.
+            kwargs: other arguments passed to the constructor of
+                `block_class`. If the argument name is "xx_per_block", the
+                argument is a list of values to be passed to each block in the
+                stage. Otherwise, the same argument is passed to every block
+                in the stage.
+
+        Returns:
+            list[CNNBlockBase]: a list of block module.
+
+        Examples:
+        ::
+            stage = ResNet.make_stage(
+                BottleneckBlock, 3, in_channels=16, out_channels=64,
+                bottleneck_channels=16, num_groups=1,
+                stride_per_block=[2, 1, 1],
+                dilations_per_block=[1, 1, 2]
+            )
+
+        Usually, layers that produce the same feature map spatial size are defined as one
+        "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should
+        all be 1.
+        """
+        blocks = []
+        for i in range(num_blocks):
+            curr_kwargs = {}
+            for k, v in kwargs.items():
+                if k.endswith("_per_block"):
+                    assert len(v) == num_blocks, (
+                        f"Argument '{k}' of make_stage should have the "
+                        f"same length as num_blocks={num_blocks}."
+                    )
+                    newk = k[: -len("_per_block")]
+                    assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
+                    curr_kwargs[newk] = v[i]
+                else:
+                    curr_kwargs[k] = v
+
+            blocks.append(
+                block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)
+            )
+            in_channels = out_channels
+        return blocks
+
+    @staticmethod
+    def make_default_stages(depth, block_class=None, **kwargs):
+        """
+        Created list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152).
+        If it doesn't create the ResNet variant you need, please use :meth:`make_stage`
+        instead for fine-grained customization.
+
+        Args:
+            depth (int): depth of ResNet
+            block_class (type): the CNN block class. Has to accept
+                `bottleneck_channels` argument for depth > 50.
+                By default it is BasicBlock or BottleneckBlock, based on the
+                depth.
+            kwargs:
+                other arguments to pass to `make_stage`. Should not contain
+                stride and channels, as they are predefined for each depth.
+
+        Returns:
+            list[list[CNNBlockBase]]: modules in all stages; see arguments of
+                :class:`ResNet.__init__`.
+        """
+        num_blocks_per_stage = {
+            18: [2, 2, 2, 2],
+            34: [3, 4, 6, 3],
+            50: [3, 4, 6, 3],
+            101: [3, 4, 23, 3],
+            152: [3, 8, 36, 3],
+        }[depth]
+        if block_class is None:
+            block_class = BasicBlock if depth < 50 else BottleneckBlock
+        if depth < 50:
+            in_channels = [64, 64, 128, 256]
+            out_channels = [64, 128, 256, 512]
+        else:
+            in_channels = [64, 256, 512, 1024]
+            out_channels = [256, 512, 1024, 2048]
+        ret = []
+        for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels):
+            if depth >= 50:
+                kwargs["bottleneck_channels"] = o // 4
+            ret.append(
+                ResNet.make_stage(
+                    block_class=block_class,
+                    num_blocks=n,
+                    stride_per_block=[s] + [1] * (n - 1),
+                    in_channels=i,
+                    out_channels=o,
+                    **kwargs,
+                )
+            )
+        return ret
+
+
+ResNetBlockBase = CNNBlockBase
+"""
+Alias for backward compatibiltiy.
+"""
+
+
+def make_stage(*args, **kwargs):
+    """
+    Deprecated alias for backward compatibiltiy.
+    """
+    return ResNet.make_stage(*args, **kwargs)
+
+
+@BACKBONE_REGISTRY.register()
+def build_resnet_backbone(cfg, input_shape):
+    """
+    Create a ResNet instance from config.
+
+    Returns:
+        ResNet: a :class:`ResNet` instance.
+    """
+    # need registration of new blocks/stems?
+    norm = cfg.MODEL.RESNETS.NORM
+    stem = BasicStem(
+        in_channels=input_shape.channels,
+        out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
+        norm=norm,
+    )
+
+    # fmt: off
+    freeze_at           = cfg.MODEL.BACKBONE.FREEZE_AT
+    out_features        = cfg.MODEL.RESNETS.OUT_FEATURES
+    depth               = cfg.MODEL.RESNETS.DEPTH
+    num_groups          = cfg.MODEL.RESNETS.NUM_GROUPS
+    width_per_group     = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
+    bottleneck_channels = num_groups * width_per_group
+    in_channels         = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
+    out_channels        = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
+    stride_in_1x1       = cfg.MODEL.RESNETS.STRIDE_IN_1X1
+    res5_dilation       = cfg.MODEL.RESNETS.RES5_DILATION
+    deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
+    deform_modulated    = cfg.MODEL.RESNETS.DEFORM_MODULATED
+    deform_num_groups   = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
+    # fmt: on
+    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
+
+    num_blocks_per_stage = {
+        18: [2, 2, 2, 2],
+        34: [3, 4, 6, 3],
+        50: [3, 4, 6, 3],
+        101: [3, 4, 23, 3],
+        152: [3, 8, 36, 3],
+    }[depth]
+
+    if depth in [18, 34]:
+        assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
+        assert not any(
+            deform_on_per_stage
+        ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
+        assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
+        assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"
+
+    stages = []
+
+    for idx, stage_idx in enumerate(range(2, 6)):
+        # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper
+        dilation = res5_dilation if stage_idx == 5 else 1
+        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
+        stage_kargs = {
+            "num_blocks": num_blocks_per_stage[idx],
+            "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1),
+            "in_channels": in_channels,
+            "out_channels": out_channels,
+            "norm": norm,
+        }
+        # Use BasicBlock for R18 and R34.
+        if depth in [18, 34]:
+            stage_kargs["block_class"] = BasicBlock
+        else:
+            stage_kargs["bottleneck_channels"] = bottleneck_channels
+            stage_kargs["stride_in_1x1"] = stride_in_1x1
+            stage_kargs["dilation"] = dilation
+            stage_kargs["num_groups"] = num_groups
+            if deform_on_per_stage[idx]:
+                stage_kargs["block_class"] = DeformBottleneckBlock
+                stage_kargs["deform_modulated"] = deform_modulated
+                stage_kargs["deform_num_groups"] = deform_num_groups
+            else:
+                stage_kargs["block_class"] = BottleneckBlock
+        blocks = ResNet.make_stage(**stage_kargs)
+        in_channels = out_channels
+        out_channels *= 2
+        bottleneck_channels *= 2
+        stages.append(blocks)
+    return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at)
diff --git a/detectron2/modeling/backbone/swin.py b/detectron2/modeling/backbone/swin.py
new file mode 100644
index 0000000000000000000000000000000000000000..780b6fc6eaab1d9a3f513b8a09cb4dc95166e5a3
--- /dev/null
+++ b/detectron2/modeling/backbone/swin.py
@@ -0,0 +1,695 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Implementation of Swin models from :paper:`swin`.
+
+This code is adapted from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py with minimal modifications.  # noqa
+--------------------------------------------------------
+Swin Transformer
+Copyright (c) 2021 Microsoft
+Licensed under The MIT License [see LICENSE for details]
+Written by Ze Liu, Yutong Lin, Yixuan Wei
+--------------------------------------------------------
+LICENSE: https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/461e003166a8083d0b620beacd4662a2df306bd6/LICENSE
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+
+from detectron2.modeling.backbone.backbone import Backbone
+
+_to_2tuple = nn.modules.utils._ntuple(2)
+
+
+class Mlp(nn.Module):
+    """Multilayer perceptron."""
+
+    def __init__(
+        self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def window_partition(x, window_size):
+    """
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    return windows
+
+
+def window_reverse(windows, window_size, H, W):
+    """
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    B = int(windows.shape[0] / (H * W / window_size / window_size))
+    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+
+
+class WindowAttention(nn.Module):
+    """Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value.
+            Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(
+        self,
+        dim,
+        window_size,
+        num_heads,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+    ):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
+        )  # 2*Wh-1 * 2*Ww-1, nH
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x, mask=None):
+        """Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        """
+        B_, N, C = x.shape
+        qkv = (
+            self.qkv(x)
+            .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.view(-1)
+        ].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
+        )  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(
+            2, 0, 1
+        ).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Module):
+    """Swin Transformer Block.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        window_size=7,
+        shift_size=0,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=_to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+
+        if drop_path > 0.0:
+            from timm.models.layers import DropPath
+
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop
+        )
+
+        self.H = None
+        self.W = None
+
+    def forward(self, x, mask_matrix):
+        """Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+            mask_matrix: Attention mask for cyclic shift.
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+        _, Hp, Wp, _ = x.shape
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size
+        )  # nW*B, window_size, window_size, C
+        x_windows = x_windows.view(
+            -1, self.window_size * self.window_size, C
+        )  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            x = shifted_x
+
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()
+
+        x = x.view(B, H * W, C)
+
+        # FFN
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        return x
+
+
+class PatchMerging(nn.Module):
+    """Patch Merging Layer
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+
+    def forward(self, x, H, W):
+        """Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+
+        x = x.view(B, H, W, C)
+
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
+
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+
+        x = self.norm(x)
+        x = self.reduction(x)
+
+        return x
+
+
+class BasicLayer(nn.Module):
+    """A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (int): Local window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer.
+            Default: None
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+
+    def __init__(
+        self,
+        dim,
+        depth,
+        num_heads,
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        norm_layer=nn.LayerNorm,
+        downsample=None,
+        use_checkpoint=False,
+    ):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+
+        # build blocks
+        self.blocks = nn.ModuleList(
+            [
+                SwinTransformerBlock(
+                    dim=dim,
+                    num_heads=num_heads,
+                    window_size=window_size,
+                    shift_size=0 if (i % 2 == 0) else window_size // 2,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop,
+                    attn_drop=attn_drop,
+                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                    norm_layer=norm_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
+        h_slices = (
+            slice(0, -self.window_size),
+            slice(-self.window_size, -self.shift_size),
+            slice(-self.shift_size, None),
+        )
+        w_slices = (
+            slice(0, -self.window_size),
+            slice(-self.window_size, -self.shift_size),
+            slice(-self.shift_size, None),
+        )
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+
+        mask_windows = window_partition(
+            img_mask, self.window_size
+        )  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
+            attn_mask == 0, float(0.0)
+        )
+
+        for blk in self.blocks:
+            blk.H, blk.W = H, W
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x, attn_mask)
+            else:
+                x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Module):
+    """Image to Patch Embedding
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        patch_size = _to_2tuple(patch_size)
+        self.patch_size = patch_size
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        """Forward function."""
+        # padding
+        _, _, H, W = x.size()
+        if W % self.patch_size[1] != 0:
+            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
+        if H % self.patch_size[0] != 0:
+            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
+
+        x = self.proj(x)  # B C Wh Ww
+        if self.norm is not None:
+            Wh, Ww = x.size(2), x.size(3)
+            x = x.flatten(2).transpose(1, 2)
+            x = self.norm(x)
+            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
+
+        return x
+
+
+class SwinTransformer(Backbone):
+    """Swin Transformer backbone.
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted
+            Windows`  - https://arxiv.org/pdf/2103.14030
+    Args:
+        pretrain_img_size (int): Input image size for training the pretrained model,
+            used in absolute postion embedding. Default 224.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+
+    def __init__(
+        self,
+        pretrain_img_size=224,
+        patch_size=4,
+        in_chans=3,
+        embed_dim=96,
+        depths=(2, 2, 6, 2),
+        num_heads=(3, 6, 12, 24),
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.2,
+        norm_layer=nn.LayerNorm,
+        ape=False,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        use_checkpoint=False,
+    ):
+        super().__init__()
+
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None,
+        )
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = _to_2tuple(pretrain_img_size)
+            patch_size = _to_2tuple(patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1],
+            ]
+
+            self.absolute_pos_embed = nn.Parameter(
+                torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
+            )
+            nn.init.trunc_normal_(self.absolute_pos_embed, std=0.02)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
+                use_checkpoint=use_checkpoint,
+            )
+            self.layers.append(layer)
+
+        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f"norm{i_layer}"
+            self.add_module(layer_name, layer)
+
+        self._freeze_stages()
+        self._out_features = ["p{}".format(i) for i in self.out_indices]
+        self._out_feature_channels = {
+            "p{}".format(i): self.embed_dim * 2**i for i in self.out_indices
+        }
+        self._out_feature_strides = {"p{}".format(i): 2 ** (i + 2) for i in self.out_indices}
+        self._size_devisibility = 32
+
+        self.apply(self._init_weights)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.requires_grad = False
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @property
+    def size_divisibility(self):
+        return self._size_divisibility
+
+    def forward(self, x):
+        """Forward function."""
+        x = self.patch_embed(x)
+
+        Wh, Ww = x.size(2), x.size(3)
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
+            )
+            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose(1, 2)
+        x = self.pos_drop(x)
+
+        outs = {}
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+
+            if i in self.out_indices:
+                norm_layer = getattr(self, f"norm{i}")
+                x_out = norm_layer(x_out)
+
+                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
+                outs["p{}".format(i)] = out
+
+        return outs
diff --git a/detectron2/modeling/backbone/utils.py b/detectron2/modeling/backbone/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b89a4c3fbe079a77fd0cef947cf9ada787fc55d
--- /dev/null
+++ b/detectron2/modeling/backbone/utils.py
@@ -0,0 +1,186 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = [
+    "window_partition",
+    "window_unpartition",
+    "add_decomposed_rel_pos",
+    "get_abs_pos",
+    "PatchEmbed",
+]
+
+
+def window_partition(x, window_size):
+    """
+    Partition into non-overlapping windows with padding if needed.
+    Args:
+        x (tensor): input tokens with [B, H, W, C].
+        window_size (int): window size.
+
+    Returns:
+        windows: windows after partition with [B * num_windows, window_size, window_size, C].
+        (Hp, Wp): padded height and width before partition
+    """
+    B, H, W, C = x.shape
+
+    pad_h = (window_size - H % window_size) % window_size
+    pad_w = (window_size - W % window_size) % window_size
+    if pad_h > 0 or pad_w > 0:
+        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+    Hp, Wp = H + pad_h, W + pad_w
+
+    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    return windows, (Hp, Wp)
+
+
+def window_unpartition(windows, window_size, pad_hw, hw):
+    """
+    Window unpartition into original sequences and removing padding.
+    Args:
+        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+        window_size (int): window size.
+        pad_hw (Tuple): padded height and width (Hp, Wp).
+        hw (Tuple): original height and width (H, W) before padding.
+
+    Returns:
+        x: unpartitioned sequences with [B, H, W, C].
+    """
+    Hp, Wp = pad_hw
+    H, W = hw
+    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
+
+    if Hp > H or Wp > W:
+        x = x[:, :H, :W, :].contiguous()
+    return x
+
+
+def get_rel_pos(q_size, k_size, rel_pos):
+    """
+    Get relative positional embeddings according to the relative positions of
+        query and key sizes.
+    Args:
+        q_size (int): size of query q.
+        k_size (int): size of key k.
+        rel_pos (Tensor): relative position embeddings (L, C).
+
+    Returns:
+        Extracted positional embeddings according to relative positions.
+    """
+    max_rel_dist = int(2 * max(q_size, k_size) - 1)
+    # Interpolate rel pos if needed.
+    if rel_pos.shape[0] != max_rel_dist:
+        # Interpolate rel pos.
+        rel_pos_resized = F.interpolate(
+            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
+            size=max_rel_dist,
+            mode="linear",
+        )
+        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
+    else:
+        rel_pos_resized = rel_pos
+
+    # Scale the coords with short length if shapes for q and k are different.
+    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
+    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
+    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
+
+    return rel_pos_resized[relative_coords.long()]
+
+
+def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
+    """
+    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950
+    Args:
+        attn (Tensor): attention map.
+        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
+        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
+        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
+        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
+
+    Returns:
+        attn (Tensor): attention map with added relative positional embeddings.
+    """
+    q_h, q_w = q_size
+    k_h, k_w = k_size
+    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
+    Rw = get_rel_pos(q_w, k_w, rel_pos_w)
+
+    B, _, dim = q.shape
+    r_q = q.reshape(B, q_h, q_w, dim)
+    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
+    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
+
+    attn = (
+        attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
+    ).view(B, q_h * q_w, k_h * k_w)
+
+    return attn
+
+
+def get_abs_pos(abs_pos, has_cls_token, hw):
+    """
+    Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
+        dimension for the original embeddings.
+    Args:
+        abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
+        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
+        hw (Tuple): size of input image tokens.
+
+    Returns:
+        Absolute positional embeddings after processing with shape (1, H, W, C)
+    """
+    h, w = hw
+    if has_cls_token:
+        abs_pos = abs_pos[:, 1:]
+    xy_num = abs_pos.shape[1]
+    size = int(math.sqrt(xy_num))
+    assert size * size == xy_num
+
+    if size != h or size != w:
+        new_abs_pos = F.interpolate(
+            abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
+            size=(h, w),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        return new_abs_pos.permute(0, 2, 3, 1)
+    else:
+        return abs_pos.reshape(1, h, w, -1)
+
+
+class PatchEmbed(nn.Module):
+    """
+    Image to Patch Embedding.
+    """
+
+    def __init__(
+        self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
+    ):
+        """
+        Args:
+            kernel_size (Tuple): kernel size of the projection layer.
+            stride (Tuple): stride of the projection layer.
+            padding (Tuple): padding size of the projection layer.
+            in_chans (int): Number of input image channels.
+            embed_dim (int):  embed_dim (int): Patch embedding dimension.
+        """
+        super().__init__()
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
+        )
+
+    def forward(self, x):
+        x = self.proj(x)
+        # B C H W -> B H W C
+        x = x.permute(0, 2, 3, 1)
+        return x
diff --git a/detectron2/modeling/backbone/vit.py b/detectron2/modeling/backbone/vit.py
new file mode 100644
index 0000000000000000000000000000000000000000..31cc28ac887773dbc8aea2a663bacd5f7b63bb0c
--- /dev/null
+++ b/detectron2/modeling/backbone/vit.py
@@ -0,0 +1,524 @@
+import logging
+import math
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn as nn
+
+from detectron2.layers import CNNBlockBase, Conv2d, get_norm
+from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous
+
+from .backbone import Backbone
+from .utils import (
+    PatchEmbed,
+    add_decomposed_rel_pos,
+    get_abs_pos,
+    window_partition,
+    window_unpartition,
+)
+
+logger = logging.getLogger(__name__)
+
+
+__all__ = ["ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"]
+
+
+class Attention(nn.Module):
+    """Multi-head Attention block with relative position embeddings."""
+
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=True,
+        use_rel_pos=False,
+        rel_pos_zero_init=True,
+        input_size=None,
+    ):
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads.
+            qkv_bias (bool:  If True, add a learnable bias to query, key, value.
+            rel_pos (bool): If True, add relative positional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            input_size (int or None): Input resolution for calculating the relative positional
+                parameter size.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim)
+
+        self.use_rel_pos = use_rel_pos
+        if self.use_rel_pos:
+            # initialize relative positional embeddings
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+
+            if not rel_pos_zero_init:
+                nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
+                nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
+
+    def forward(self, x):
+        B, H, W, _ = x.shape
+        # qkv with shape (3, B, nHead, H * W, C)
+        qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        # q, k, v with shape (B * nHead, H * W, C)
+        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
+
+        attn = (q * self.scale) @ k.transpose(-2, -1)
+
+        if self.use_rel_pos:
+            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
+
+        attn = attn.softmax(dim=-1)
+        x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
+        x = self.proj(x)
+
+        return x
+
+
+class ResBottleneckBlock(CNNBlockBase):
+    """
+    The standard bottleneck residual block without the last activation layer.
+    It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        bottleneck_channels,
+        norm="LN",
+        act_layer=nn.GELU,
+    ):
+        """
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            bottleneck_channels (int): number of output channels for the 3x3
+                "bottleneck" conv layers.
+            norm (str or callable): normalization for all conv layers.
+                See :func:`layers.get_norm` for supported format.
+            act_layer (callable): activation for all conv layers.
+        """
+        super().__init__(in_channels, out_channels, 1)
+
+        self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
+        self.norm1 = get_norm(norm, bottleneck_channels)
+        self.act1 = act_layer()
+
+        self.conv2 = Conv2d(
+            bottleneck_channels,
+            bottleneck_channels,
+            3,
+            padding=1,
+            bias=False,
+        )
+        self.norm2 = get_norm(norm, bottleneck_channels)
+        self.act2 = act_layer()
+
+        self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
+        self.norm3 = get_norm(norm, out_channels)
+
+        for layer in [self.conv1, self.conv2, self.conv3]:
+            weight_init.c2_msra_fill(layer)
+        for layer in [self.norm1, self.norm2]:
+            layer.weight.data.fill_(1.0)
+            layer.bias.data.zero_()
+        # zero init last norm layer.
+        self.norm3.weight.data.zero_()
+        self.norm3.bias.data.zero_()
+
+    def forward(self, x):
+        out = x
+        for layer in self.children():
+            out = layer(out)
+
+        out = x + out
+        return out
+
+
+class Block(nn.Module):
+    """Transformer blocks with support of window attention and residual propagation blocks"""
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        drop_path=0.0,
+        norm_layer=nn.LayerNorm,
+        act_layer=nn.GELU,
+        use_rel_pos=False,
+        rel_pos_zero_init=True,
+        window_size=0,
+        use_residual_block=False,
+        input_size=None,
+    ):
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads in each ViT block.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            drop_path (float): Stochastic depth rate.
+            norm_layer (nn.Module): Normalization layer.
+            act_layer (nn.Module): Activation layer.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            window_size (int): Window size for window attention blocks. If it equals 0, then not
+                use window attention.
+            use_residual_block (bool): If True, use a residual block after the MLP block.
+            input_size (int or None): Input resolution for calculating the relative positional
+                parameter size.
+        """
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            input_size=input_size if window_size == 0 else (window_size, window_size),
+        )
+
+        from timm.models.layers import DropPath, Mlp
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer)
+
+        self.window_size = window_size
+
+        self.use_residual_block = use_residual_block
+        if use_residual_block:
+            # Use a residual block with bottleneck channel as dim // 2
+            self.residual = ResBottleneckBlock(
+                in_channels=dim,
+                out_channels=dim,
+                bottleneck_channels=dim // 2,
+                norm="LN",
+                act_layer=act_layer,
+            )
+
+    def forward(self, x):
+        shortcut = x
+        x = self.norm1(x)
+        # Window partition
+        if self.window_size > 0:
+            H, W = x.shape[1], x.shape[2]
+            x, pad_hw = window_partition(x, self.window_size)
+
+        x = self.attn(x)
+        # Reverse window partition
+        if self.window_size > 0:
+            x = window_unpartition(x, self.window_size, pad_hw, (H, W))
+
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        if self.use_residual_block:
+            x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
+
+        return x
+
+
+class ViT(Backbone):
+    """
+    This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
+    "Exploring Plain Vision Transformer Backbones for Object Detection",
+    https://arxiv.org/abs/2203.16527
+    """
+
+    def __init__(
+        self,
+        img_size=1024,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        drop_path_rate=0.0,
+        norm_layer=nn.LayerNorm,
+        act_layer=nn.GELU,
+        use_abs_pos=True,
+        use_rel_pos=False,
+        rel_pos_zero_init=True,
+        window_size=0,
+        window_block_indexes=(),
+        residual_block_indexes=(),
+        use_act_checkpoint=False,
+        pretrain_img_size=224,
+        pretrain_use_cls_token=True,
+        out_feature="last_feat",
+    ):
+        """
+        Args:
+            img_size (int): Input image size.
+            patch_size (int): Patch size.
+            in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+            depth (int): Depth of ViT.
+            num_heads (int): Number of attention heads in each ViT block.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            drop_path_rate (float): Stochastic depth rate.
+            norm_layer (nn.Module): Normalization layer.
+            act_layer (nn.Module): Activation layer.
+            use_abs_pos (bool): If True, use absolute positional embeddings.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            window_size (int): Window size for window attention blocks.
+            window_block_indexes (list): Indexes for blocks using window attention.
+            residual_block_indexes (list): Indexes for blocks using conv propagation.
+            use_act_checkpoint (bool): If True, use activation checkpointing.
+            pretrain_img_size (int): input image size for pretraining models.
+            pretrain_use_cls_token (bool): If True, pretrainig models use class token.
+            out_feature (str): name of the feature from the last block.
+        """
+        super().__init__()
+        self.pretrain_use_cls_token = pretrain_use_cls_token
+
+        self.patch_embed = PatchEmbed(
+            kernel_size=(patch_size, patch_size),
+            stride=(patch_size, patch_size),
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+
+        if use_abs_pos:
+            # Initialize absolute positional embedding with pretrain image size.
+            num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
+            num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
+            self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
+        else:
+            self.pos_embed = None
+
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
+
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            block = Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                use_rel_pos=use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                window_size=window_size if i in window_block_indexes else 0,
+                use_residual_block=i in residual_block_indexes,
+                input_size=(img_size // patch_size, img_size // patch_size),
+            )
+            if use_act_checkpoint:
+                # TODO: use torch.utils.checkpoint
+                from fairscale.nn.checkpoint import checkpoint_wrapper
+
+                block = checkpoint_wrapper(block)
+            self.blocks.append(block)
+
+        self._out_feature_channels = {out_feature: embed_dim}
+        self._out_feature_strides = {out_feature: patch_size}
+        self._out_features = [out_feature]
+
+        if self.pos_embed is not None:
+            nn.init.trunc_normal_(self.pos_embed, std=0.02)
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def forward(self, x):
+        x = self.patch_embed(x)
+        if self.pos_embed is not None:
+            x = x + get_abs_pos(
+                self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
+            )
+
+        for blk in self.blocks:
+            x = blk(x)
+
+        outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)}
+        return outputs
+
+
+class SimpleFeaturePyramid(Backbone):
+    """
+    This module implements SimpleFeaturePyramid in :paper:`vitdet`.
+    It creates pyramid features built on top of the input feature map.
+    """
+
+    def __init__(
+        self,
+        net,
+        in_feature,
+        out_channels,
+        scale_factors,
+        top_block=None,
+        norm="LN",
+        square_pad=0,
+    ):
+        """
+        Args:
+            net (Backbone): module representing the subnetwork backbone.
+                Must be a subclass of :class:`Backbone`.
+            in_feature (str): names of the input feature maps coming
+                from the net.
+            out_channels (int): number of channels in the output feature maps.
+            scale_factors (list[float]): list of scaling factors to upsample or downsample
+                the input features for creating pyramid features.
+            top_block (nn.Module or None): if provided, an extra operation will
+                be performed on the output of the last (smallest resolution)
+                pyramid output, and the result will extend the result list. The top_block
+                further downsamples the feature map. It must have an attribute
+                "num_levels", meaning the number of extra pyramid levels added by
+                this block, and "in_feature", which is a string representing
+                its input feature (e.g., p5).
+            norm (str): the normalization to use.
+            square_pad (int): If > 0, require input images to be padded to specific square size.
+        """
+        super(SimpleFeaturePyramid, self).__init__()
+        assert isinstance(net, Backbone)
+
+        self.scale_factors = scale_factors
+
+        input_shapes = net.output_shape()
+        strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors]
+        _assert_strides_are_log2_contiguous(strides)
+
+        dim = input_shapes[in_feature].channels
+        self.stages = []
+        use_bias = norm == ""
+        for idx, scale in enumerate(scale_factors):
+            out_dim = dim
+            if scale == 4.0:
+                layers = [
+                    nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
+                    get_norm(norm, dim // 2),
+                    nn.GELU(),
+                    nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2),
+                ]
+                out_dim = dim // 4
+            elif scale == 2.0:
+                layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)]
+                out_dim = dim // 2
+            elif scale == 1.0:
+                layers = []
+            elif scale == 0.5:
+                layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
+            else:
+                raise NotImplementedError(f"scale_factor={scale} is not supported yet.")
+
+            layers.extend(
+                [
+                    Conv2d(
+                        out_dim,
+                        out_channels,
+                        kernel_size=1,
+                        bias=use_bias,
+                        norm=get_norm(norm, out_channels),
+                    ),
+                    Conv2d(
+                        out_channels,
+                        out_channels,
+                        kernel_size=3,
+                        padding=1,
+                        bias=use_bias,
+                        norm=get_norm(norm, out_channels),
+                    ),
+                ]
+            )
+            layers = nn.Sequential(*layers)
+
+            stage = int(math.log2(strides[idx]))
+            self.add_module(f"simfp_{stage}", layers)
+            self.stages.append(layers)
+
+        self.net = net
+        self.in_feature = in_feature
+        self.top_block = top_block
+        # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
+        self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
+        # top block output feature maps.
+        if self.top_block is not None:
+            for s in range(stage, stage + self.top_block.num_levels):
+                self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
+
+        self._out_features = list(self._out_feature_strides.keys())
+        self._out_feature_channels = {k: out_channels for k in self._out_features}
+        self._size_divisibility = strides[-1]
+        self._square_pad = square_pad
+
+    @property
+    def padding_constraints(self):
+        return {
+            "size_divisiblity": self._size_divisibility,
+            "square_size": self._square_pad,
+        }
+
+    def forward(self, x):
+        """
+        Args:
+            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
+
+        Returns:
+            dict[str->Tensor]:
+                mapping from feature map name to pyramid feature map tensor
+                in high to low resolution order. Returned feature names follow the FPN
+                convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
+                ["p2", "p3", ..., "p6"].
+        """
+        bottom_up_features = self.net(x)
+        features = bottom_up_features[self.in_feature]
+        results = []
+
+        for stage in self.stages:
+            results.append(stage(features))
+
+        if self.top_block is not None:
+            if self.top_block.in_feature in bottom_up_features:
+                top_block_in_feature = bottom_up_features[self.top_block.in_feature]
+            else:
+                top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
+            results.extend(self.top_block(top_block_in_feature))
+        assert len(self._out_features) == len(results)
+        return {f: res for f, res in zip(self._out_features, results)}
+
+
+def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
+    """
+    Calculate lr decay rate for different ViT blocks.
+    Args:
+        name (string): parameter name.
+        lr_decay_rate (float): base lr decay rate.
+        num_layers (int): number of ViT blocks.
+
+    Returns:
+        lr decay rate for the given parameter.
+    """
+    layer_id = num_layers + 1
+    if name.startswith("backbone"):
+        if ".pos_embed" in name or ".patch_embed" in name:
+            layer_id = 0
+        elif ".blocks." in name and ".residual." not in name:
+            layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1
+
+    return lr_decay_rate ** (num_layers + 1 - layer_id)
diff --git a/detectron2/modeling/box_regression.py b/detectron2/modeling/box_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..09fa097424d99518874c972efd3cdfb0bb96fd16
--- /dev/null
+++ b/detectron2/modeling/box_regression.py
@@ -0,0 +1,369 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import math
+from typing import List, Tuple, Union
+import torch
+from fvcore.nn import giou_loss, smooth_l1_loss
+from torch.nn import functional as F
+
+from detectron2.layers import cat, ciou_loss, diou_loss
+from detectron2.structures import Boxes
+
+# Value for clamping large dw and dh predictions. The heuristic is that we clamp
+# such that dw and dh are no larger than what would transform a 16px box into a
+# 1000px box (based on a small anchor, 16px, and a typical image size, 1000px).
+_DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
+
+
+__all__ = ["Box2BoxTransform", "Box2BoxTransformRotated", "Box2BoxTransformLinear"]
+
+
+@torch.jit.script
+class Box2BoxTransform:
+    """
+    The box-to-box transform defined in R-CNN. The transformation is parameterized
+    by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height
+    by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height).
+    """
+
+    def __init__(
+        self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP
+    ):
+        """
+        Args:
+            weights (4-element tuple): Scaling factors that are applied to the
+                (dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set
+                such that the deltas have unit variance; now they are treated as
+                hyperparameters of the system.
+            scale_clamp (float): When predicting deltas, the predicted box scaling
+                factors (dw and dh) are clamped such that they are <= scale_clamp.
+        """
+        self.weights = weights
+        self.scale_clamp = scale_clamp
+
+    def get_deltas(self, src_boxes, target_boxes):
+        """
+        Get box regression transformation deltas (dx, dy, dw, dh) that can be used
+        to transform the `src_boxes` into the `target_boxes`. That is, the relation
+        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
+        any delta is too large and is clamped).
+
+        Args:
+            src_boxes (Tensor): source boxes, e.g., object proposals
+            target_boxes (Tensor): target of the transformation, e.g., ground-truth
+                boxes.
+        """
+        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
+        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
+
+        src_widths = src_boxes[:, 2] - src_boxes[:, 0]
+        src_heights = src_boxes[:, 3] - src_boxes[:, 1]
+        src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
+        src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights
+
+        target_widths = target_boxes[:, 2] - target_boxes[:, 0]
+        target_heights = target_boxes[:, 3] - target_boxes[:, 1]
+        target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths
+        target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights
+
+        wx, wy, ww, wh = self.weights
+        dx = wx * (target_ctr_x - src_ctr_x) / src_widths
+        dy = wy * (target_ctr_y - src_ctr_y) / src_heights
+        dw = ww * torch.log(target_widths / src_widths)
+        dh = wh * torch.log(target_heights / src_heights)
+
+        deltas = torch.stack((dx, dy, dw, dh), dim=1)
+        assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!"
+        return deltas
+
+    def apply_deltas(self, deltas, boxes):
+        """
+        Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.
+
+        Args:
+            deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
+                deltas[i] represents k potentially different class-specific
+                box transformations for the single box boxes[i].
+            boxes (Tensor): boxes to transform, of shape (N, 4)
+        """
+        deltas = deltas.float()  # ensure fp32 for decoding precision
+        boxes = boxes.to(deltas.dtype)
+
+        widths = boxes[:, 2] - boxes[:, 0]
+        heights = boxes[:, 3] - boxes[:, 1]
+        ctr_x = boxes[:, 0] + 0.5 * widths
+        ctr_y = boxes[:, 1] + 0.5 * heights
+
+        wx, wy, ww, wh = self.weights
+        dx = deltas[:, 0::4] / wx
+        dy = deltas[:, 1::4] / wy
+        dw = deltas[:, 2::4] / ww
+        dh = deltas[:, 3::4] / wh
+
+        # Prevent sending too large values into torch.exp()
+        dw = torch.clamp(dw, max=self.scale_clamp)
+        dh = torch.clamp(dh, max=self.scale_clamp)
+
+        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
+        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
+        pred_w = torch.exp(dw) * widths[:, None]
+        pred_h = torch.exp(dh) * heights[:, None]
+
+        x1 = pred_ctr_x - 0.5 * pred_w
+        y1 = pred_ctr_y - 0.5 * pred_h
+        x2 = pred_ctr_x + 0.5 * pred_w
+        y2 = pred_ctr_y + 0.5 * pred_h
+        pred_boxes = torch.stack((x1, y1, x2, y2), dim=-1)
+        return pred_boxes.reshape(deltas.shape)
+
+
+# @torch.jit.script
+class Box2BoxTransformRotated:
+    """
+    The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized
+    by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height
+    by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height),
+    and rotate a box's angle by da (radians).
+    Note: angles of deltas are in radians while angles of boxes are in degrees.
+    """
+
+    def __init__(
+        self,
+        weights: Tuple[float, float, float, float, float],
+        scale_clamp: float = _DEFAULT_SCALE_CLAMP,
+    ):
+        """
+        Args:
+            weights (5-element tuple): Scaling factors that are applied to the
+                (dx, dy, dw, dh, da) deltas. These are treated as
+                hyperparameters of the system.
+            scale_clamp (float): When predicting deltas, the predicted box scaling
+                factors (dw and dh) are clamped such that they are <= scale_clamp.
+        """
+        self.weights = weights
+        self.scale_clamp = scale_clamp
+
+    def get_deltas(self, src_boxes, target_boxes):
+        """
+        Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used
+        to transform the `src_boxes` into the `target_boxes`. That is, the relation
+        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
+        any delta is too large and is clamped).
+
+        Args:
+            src_boxes (Tensor): Nx5 source boxes, e.g., object proposals
+            target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth
+                boxes.
+        """
+        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
+        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
+
+        src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1)
+
+        target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind(
+            target_boxes, dim=1
+        )
+
+        wx, wy, ww, wh, wa = self.weights
+        dx = wx * (target_ctr_x - src_ctr_x) / src_widths
+        dy = wy * (target_ctr_y - src_ctr_y) / src_heights
+        dw = ww * torch.log(target_widths / src_widths)
+        dh = wh * torch.log(target_heights / src_heights)
+        # Angles of deltas are in radians while angles of boxes are in degrees.
+        # the conversion to radians serve as a way to normalize the values
+        da = target_angles - src_angles
+        da = (da + 180.0) % 360.0 - 180.0  # make it in [-180, 180)
+        da *= wa * math.pi / 180.0
+
+        deltas = torch.stack((dx, dy, dw, dh, da), dim=1)
+        assert (
+            (src_widths > 0).all().item()
+        ), "Input boxes to Box2BoxTransformRotated are not valid!"
+        return deltas
+
+    def apply_deltas(self, deltas, boxes):
+        """
+        Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`.
+
+        Args:
+            deltas (Tensor): transformation deltas of shape (N, k*5).
+                deltas[i] represents box transformation for the single box boxes[i].
+            boxes (Tensor): boxes to transform, of shape (N, 5)
+        """
+        assert deltas.shape[1] % 5 == 0 and boxes.shape[1] == 5
+
+        boxes = boxes.to(deltas.dtype).unsqueeze(2)
+
+        ctr_x = boxes[:, 0]
+        ctr_y = boxes[:, 1]
+        widths = boxes[:, 2]
+        heights = boxes[:, 3]
+        angles = boxes[:, 4]
+
+        wx, wy, ww, wh, wa = self.weights
+
+        dx = deltas[:, 0::5] / wx
+        dy = deltas[:, 1::5] / wy
+        dw = deltas[:, 2::5] / ww
+        dh = deltas[:, 3::5] / wh
+        da = deltas[:, 4::5] / wa
+
+        # Prevent sending too large values into torch.exp()
+        dw = torch.clamp(dw, max=self.scale_clamp)
+        dh = torch.clamp(dh, max=self.scale_clamp)
+
+        pred_boxes = torch.zeros_like(deltas)
+        pred_boxes[:, 0::5] = dx * widths + ctr_x  # x_ctr
+        pred_boxes[:, 1::5] = dy * heights + ctr_y  # y_ctr
+        pred_boxes[:, 2::5] = torch.exp(dw) * widths  # width
+        pred_boxes[:, 3::5] = torch.exp(dh) * heights  # height
+
+        # Following original RRPN implementation,
+        # angles of deltas are in radians while angles of boxes are in degrees.
+        pred_angle = da * 180.0 / math.pi + angles
+        pred_angle = (pred_angle + 180.0) % 360.0 - 180.0  # make it in [-180, 180)
+
+        pred_boxes[:, 4::5] = pred_angle
+
+        return pred_boxes
+
+
+class Box2BoxTransformLinear:
+    """
+    The linear box-to-box transform defined in FCOS. The transformation is parameterized
+    by the distance from the center of (square) src box to 4 edges of the target box.
+    """
+
+    def __init__(self, normalize_by_size=True):
+        """
+        Args:
+            normalize_by_size: normalize deltas by the size of src (anchor) boxes.
+        """
+        self.normalize_by_size = normalize_by_size
+
+    def get_deltas(self, src_boxes, target_boxes):
+        """
+        Get box regression transformation deltas (dx1, dy1, dx2, dy2) that can be used
+        to transform the `src_boxes` into the `target_boxes`. That is, the relation
+        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true.
+        The center of src must be inside target boxes.
+
+        Args:
+            src_boxes (Tensor): square source boxes, e.g., anchors
+            target_boxes (Tensor): target of the transformation, e.g., ground-truth
+                boxes.
+        """
+        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
+        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
+
+        src_ctr_x = 0.5 * (src_boxes[:, 0] + src_boxes[:, 2])
+        src_ctr_y = 0.5 * (src_boxes[:, 1] + src_boxes[:, 3])
+
+        target_l = src_ctr_x - target_boxes[:, 0]
+        target_t = src_ctr_y - target_boxes[:, 1]
+        target_r = target_boxes[:, 2] - src_ctr_x
+        target_b = target_boxes[:, 3] - src_ctr_y
+
+        deltas = torch.stack((target_l, target_t, target_r, target_b), dim=1)
+        if self.normalize_by_size:
+            stride_w = src_boxes[:, 2] - src_boxes[:, 0]
+            stride_h = src_boxes[:, 3] - src_boxes[:, 1]
+            strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
+            deltas = deltas / strides
+
+        return deltas
+
+    def apply_deltas(self, deltas, boxes):
+        """
+        Apply transformation `deltas` (dx1, dy1, dx2, dy2) to `boxes`.
+
+        Args:
+            deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
+                deltas[i] represents k potentially different class-specific
+                box transformations for the single box boxes[i].
+            boxes (Tensor): boxes to transform, of shape (N, 4)
+        """
+        # Ensure the output is a valid box. See Sec 2.1 of https://arxiv.org/abs/2006.09214
+        deltas = F.relu(deltas)
+        boxes = boxes.to(deltas.dtype)
+
+        ctr_x = 0.5 * (boxes[:, 0] + boxes[:, 2])
+        ctr_y = 0.5 * (boxes[:, 1] + boxes[:, 3])
+        if self.normalize_by_size:
+            stride_w = boxes[:, 2] - boxes[:, 0]
+            stride_h = boxes[:, 3] - boxes[:, 1]
+            strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
+            deltas = deltas * strides
+
+        l = deltas[:, 0::4]
+        t = deltas[:, 1::4]
+        r = deltas[:, 2::4]
+        b = deltas[:, 3::4]
+
+        pred_boxes = torch.zeros_like(deltas)
+        pred_boxes[:, 0::4] = ctr_x[:, None] - l  # x1
+        pred_boxes[:, 1::4] = ctr_y[:, None] - t  # y1
+        pred_boxes[:, 2::4] = ctr_x[:, None] + r  # x2
+        pred_boxes[:, 3::4] = ctr_y[:, None] + b  # y2
+        return pred_boxes
+
+
+def _dense_box_regression_loss(
+    anchors: List[Union[Boxes, torch.Tensor]],
+    box2box_transform: Box2BoxTransform,
+    pred_anchor_deltas: List[torch.Tensor],
+    gt_boxes: List[torch.Tensor],
+    fg_mask: torch.Tensor,
+    box_reg_loss_type="smooth_l1",
+    smooth_l1_beta=0.0,
+):
+    """
+    Compute loss for dense multi-level box regression.
+    Loss is accumulated over ``fg_mask``.
+
+    Args:
+        anchors: #lvl anchor boxes, each is (HixWixA, 4)
+        pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4)
+        gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A))
+        fg_mask: the foreground boolean mask of shape (N, R) to compute loss on
+        box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou",
+            "diou", "ciou".
+        smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
+            use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
+    """
+    if isinstance(anchors[0], Boxes):
+        anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
+    else:
+        anchors = cat(anchors)
+    if box_reg_loss_type == "smooth_l1":
+        gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
+        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)
+        loss_box_reg = smooth_l1_loss(
+            cat(pred_anchor_deltas, dim=1)[fg_mask],
+            gt_anchor_deltas[fg_mask],
+            beta=smooth_l1_beta,
+            reduction="sum",
+        )
+    elif box_reg_loss_type == "giou":
+        pred_boxes = [
+            box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
+        ]
+        loss_box_reg = giou_loss(
+            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
+        )
+    elif box_reg_loss_type == "diou":
+        pred_boxes = [
+            box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
+        ]
+        loss_box_reg = diou_loss(
+            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
+        )
+    elif box_reg_loss_type == "ciou":
+        pred_boxes = [
+            box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
+        ]
+        loss_box_reg = ciou_loss(
+            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
+        )
+    else:
+        raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")
+    return loss_box_reg
diff --git a/detectron2/modeling/matcher.py b/detectron2/modeling/matcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..be8abc770e2ec1f5912688d1541509c3a799bddc
--- /dev/null
+++ b/detectron2/modeling/matcher.py
@@ -0,0 +1,127 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import List
+import torch
+
+from detectron2.layers import nonzero_tuple
+
+
+# TODO: the name is too general
+class Matcher:
+    """
+    This class assigns to each predicted "element" (e.g., a box) a ground-truth
+    element. Each predicted element will have exactly zero or one matches; each
+    ground-truth element may be matched to zero or more predicted elements.
+
+    The matching is determined by the MxN match_quality_matrix, that characterizes
+    how well each (ground-truth, prediction)-pair match each other. For example,
+    if the elements are boxes, this matrix may contain box intersection-over-union
+    overlap values.
+
+    The matcher returns (a) a vector of length N containing the index of the
+    ground-truth element m in [0, M) that matches to prediction n in [0, N).
+    (b) a vector of length N containing the labels for each prediction.
+    """
+
+    def __init__(
+        self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False
+    ):
+        """
+        Args:
+            thresholds (list): a list of thresholds used to stratify predictions
+                into levels.
+            labels (list): a list of values to label predictions belonging at
+                each level. A label can be one of {-1, 0, 1} signifying
+                {ignore, negative class, positive class}, respectively.
+            allow_low_quality_matches (bool): if True, produce additional matches
+                for predictions with maximum match quality lower than high_threshold.
+                See set_low_quality_matches_ for more details.
+
+            For example,
+                thresholds = [0.3, 0.5]
+                labels = [0, -1, 1]
+                All predictions with iou < 0.3 will be marked with 0 and
+                thus will be considered as false positives while training.
+                All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
+                thus will be ignored.
+                All predictions with 0.5 <= iou will be marked with 1 and
+                thus will be considered as true positives.
+        """
+        # Add -inf and +inf to first and last position in thresholds
+        thresholds = thresholds[:]
+        assert thresholds[0] > 0
+        thresholds.insert(0, -float("inf"))
+        thresholds.append(float("inf"))
+        # Currently torchscript does not support all + generator
+        assert all([low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])])
+        assert all([l in [-1, 0, 1] for l in labels])
+        assert len(labels) == len(thresholds) - 1
+        self.thresholds = thresholds
+        self.labels = labels
+        self.allow_low_quality_matches = allow_low_quality_matches
+
+    def __call__(self, match_quality_matrix):
+        """
+        Args:
+            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
+                pairwise quality between M ground-truth elements and N predicted
+                elements. All elements must be >= 0 (due to the us of `torch.nonzero`
+                for selecting indices in :meth:`set_low_quality_matches_`).
+
+        Returns:
+            matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
+                ground-truth index in [0, M)
+            match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
+                whether a prediction is a true or false positive or ignored
+        """
+        assert match_quality_matrix.dim() == 2
+        if match_quality_matrix.numel() == 0:
+            default_matches = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), 0, dtype=torch.int64
+            )
+            # When no gt boxes exist, we define IOU = 0 and therefore set labels
+            # to `self.labels[0]`, which usually defaults to background class 0
+            # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
+            default_match_labels = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
+            )
+            return default_matches, default_match_labels
+
+        assert torch.all(match_quality_matrix >= 0)
+
+        # match_quality_matrix is M (gt) x N (predicted)
+        # Max over gt elements (dim 0) to find best gt candidate for each prediction
+        matched_vals, matches = match_quality_matrix.max(dim=0)
+
+        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
+
+        for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
+            low_high = (matched_vals >= low) & (matched_vals < high)
+            match_labels[low_high] = l
+
+        if self.allow_low_quality_matches:
+            self.set_low_quality_matches_(match_labels, match_quality_matrix)
+
+        return matches, match_labels
+
+    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
+        """
+        Produce additional matches for predictions that have only low-quality matches.
+        Specifically, for each ground-truth G find the set of predictions that have
+        maximum overlap with it (including ties); for each prediction in that set, if
+        it is unmatched, then match it to the ground-truth G.
+
+        This function implements the RPN assignment case (i) in Sec. 3.1.2 of
+        :paper:`Faster R-CNN`.
+        """
+        # For each gt, find the prediction with which it has highest quality
+        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
+        # Find the highest quality match available, even if it is low, including ties.
+        # Note that the matches qualities must be positive due to the use of
+        # `torch.nonzero`.
+        _, pred_inds_with_highest_quality = nonzero_tuple(
+            match_quality_matrix == highest_quality_foreach_gt[:, None]
+        )
+        # If an anchor was labeled positive only due to a low-quality match
+        # with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B.
+        # This follows the implementation in Detectron, and is found to have no significant impact.
+        match_labels[pred_inds_with_highest_quality] = 1
diff --git a/detectron2/modeling/meta_arch/__init__.py b/detectron2/modeling/meta_arch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b0668157052ce7b796ef50bc7ee85361e7605b9
--- /dev/null
+++ b/detectron2/modeling/meta_arch/__init__.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from .build import META_ARCH_REGISTRY, build_model  # isort:skip
+
+from .panoptic_fpn import PanopticFPN
+
+# import all the meta_arch, so they will be registered
+from .rcnn import GeneralizedRCNN, ProposalNetwork
+from .dense_detector import DenseDetector
+from .retinanet import RetinaNet
+from .fcos import FCOS
+from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head
+
+
+__all__ = list(globals().keys())
diff --git a/detectron2/modeling/meta_arch/__pycache__/__init__.cpython-310.pyc b/detectron2/modeling/meta_arch/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e9d1c0799301392ec78ea16299dbf60b62a29b3
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/__init__.cpython-39.pyc b/detectron2/modeling/meta_arch/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b24bb4965fe460ed6920cc09705353cf8cc2c54c
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/build.cpython-310.pyc b/detectron2/modeling/meta_arch/__pycache__/build.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f9b572a6a062d03025b1ed6dc4a9e9d8474df68
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/build.cpython-310.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/build.cpython-39.pyc b/detectron2/modeling/meta_arch/__pycache__/build.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b5aa4f82e5fcef370f9c10b9e5539c29b9a1fd6
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/build.cpython-39.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/dense_detector.cpython-310.pyc b/detectron2/modeling/meta_arch/__pycache__/dense_detector.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60fca7f3a909171db36bcdc609ce16b030a94223
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/dense_detector.cpython-310.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/dense_detector.cpython-39.pyc b/detectron2/modeling/meta_arch/__pycache__/dense_detector.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8297bce599328e3bbf4419fc0d4c36b4fa993e56
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/dense_detector.cpython-39.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/fcos.cpython-310.pyc b/detectron2/modeling/meta_arch/__pycache__/fcos.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..471d1a81ce033247a1cdd1dd2954f064f000835b
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/fcos.cpython-310.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/fcos.cpython-39.pyc b/detectron2/modeling/meta_arch/__pycache__/fcos.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00fdf1ae3bccc5946bc1274f566c753fbe107ebd
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/fcos.cpython-39.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/panoptic_fpn.cpython-310.pyc b/detectron2/modeling/meta_arch/__pycache__/panoptic_fpn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c5931cc97f62806bff1e9ecfa542ebe405c4190
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/panoptic_fpn.cpython-310.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/panoptic_fpn.cpython-39.pyc b/detectron2/modeling/meta_arch/__pycache__/panoptic_fpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..921689ab215455fec233035e0e9e5646839739b9
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/panoptic_fpn.cpython-39.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/rcnn.cpython-310.pyc b/detectron2/modeling/meta_arch/__pycache__/rcnn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42810c5a57512f3ea25f689e434023b4463b4e26
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/rcnn.cpython-310.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/rcnn.cpython-39.pyc b/detectron2/modeling/meta_arch/__pycache__/rcnn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0641c0424a135f9fdb6a25d65ac7dd725414236
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/rcnn.cpython-39.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/retinanet.cpython-310.pyc b/detectron2/modeling/meta_arch/__pycache__/retinanet.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..074ee74cc7761aeb444e48ec328814151274f233
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/retinanet.cpython-310.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/retinanet.cpython-39.pyc b/detectron2/modeling/meta_arch/__pycache__/retinanet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19a8aa5d045fcf0c4db1d9da72b571d884b40720
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/retinanet.cpython-39.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/semantic_seg.cpython-310.pyc b/detectron2/modeling/meta_arch/__pycache__/semantic_seg.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6676254f1899bab772513508868b8afb50d277f8
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/semantic_seg.cpython-310.pyc differ
diff --git a/detectron2/modeling/meta_arch/__pycache__/semantic_seg.cpython-39.pyc b/detectron2/modeling/meta_arch/__pycache__/semantic_seg.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1707b0776d34a85eefda52098a3488f7713530cb
Binary files /dev/null and b/detectron2/modeling/meta_arch/__pycache__/semantic_seg.cpython-39.pyc differ
diff --git a/detectron2/modeling/meta_arch/build.py b/detectron2/modeling/meta_arch/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..3427215746c9a146bd902f22ea9b26d121c36b27
--- /dev/null
+++ b/detectron2/modeling/meta_arch/build.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import torch
+
+from detectron2.utils.logger import _log_api_usage
+from detectron2.utils.registry import Registry
+
+META_ARCH_REGISTRY = Registry("META_ARCH")  # noqa F401 isort:skip
+META_ARCH_REGISTRY.__doc__ = """
+Registry for meta-architectures, i.e. the whole model.
+
+The registered object will be called with `obj(cfg)`
+and expected to return a `nn.Module` object.
+"""
+
+
+def build_model(cfg):
+    """
+    Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
+    Note that it does not load any weights from ``cfg``.
+    """
+    meta_arch = cfg.MODEL.META_ARCHITECTURE
+    model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
+    model.to(torch.device(cfg.MODEL.DEVICE))
+    _log_api_usage("modeling.meta_arch." + meta_arch)
+    return model
diff --git a/detectron2/modeling/meta_arch/dense_detector.py b/detectron2/modeling/meta_arch/dense_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..74456f95666b89e5507627427bf1b3b8aea9c69f
--- /dev/null
+++ b/detectron2/modeling/meta_arch/dense_detector.py
@@ -0,0 +1,294 @@
+import numpy as np
+from typing import Dict, List, Optional, Tuple
+import torch
+from torch import Tensor, nn
+
+from detectron2.data.detection_utils import convert_image_to_rgb
+from detectron2.layers import move_device_like
+from detectron2.modeling import Backbone
+from detectron2.structures import Boxes, ImageList, Instances
+from detectron2.utils.events import get_event_storage
+
+from ..postprocessing import detector_postprocess
+
+
+def permute_to_N_HWA_K(tensor, K: int):
+    """
+    Transpose/reshape a tensor from (N, (Ai x K), H, W) to (N, (HxWxAi), K)
+    """
+    assert tensor.dim() == 4, tensor.shape
+    N, _, H, W = tensor.shape
+    tensor = tensor.view(N, -1, K, H, W)
+    tensor = tensor.permute(0, 3, 4, 1, 2)
+    tensor = tensor.reshape(N, -1, K)  # Size=(N,HWA,K)
+    return tensor
+
+
+class DenseDetector(nn.Module):
+    """
+    Base class for dense detector. We define a dense detector as a fully-convolutional model that
+    makes per-pixel (i.e. dense) predictions.
+    """
+
+    def __init__(
+        self,
+        backbone: Backbone,
+        head: nn.Module,
+        head_in_features: Optional[List[str]] = None,
+        *,
+        pixel_mean,
+        pixel_std,
+    ):
+        """
+        Args:
+            backbone: backbone module
+            head: head module
+            head_in_features: backbone features to use in head. Default to all backbone features.
+            pixel_mean (Tuple[float]):
+                Values to be used for image normalization (BGR order).
+                To train on images of different number of channels, set different mean & std.
+                Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
+            pixel_std (Tuple[float]):
+                When using pre-trained models in Detectron1 or any MSRA models,
+                std has been absorbed into its conv1 weights, so the std needs to be set 1.
+                Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
+        """
+        super().__init__()
+
+        self.backbone = backbone
+        self.head = head
+        if head_in_features is None:
+            shapes = self.backbone.output_shape()
+            self.head_in_features = sorted(shapes.keys(), key=lambda x: shapes[x].stride)
+        else:
+            self.head_in_features = head_in_features
+        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
+        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def _move_to_current_device(self, x):
+        return move_device_like(x, self.pixel_mean)
+
+    def forward(self, batched_inputs: List[Dict[str, Tensor]]):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
+                Each item in the list contains the inputs for one image.
+                For now, each item in the list is a dict that contains:
+
+                * image: Tensor, image in (C, H, W) format.
+                * instances: Instances
+
+                Other information that's included in the original dicts, such as:
+
+                * "height", "width" (int): the output resolution of the model, used in inference.
+                  See :meth:`postprocess` for details.
+
+        Returns:
+            In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the
+            loss. Used during training only. In inference, the standard output format, described
+            in :doc:`/tutorials/models`.
+        """
+        images = self.preprocess_image(batched_inputs)
+        features = self.backbone(images.tensor)
+        features = [features[f] for f in self.head_in_features]
+        predictions = self.head(features)
+
+        if self.training:
+            assert not torch.jit.is_scripting(), "Not supported"
+            assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+            return self.forward_training(images, features, predictions, gt_instances)
+        else:
+            results = self.forward_inference(images, features, predictions)
+            if torch.jit.is_scripting():
+                return results
+
+            processed_results = []
+            for results_per_image, input_per_image, image_size in zip(
+                results, batched_inputs, images.image_sizes
+            ):
+                height = input_per_image.get("height", image_size[0])
+                width = input_per_image.get("width", image_size[1])
+                r = detector_postprocess(results_per_image, height, width)
+                processed_results.append({"instances": r})
+            return processed_results
+
+    def forward_training(self, images, features, predictions, gt_instances):
+        raise NotImplementedError()
+
+    def preprocess_image(self, batched_inputs: List[Dict[str, Tensor]]):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(
+            images,
+            self.backbone.size_divisibility,
+            padding_constraints=self.backbone.padding_constraints,
+        )
+        return images
+
+    def _transpose_dense_predictions(
+        self, predictions: List[List[Tensor]], dims_per_anchor: List[int]
+    ) -> List[List[Tensor]]:
+        """
+        Transpose the dense per-level predictions.
+
+        Args:
+            predictions: a list of outputs, each is a list of per-level
+                predictions with shape (N, Ai x K, Hi, Wi), where N is the
+                number of images, Ai is the number of anchors per location on
+                level i, K is the dimension of predictions per anchor.
+            dims_per_anchor: the value of K for each predictions. e.g. 4 for
+                box prediction, #classes for classification prediction.
+
+        Returns:
+            List[List[Tensor]]: each prediction is transposed to (N, Hi x Wi x Ai, K).
+        """
+        assert len(predictions) == len(dims_per_anchor)
+        res: List[List[Tensor]] = []
+        for pred, dim_per_anchor in zip(predictions, dims_per_anchor):
+            pred = [permute_to_N_HWA_K(x, dim_per_anchor) for x in pred]
+            res.append(pred)
+        return res
+
+    def _ema_update(self, name: str, value: float, initial_value: float, momentum: float = 0.9):
+        """
+        Apply EMA update to `self.name` using `value`.
+
+        This is mainly used for loss normalizer. In Detectron1, loss is normalized by number
+        of foreground samples in the batch. When batch size is 1 per GPU, #foreground has a
+        large variance and using it lead to lower performance. Therefore we maintain an EMA of
+        #foreground to stabilize the normalizer.
+
+        Args:
+            name: name of the normalizer
+            value: the new value to update
+            initial_value: the initial value to start with
+            momentum: momentum of EMA
+
+        Returns:
+            float: the updated EMA value
+        """
+        if hasattr(self, name):
+            old = getattr(self, name)
+        else:
+            old = initial_value
+        new = old * momentum + value * (1 - momentum)
+        setattr(self, name, new)
+        return new
+
+    def _decode_per_level_predictions(
+        self,
+        anchors: Boxes,
+        pred_scores: Tensor,
+        pred_deltas: Tensor,
+        score_thresh: float,
+        topk_candidates: int,
+        image_size: Tuple[int, int],
+    ) -> Instances:
+        """
+        Decode boxes and classification predictions of one featuer level, by
+        the following steps:
+        1. filter the predictions based on score threshold and top K scores.
+        2. transform the box regression outputs
+        3. return the predicted scores, classes and boxes
+
+        Args:
+            anchors: Boxes, anchor for this feature level
+            pred_scores: HxWxA,K
+            pred_deltas: HxWxA,4
+
+        Returns:
+            Instances: with field "scores", "pred_boxes", "pred_classes".
+        """
+        # Apply two filtering to make NMS faster.
+        # 1. Keep boxes with confidence score higher than threshold
+        keep_idxs = pred_scores > score_thresh
+        pred_scores = pred_scores[keep_idxs]
+        topk_idxs = torch.nonzero(keep_idxs)  # Kx2
+
+        # 2. Keep top k top scoring boxes only
+        topk_idxs_size = topk_idxs.shape[0]
+        if isinstance(topk_idxs_size, Tensor):
+            # It's a tensor in tracing
+            num_topk = torch.clamp(topk_idxs_size, max=topk_candidates)
+        else:
+            num_topk = min(topk_idxs_size, topk_candidates)
+        pred_scores, idxs = pred_scores.topk(num_topk)
+        topk_idxs = topk_idxs[idxs]
+
+        anchor_idxs, classes_idxs = topk_idxs.unbind(dim=1)
+
+        pred_boxes = self.box2box_transform.apply_deltas(
+            pred_deltas[anchor_idxs], anchors.tensor[anchor_idxs]
+        )
+        return Instances(
+            image_size, pred_boxes=Boxes(pred_boxes), scores=pred_scores, pred_classes=classes_idxs
+        )
+
+    def _decode_multi_level_predictions(
+        self,
+        anchors: List[Boxes],
+        pred_scores: List[Tensor],
+        pred_deltas: List[Tensor],
+        score_thresh: float,
+        topk_candidates: int,
+        image_size: Tuple[int, int],
+    ) -> Instances:
+        """
+        Run `_decode_per_level_predictions` for all feature levels and concat the results.
+        """
+        predictions = [
+            self._decode_per_level_predictions(
+                anchors_i,
+                box_cls_i,
+                box_reg_i,
+                score_thresh,
+                topk_candidates,
+                image_size,
+            )
+            # Iterate over every feature level
+            for box_cls_i, box_reg_i, anchors_i in zip(pred_scores, pred_deltas, anchors)
+        ]
+        return predictions[0].cat(predictions)  # 'Instances.cat' is not scriptale but this is
+
+    def visualize_training(self, batched_inputs, results):
+        """
+        A function used to visualize ground truth images and final network predictions.
+        It shows ground truth bounding boxes on the original image and up to 20
+        predicted object bounding boxes on the original image.
+
+        Args:
+            batched_inputs (list): a list that contains input to the model.
+            results (List[Instances]): a list of #images elements returned by forward_inference().
+        """
+        from detectron2.utils.visualizer import Visualizer
+
+        assert len(batched_inputs) == len(
+            results
+        ), "Cannot visualize inputs and results of different sizes"
+        storage = get_event_storage()
+        max_boxes = 20
+
+        image_index = 0  # only visualize a single image
+        img = batched_inputs[image_index]["image"]
+        img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
+        v_gt = Visualizer(img, None)
+        v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes)
+        anno_img = v_gt.get_image()
+        processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
+        predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy()
+
+        v_pred = Visualizer(img, None)
+        v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes])
+        prop_img = v_pred.get_image()
+        vis_img = np.vstack((anno_img, prop_img))
+        vis_img = vis_img.transpose(2, 0, 1)
+        vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
+        storage.put_image(vis_name, vis_img)
diff --git a/detectron2/modeling/meta_arch/fcos.py b/detectron2/modeling/meta_arch/fcos.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e7140bfa04a8e8bb199a800805cbaf22fdd8f32
--- /dev/null
+++ b/detectron2/modeling/meta_arch/fcos.py
@@ -0,0 +1,328 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import logging
+from typing import List, Optional, Tuple
+import torch
+from fvcore.nn import sigmoid_focal_loss_jit
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import ShapeSpec, batched_nms
+from detectron2.structures import Boxes, ImageList, Instances, pairwise_point_box_distance
+from detectron2.utils.events import get_event_storage
+
+from ..anchor_generator import DefaultAnchorGenerator
+from ..backbone import Backbone
+from ..box_regression import Box2BoxTransformLinear, _dense_box_regression_loss
+from .dense_detector import DenseDetector
+from .retinanet import RetinaNetHead
+
+__all__ = ["FCOS"]
+
+logger = logging.getLogger(__name__)
+
+
+class FCOS(DenseDetector):
+    """
+    Implement FCOS in :paper:`fcos`.
+    """
+
+    def __init__(
+        self,
+        *,
+        backbone: Backbone,
+        head: nn.Module,
+        head_in_features: Optional[List[str]] = None,
+        box2box_transform=None,
+        num_classes,
+        center_sampling_radius: float = 1.5,
+        focal_loss_alpha=0.25,
+        focal_loss_gamma=2.0,
+        test_score_thresh=0.2,
+        test_topk_candidates=1000,
+        test_nms_thresh=0.6,
+        max_detections_per_image=100,
+        pixel_mean,
+        pixel_std,
+    ):
+        """
+        Args:
+            center_sampling_radius: radius of the "center" of a groundtruth box,
+                within which all anchor points are labeled positive.
+            Other arguments mean the same as in :class:`RetinaNet`.
+        """
+        super().__init__(
+            backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
+        )
+
+        self.num_classes = num_classes
+
+        # FCOS uses one anchor point per location.
+        # We represent the anchor point by a box whose size equals the anchor stride.
+        feature_shapes = backbone.output_shape()
+        fpn_strides = [feature_shapes[k].stride for k in self.head_in_features]
+        self.anchor_generator = DefaultAnchorGenerator(
+            sizes=[[k] for k in fpn_strides], aspect_ratios=[1.0], strides=fpn_strides
+        )
+
+        # FCOS parameterizes box regression by a linear transform,
+        # where predictions are normalized by anchor stride (equal to anchor size).
+        if box2box_transform is None:
+            box2box_transform = Box2BoxTransformLinear(normalize_by_size=True)
+        self.box2box_transform = box2box_transform
+
+        self.center_sampling_radius = float(center_sampling_radius)
+
+        # Loss parameters:
+        self.focal_loss_alpha = focal_loss_alpha
+        self.focal_loss_gamma = focal_loss_gamma
+
+        # Inference parameters:
+        self.test_score_thresh = test_score_thresh
+        self.test_topk_candidates = test_topk_candidates
+        self.test_nms_thresh = test_nms_thresh
+        self.max_detections_per_image = max_detections_per_image
+
+    def forward_training(self, images, features, predictions, gt_instances):
+        # Transpose the Hi*Wi*A dimension to the middle:
+        pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
+            predictions, [self.num_classes, 4, 1]
+        )
+        anchors = self.anchor_generator(features)
+        gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
+        return self.losses(
+            anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
+        )
+
+    @torch.no_grad()
+    def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]):
+        """
+        Match ground-truth boxes to a set of multi-level anchors.
+
+        Args:
+            gt_boxes: Ground-truth boxes from instances of an image.
+            anchors: List of anchors for each feature map (of different scales).
+
+        Returns:
+            torch.Tensor
+                A tensor of shape `(M, R)`, given `M` ground-truth boxes and total
+                `R` anchor points from all feature levels, indicating the quality
+                of match between m-th box and r-th anchor. Higher value indicates
+                better match.
+        """
+        # Naming convention: (M = ground-truth boxes, R = anchor points)
+        # Anchor points are represented as square boxes of size = stride.
+        num_anchors_per_level = [len(x) for x in anchors]
+        anchors = Boxes.cat(anchors)  # (R, 4)
+        anchor_centers = anchors.get_centers()  # (R, 2)
+        anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0]  # (R, )
+
+        lower_bound = anchor_sizes * 4
+        lower_bound[: num_anchors_per_level[0]] = 0
+        upper_bound = anchor_sizes * 8
+        upper_bound[-num_anchors_per_level[-1] :] = float("inf")
+
+        gt_centers = gt_boxes.get_centers()
+
+        # FCOS with center sampling: anchor point must be close enough to
+        # ground-truth box center.
+        center_dists = (anchor_centers[None, :, :] - gt_centers[:, None, :]).abs_()
+        sampling_regions = self.center_sampling_radius * anchor_sizes[None, :]
+
+        match_quality_matrix = center_dists.max(dim=2).values < sampling_regions
+
+        pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes)
+        pairwise_dist = pairwise_dist.permute(1, 0, 2)  # (M, R, 4)
+
+        # The original FCOS anchor matching rule: anchor point must be inside GT.
+        match_quality_matrix &= pairwise_dist.min(dim=2).values > 0
+
+        # Multilevel anchor matching in FCOS: each anchor is only responsible
+        # for certain scale range.
+        pairwise_dist = pairwise_dist.max(dim=2).values
+        match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & (
+            pairwise_dist < upper_bound[None, :]
+        )
+        # Match the GT box with minimum area, if there are multiple GT matches.
+        gt_areas = gt_boxes.area()  # (M, )
+
+        match_quality_matrix = match_quality_matrix.to(torch.float32)
+        match_quality_matrix *= 1e8 - gt_areas[:, None]
+        return match_quality_matrix  # (M, R)
+
+    @torch.no_grad()
+    def label_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
+        """
+        Same interface as :meth:`RetinaNet.label_anchors`, but implemented with FCOS
+        anchor matching rule.
+
+        Unlike RetinaNet, there are no ignored anchors.
+        """
+
+        gt_labels, matched_gt_boxes = [], []
+
+        for inst in gt_instances:
+            if len(inst) > 0:
+                match_quality_matrix = self._match_anchors(inst.gt_boxes, anchors)
+
+                # Find matched ground-truth box per anchor. Un-matched anchors are
+                # assigned -1. This is equivalent to using an anchor matcher as used
+                # in R-CNN/RetinaNet: `Matcher(thresholds=[1e-5], labels=[0, 1])`
+                match_quality, matched_idxs = match_quality_matrix.max(dim=0)
+                matched_idxs[match_quality < 1e-5] = -1
+
+                matched_gt_boxes_i = inst.gt_boxes.tensor[matched_idxs.clip(min=0)]
+                gt_labels_i = inst.gt_classes[matched_idxs.clip(min=0)]
+
+                # Anchors with matched_idxs = -1 are labeled background.
+                gt_labels_i[matched_idxs < 0] = self.num_classes
+            else:
+                matched_gt_boxes_i = torch.zeros_like(Boxes.cat(anchors).tensor)
+                gt_labels_i = torch.full(
+                    (len(matched_gt_boxes_i),),
+                    fill_value=self.num_classes,
+                    dtype=torch.long,
+                    device=matched_gt_boxes_i.device,
+                )
+
+            gt_labels.append(gt_labels_i)
+            matched_gt_boxes.append(matched_gt_boxes_i)
+
+        return gt_labels, matched_gt_boxes
+
+    def losses(
+        self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
+    ):
+        """
+        This method is almost identical to :meth:`RetinaNet.losses`, with an extra
+        "loss_centerness" in the returned dict.
+        """
+        num_images = len(gt_labels)
+        gt_labels = torch.stack(gt_labels)  # (M, R)
+
+        pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
+        num_pos_anchors = pos_mask.sum().item()
+        get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
+        normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 300)
+
+        # classification and regression loss
+        gt_labels_target = F.one_hot(gt_labels, num_classes=self.num_classes + 1)[
+            :, :, :-1
+        ]  # no loss for the last (background) class
+        loss_cls = sigmoid_focal_loss_jit(
+            torch.cat(pred_logits, dim=1),
+            gt_labels_target.to(pred_logits[0].dtype),
+            alpha=self.focal_loss_alpha,
+            gamma=self.focal_loss_gamma,
+            reduction="sum",
+        )
+
+        loss_box_reg = _dense_box_regression_loss(
+            anchors,
+            self.box2box_transform,
+            pred_anchor_deltas,
+            gt_boxes,
+            pos_mask,
+            box_reg_loss_type="giou",
+        )
+
+        ctrness_targets = self.compute_ctrness_targets(anchors, gt_boxes)  # (M, R)
+        pred_centerness = torch.cat(pred_centerness, dim=1).squeeze(dim=2)  # (M, R)
+        ctrness_loss = F.binary_cross_entropy_with_logits(
+            pred_centerness[pos_mask], ctrness_targets[pos_mask], reduction="sum"
+        )
+        return {
+            "loss_fcos_cls": loss_cls / normalizer,
+            "loss_fcos_loc": loss_box_reg / normalizer,
+            "loss_fcos_ctr": ctrness_loss / normalizer,
+        }
+
+    def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]):
+        anchors = Boxes.cat(anchors).tensor  # Rx4
+        reg_targets = [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes]
+        reg_targets = torch.stack(reg_targets, dim=0)  # NxRx4
+        if len(reg_targets) == 0:
+            return reg_targets.new_zeros(len(reg_targets))
+        left_right = reg_targets[:, :, [0, 2]]
+        top_bottom = reg_targets[:, :, [1, 3]]
+        ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
+            top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]
+        )
+        return torch.sqrt(ctrness)
+
+    def forward_inference(
+        self,
+        images: ImageList,
+        features: List[torch.Tensor],
+        predictions: List[List[torch.Tensor]],
+    ):
+        pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
+            predictions, [self.num_classes, 4, 1]
+        )
+        anchors = self.anchor_generator(features)
+
+        results: List[Instances] = []
+        for img_idx, image_size in enumerate(images.image_sizes):
+            scores_per_image = [
+                # Multiply and sqrt centerness & classification scores
+                # (See eqn. 4 in https://arxiv.org/abs/2006.09214)
+                torch.sqrt(x[img_idx].sigmoid_() * y[img_idx].sigmoid_())
+                for x, y in zip(pred_logits, pred_centerness)
+            ]
+            deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
+            results_per_image = self.inference_single_image(
+                anchors, scores_per_image, deltas_per_image, image_size
+            )
+            results.append(results_per_image)
+        return results
+
+    def inference_single_image(
+        self,
+        anchors: List[Boxes],
+        box_cls: List[torch.Tensor],
+        box_delta: List[torch.Tensor],
+        image_size: Tuple[int, int],
+    ):
+        """
+        Identical to :meth:`RetinaNet.inference_single_image.
+        """
+        pred = self._decode_multi_level_predictions(
+            anchors,
+            box_cls,
+            box_delta,
+            self.test_score_thresh,
+            self.test_topk_candidates,
+            image_size,
+        )
+        keep = batched_nms(
+            pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
+        )
+        return pred[keep[: self.max_detections_per_image]]
+
+
+class FCOSHead(RetinaNetHead):
+    """
+    The head used in :paper:`fcos`. It adds an additional centerness
+    prediction branch on top of :class:`RetinaNetHead`.
+    """
+
+    def __init__(self, *, input_shape: List[ShapeSpec], conv_dims: List[int], **kwargs):
+        super().__init__(input_shape=input_shape, conv_dims=conv_dims, num_anchors=1, **kwargs)
+        # Unlike original FCOS, we do not add an additional learnable scale layer
+        # because it's found to have no benefits after normalizing regression targets by stride.
+        self._num_features = len(input_shape)
+        self.ctrness = nn.Conv2d(conv_dims[-1], 1, kernel_size=3, stride=1, padding=1)
+        torch.nn.init.normal_(self.ctrness.weight, std=0.01)
+        torch.nn.init.constant_(self.ctrness.bias, 0)
+
+    def forward(self, features):
+        assert len(features) == self._num_features
+        logits = []
+        bbox_reg = []
+        ctrness = []
+        for feature in features:
+            logits.append(self.cls_score(self.cls_subnet(feature)))
+            bbox_feature = self.bbox_subnet(feature)
+            bbox_reg.append(self.bbox_pred(bbox_feature))
+            ctrness.append(self.ctrness(bbox_feature))
+        return logits, bbox_reg, ctrness
diff --git a/detectron2/modeling/meta_arch/panoptic_fpn.py b/detectron2/modeling/meta_arch/panoptic_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b31e1c8dc06913d413ae829426e0625fdd5c2f38
--- /dev/null
+++ b/detectron2/modeling/meta_arch/panoptic_fpn.py
@@ -0,0 +1,269 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import logging
+from typing import Dict, List
+import torch
+from torch import nn
+
+from detectron2.config import configurable
+from detectron2.structures import ImageList
+
+from ..postprocessing import detector_postprocess, sem_seg_postprocess
+from .build import META_ARCH_REGISTRY
+from .rcnn import GeneralizedRCNN
+from .semantic_seg import build_sem_seg_head
+
+__all__ = ["PanopticFPN"]
+
+
+@META_ARCH_REGISTRY.register()
+class PanopticFPN(GeneralizedRCNN):
+    """
+    Implement the paper :paper:`PanopticFPN`.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        sem_seg_head: nn.Module,
+        combine_overlap_thresh: float = 0.5,
+        combine_stuff_area_thresh: float = 4096,
+        combine_instances_score_thresh: float = 0.5,
+        **kwargs,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            sem_seg_head: a module for the semantic segmentation head.
+            combine_overlap_thresh: combine masks into one instances if
+                they have enough overlap
+            combine_stuff_area_thresh: ignore stuff areas smaller than this threshold
+            combine_instances_score_thresh: ignore instances whose score is
+                smaller than this threshold
+
+        Other arguments are the same as :class:`GeneralizedRCNN`.
+        """
+        super().__init__(**kwargs)
+        self.sem_seg_head = sem_seg_head
+        # options when combining instance & semantic outputs
+        self.combine_overlap_thresh = combine_overlap_thresh
+        self.combine_stuff_area_thresh = combine_stuff_area_thresh
+        self.combine_instances_score_thresh = combine_instances_score_thresh
+
+    @classmethod
+    def from_config(cls, cfg):
+        ret = super().from_config(cfg)
+        ret.update(
+            {
+                "combine_overlap_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH,
+                "combine_stuff_area_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT,
+                "combine_instances_score_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH,  # noqa
+            }
+        )
+        ret["sem_seg_head"] = build_sem_seg_head(cfg, ret["backbone"].output_shape())
+        logger = logging.getLogger(__name__)
+        if not cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED:
+            logger.warning(
+                "PANOPTIC_FPN.COMBINED.ENABLED is no longer used. "
+                " model.inference(do_postprocess=) should be used to toggle postprocessing."
+            )
+        if cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT != 1.0:
+            w = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT
+            logger.warning(
+                "PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT should be replaced by weights on each ROI head."
+            )
+
+            def update_weight(x):
+                if isinstance(x, dict):
+                    return {k: v * w for k, v in x.items()}
+                else:
+                    return x * w
+
+            roi_heads = ret["roi_heads"]
+            roi_heads.box_predictor.loss_weight = update_weight(roi_heads.box_predictor.loss_weight)
+            roi_heads.mask_head.loss_weight = update_weight(roi_heads.mask_head.loss_weight)
+        return ret
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+
+                For now, each item in the list is a dict that contains:
+
+                * "image": Tensor, image in (C, H, W) format.
+                * "instances": Instances
+                * "sem_seg": semantic segmentation ground truth.
+                * Other information that's included in the original dicts, such as:
+                  "height", "width" (int): the output resolution of the model, used in inference.
+                  See :meth:`postprocess` for details.
+
+        Returns:
+            list[dict]:
+                each dict has the results for one image. The dict contains the following keys:
+
+                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
+                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
+                * "panoptic_seg": See the return value of
+                  :func:`combine_semantic_and_instance_outputs` for its format.
+        """
+        if not self.training:
+            return self.inference(batched_inputs)
+        images = self.preprocess_image(batched_inputs)
+        features = self.backbone(images.tensor)
+
+        assert "sem_seg" in batched_inputs[0]
+        gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
+        gt_sem_seg = ImageList.from_tensors(
+            gt_sem_seg,
+            self.backbone.size_divisibility,
+            self.sem_seg_head.ignore_value,
+            self.backbone.padding_constraints,
+        ).tensor
+        sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)
+
+        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+        detector_results, detector_losses = self.roi_heads(
+            images, features, proposals, gt_instances
+        )
+
+        losses = sem_seg_losses
+        losses.update(proposal_losses)
+        losses.update(detector_losses)
+        return losses
+
+    def inference(self, batched_inputs: List[Dict[str, torch.Tensor]], do_postprocess: bool = True):
+        """
+        Run inference on the given inputs.
+
+        Args:
+            batched_inputs (list[dict]): same as in :meth:`forward`
+            do_postprocess (bool): whether to apply post-processing on the outputs.
+
+        Returns:
+            When do_postprocess=True, see docs in :meth:`forward`.
+            Otherwise, returns a (list[Instances], list[Tensor]) that contains
+            the raw detector outputs, and raw semantic segmentation outputs.
+        """
+        images = self.preprocess_image(batched_inputs)
+        features = self.backbone(images.tensor)
+        sem_seg_results, sem_seg_losses = self.sem_seg_head(features, None)
+        proposals, _ = self.proposal_generator(images, features, None)
+        detector_results, _ = self.roi_heads(images, features, proposals, None)
+
+        if do_postprocess:
+            processed_results = []
+            for sem_seg_result, detector_result, input_per_image, image_size in zip(
+                sem_seg_results, detector_results, batched_inputs, images.image_sizes
+            ):
+                height = input_per_image.get("height", image_size[0])
+                width = input_per_image.get("width", image_size[1])
+                sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
+                detector_r = detector_postprocess(detector_result, height, width)
+
+                processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})
+
+                panoptic_r = combine_semantic_and_instance_outputs(
+                    detector_r,
+                    sem_seg_r.argmax(dim=0),
+                    self.combine_overlap_thresh,
+                    self.combine_stuff_area_thresh,
+                    self.combine_instances_score_thresh,
+                )
+                processed_results[-1]["panoptic_seg"] = panoptic_r
+            return processed_results
+        else:
+            return detector_results, sem_seg_results
+
+
+def combine_semantic_and_instance_outputs(
+    instance_results,
+    semantic_results,
+    overlap_threshold,
+    stuff_area_thresh,
+    instances_score_thresh,
+):
+    """
+    Implement a simple combining logic following
+    "combine_semantic_and_instance_predictions.py" in panopticapi
+    to produce panoptic segmentation outputs.
+
+    Args:
+        instance_results: output of :func:`detector_postprocess`.
+        semantic_results: an (H, W) tensor, each element is the contiguous semantic
+            category id
+
+    Returns:
+        panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
+        segments_info (list[dict]): Describe each segment in `panoptic_seg`.
+            Each dict contains keys "id", "category_id", "isthing".
+    """
+    panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32)
+
+    # sort instance outputs by scores
+    sorted_inds = torch.argsort(-instance_results.scores)
+
+    current_segment_id = 0
+    segments_info = []
+
+    instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device)
+
+    # Add instances one-by-one, check for overlaps with existing ones
+    for inst_id in sorted_inds:
+        score = instance_results.scores[inst_id].item()
+        if score < instances_score_thresh:
+            break
+        mask = instance_masks[inst_id]  # H,W
+        mask_area = mask.sum().item()
+
+        if mask_area == 0:
+            continue
+
+        intersect = (mask > 0) & (panoptic_seg > 0)
+        intersect_area = intersect.sum().item()
+
+        if intersect_area * 1.0 / mask_area > overlap_threshold:
+            continue
+
+        if intersect_area > 0:
+            mask = mask & (panoptic_seg == 0)
+
+        current_segment_id += 1
+        panoptic_seg[mask] = current_segment_id
+        segments_info.append(
+            {
+                "id": current_segment_id,
+                "isthing": True,
+                "score": score,
+                "category_id": instance_results.pred_classes[inst_id].item(),
+                "instance_id": inst_id.item(),
+            }
+        )
+
+    # Add semantic results to remaining empty areas
+    semantic_labels = torch.unique(semantic_results).cpu().tolist()
+    for semantic_label in semantic_labels:
+        if semantic_label == 0:  # 0 is a special "thing" class
+            continue
+        mask = (semantic_results == semantic_label) & (panoptic_seg == 0)
+        mask_area = mask.sum().item()
+        if mask_area < stuff_area_thresh:
+            continue
+
+        current_segment_id += 1
+        panoptic_seg[mask] = current_segment_id
+        segments_info.append(
+            {
+                "id": current_segment_id,
+                "isthing": False,
+                "category_id": semantic_label,
+                "area": mask_area,
+            }
+        )
+
+    return panoptic_seg, segments_info
diff --git a/detectron2/modeling/meta_arch/rcnn.py b/detectron2/modeling/meta_arch/rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..edcbda553a619c314d6175638b485ee5c791a176
--- /dev/null
+++ b/detectron2/modeling/meta_arch/rcnn.py
@@ -0,0 +1,341 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+from typing import Dict, List, Optional, Tuple
+import torch
+from torch import nn
+
+from detectron2.config import configurable
+from detectron2.data.detection_utils import convert_image_to_rgb
+from detectron2.layers import move_device_like
+from detectron2.structures import ImageList, Instances
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.logger import log_first_n
+
+from ..backbone import Backbone, build_backbone
+from ..postprocessing import detector_postprocess
+from ..proposal_generator import build_proposal_generator
+from ..roi_heads import build_roi_heads
+from .build import META_ARCH_REGISTRY
+
+__all__ = ["GeneralizedRCNN", "ProposalNetwork"]
+
+
+@META_ARCH_REGISTRY.register()
+class GeneralizedRCNN(nn.Module):
+    """
+    Generalized R-CNN. Any models that contains the following three components:
+    1. Per-image feature extraction (aka backbone)
+    2. Region proposal generation
+    3. Per-region feature extraction and prediction
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        backbone: Backbone,
+        proposal_generator: nn.Module,
+        roi_heads: nn.Module,
+        pixel_mean: Tuple[float],
+        pixel_std: Tuple[float],
+        input_format: Optional[str] = None,
+        vis_period: int = 0,
+    ):
+        """
+        Args:
+            backbone: a backbone module, must follow detectron2's backbone interface
+            proposal_generator: a module that generates proposals using backbone features
+            roi_heads: a ROI head that performs per-region computation
+            pixel_mean, pixel_std: list or tuple with #channels element, representing
+                the per-channel mean and std to be used to normalize the input image
+            input_format: describe the meaning of channels of input. Needed by visualization
+            vis_period: the period to run visualization. Set to 0 to disable.
+        """
+        super().__init__()
+        self.backbone = backbone
+        self.proposal_generator = proposal_generator
+        self.roi_heads = roi_heads
+
+        self.input_format = input_format
+        self.vis_period = vis_period
+        if vis_period > 0:
+            assert input_format is not None, "input_format is required for visualization!"
+
+        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
+        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
+        assert (
+            self.pixel_mean.shape == self.pixel_std.shape
+        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
+
+    @classmethod
+    def from_config(cls, cfg):
+        backbone = build_backbone(cfg)
+        return {
+            "backbone": backbone,
+            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
+            "roi_heads": build_roi_heads(cfg, backbone.output_shape()),
+            "input_format": cfg.INPUT.FORMAT,
+            "vis_period": cfg.VIS_PERIOD,
+            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
+            "pixel_std": cfg.MODEL.PIXEL_STD,
+        }
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def _move_to_current_device(self, x):
+        return move_device_like(x, self.pixel_mean)
+
+    def visualize_training(self, batched_inputs, proposals):
+        """
+        A function used to visualize images and proposals. It shows ground truth
+        bounding boxes on the original image and up to 20 top-scoring predicted
+        object proposals on the original image. Users can implement different
+        visualization functions for different models.
+
+        Args:
+            batched_inputs (list): a list that contains input to the model.
+            proposals (list): a list that contains predicted proposals. Both
+                batched_inputs and proposals should have the same length.
+        """
+        from detectron2.utils.visualizer import Visualizer
+
+        storage = get_event_storage()
+        max_vis_prop = 20
+
+        for input, prop in zip(batched_inputs, proposals):
+            img = input["image"]
+            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
+            v_gt = Visualizer(img, None)
+            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
+            anno_img = v_gt.get_image()
+            box_size = min(len(prop.proposal_boxes), max_vis_prop)
+            v_pred = Visualizer(img, None)
+            v_pred = v_pred.overlay_instances(
+                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
+            )
+            prop_img = v_pred.get_image()
+            vis_img = np.concatenate((anno_img, prop_img), axis=1)
+            vis_img = vis_img.transpose(2, 0, 1)
+            vis_name = "Left: GT bounding boxes;  Right: Predicted proposals"
+            storage.put_image(vis_name, vis_img)
+            break  # only visualize one image in a batch
+
+    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
+                Each item in the list contains the inputs for one image.
+                For now, each item in the list is a dict that contains:
+
+                * image: Tensor, image in (C, H, W) format.
+                * instances (optional): groundtruth :class:`Instances`
+                * proposals (optional): :class:`Instances`, precomputed proposals.
+
+                Other information that's included in the original dicts, such as:
+
+                * "height", "width" (int): the output resolution of the model, used in inference.
+                  See :meth:`postprocess` for details.
+
+        Returns:
+            list[dict]:
+                Each dict is the output for one input image.
+                The dict contains one key "instances" whose value is a :class:`Instances`.
+                The :class:`Instances` object has the following keys:
+                "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
+        """
+        if not self.training:
+            return self.inference(batched_inputs)
+
+        images = self.preprocess_image(batched_inputs)
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        features = self.backbone(images.tensor)
+
+        if self.proposal_generator is not None:
+            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+        else:
+            assert "proposals" in batched_inputs[0]
+            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+            proposal_losses = {}
+
+        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
+        if self.vis_period > 0:
+            storage = get_event_storage()
+            if storage.iter % self.vis_period == 0:
+                self.visualize_training(batched_inputs, proposals)
+
+        losses = {}
+        losses.update(detector_losses)
+        losses.update(proposal_losses)
+        return losses
+
+    def inference(
+        self,
+        batched_inputs: List[Dict[str, torch.Tensor]],
+        detected_instances: Optional[List[Instances]] = None,
+        do_postprocess: bool = True,
+    ):
+        """
+        Run inference on the given inputs.
+
+        Args:
+            batched_inputs (list[dict]): same as in :meth:`forward`
+            detected_instances (None or list[Instances]): if not None, it
+                contains an `Instances` object per image. The `Instances`
+                object contains "pred_boxes" and "pred_classes" which are
+                known boxes in the image.
+                The inference will then skip the detection of bounding boxes,
+                and only predict other per-ROI outputs.
+            do_postprocess (bool): whether to apply post-processing on the outputs.
+
+        Returns:
+            When do_postprocess=True, same as in :meth:`forward`.
+            Otherwise, a list[Instances] containing raw network outputs.
+        """
+        assert not self.training
+
+        images = self.preprocess_image(batched_inputs)
+        features = self.backbone(images.tensor)
+
+        if detected_instances is None:
+            if self.proposal_generator is not None:
+                proposals, _ = self.proposal_generator(images, features, None)
+            else:
+                assert "proposals" in batched_inputs[0]
+                proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+
+            results, _ = self.roi_heads(images, features, proposals, None)
+        else:
+            detected_instances = [x.to(self.device) for x in detected_instances]
+            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
+
+        if do_postprocess:
+            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
+            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
+        return results
+
+    def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(
+            images,
+            self.backbone.size_divisibility,
+            padding_constraints=self.backbone.padding_constraints,
+        )
+        return images
+
+    @staticmethod
+    def _postprocess(instances, batched_inputs: List[Dict[str, torch.Tensor]], image_sizes):
+        """
+        Rescale the output instances to the target size.
+        """
+        # note: private function; subject to changes
+        processed_results = []
+        for results_per_image, input_per_image, image_size in zip(
+            instances, batched_inputs, image_sizes
+        ):
+            height = input_per_image.get("height", image_size[0])
+            width = input_per_image.get("width", image_size[1])
+            r = detector_postprocess(results_per_image, height, width)
+            processed_results.append({"instances": r})
+        return processed_results
+
+
+@META_ARCH_REGISTRY.register()
+class ProposalNetwork(nn.Module):
+    """
+    A meta architecture that only predicts object proposals.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        backbone: Backbone,
+        proposal_generator: nn.Module,
+        pixel_mean: Tuple[float],
+        pixel_std: Tuple[float],
+    ):
+        """
+        Args:
+            backbone: a backbone module, must follow detectron2's backbone interface
+            proposal_generator: a module that generates proposals using backbone features
+            pixel_mean, pixel_std: list or tuple with #channels element, representing
+                the per-channel mean and std to be used to normalize the input image
+        """
+        super().__init__()
+        self.backbone = backbone
+        self.proposal_generator = proposal_generator
+        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
+        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
+
+    @classmethod
+    def from_config(cls, cfg):
+        backbone = build_backbone(cfg)
+        return {
+            "backbone": backbone,
+            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
+            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
+            "pixel_std": cfg.MODEL.PIXEL_STD,
+        }
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def _move_to_current_device(self, x):
+        return move_device_like(x, self.pixel_mean)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            Same as in :class:`GeneralizedRCNN.forward`
+
+        Returns:
+            list[dict]:
+                Each dict is the output for one input image.
+                The dict contains one key "proposals" whose value is a
+                :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
+        """
+        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(
+            images,
+            self.backbone.size_divisibility,
+            padding_constraints=self.backbone.padding_constraints,
+        )
+        features = self.backbone(images.tensor)
+
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        elif "targets" in batched_inputs[0]:
+            log_first_n(
+                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
+            )
+            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+        # In training, the proposals are not useful at all but we generate them anyway.
+        # This makes RPN-only models about 5% slower.
+        if self.training:
+            return proposal_losses
+
+        processed_results = []
+        for results_per_image, input_per_image, image_size in zip(
+            proposals, batched_inputs, images.image_sizes
+        ):
+            height = input_per_image.get("height", image_size[0])
+            width = input_per_image.get("width", image_size[1])
+            r = detector_postprocess(results_per_image, height, width)
+            processed_results.append({"proposals": r})
+        return processed_results
diff --git a/detectron2/modeling/meta_arch/retinanet.py b/detectron2/modeling/meta_arch/retinanet.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd72a8e7fb57bebcdca64c7bc43b8f0f03118bed
--- /dev/null
+++ b/detectron2/modeling/meta_arch/retinanet.py
@@ -0,0 +1,439 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import math
+from typing import List, Tuple
+import torch
+from fvcore.nn import sigmoid_focal_loss_jit
+from torch import Tensor, nn
+from torch.nn import functional as F
+
+from detectron2.config import configurable
+from detectron2.layers import CycleBatchNormList, ShapeSpec, batched_nms, cat, get_norm
+from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+
+from ..anchor_generator import build_anchor_generator
+from ..backbone import Backbone, build_backbone
+from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
+from ..matcher import Matcher
+from .build import META_ARCH_REGISTRY
+from .dense_detector import DenseDetector, permute_to_N_HWA_K  # noqa
+
+__all__ = ["RetinaNet"]
+
+
+logger = logging.getLogger(__name__)
+
+
+@META_ARCH_REGISTRY.register()
+class RetinaNet(DenseDetector):
+    """
+    Implement RetinaNet in :paper:`RetinaNet`.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        backbone: Backbone,
+        head: nn.Module,
+        head_in_features,
+        anchor_generator,
+        box2box_transform,
+        anchor_matcher,
+        num_classes,
+        focal_loss_alpha=0.25,
+        focal_loss_gamma=2.0,
+        smooth_l1_beta=0.0,
+        box_reg_loss_type="smooth_l1",
+        test_score_thresh=0.05,
+        test_topk_candidates=1000,
+        test_nms_thresh=0.5,
+        max_detections_per_image=100,
+        pixel_mean,
+        pixel_std,
+        vis_period=0,
+        input_format="BGR",
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            backbone: a backbone module, must follow detectron2's backbone interface
+            head (nn.Module): a module that predicts logits and regression deltas
+                for each level from a list of per-level features
+            head_in_features (Tuple[str]): Names of the input feature maps to be used in head
+            anchor_generator (nn.Module): a module that creates anchors from a
+                list of features. Usually an instance of :class:`AnchorGenerator`
+            box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to
+                instance boxes
+            anchor_matcher (Matcher): label the anchors by matching them with ground truth.
+            num_classes (int): number of classes. Used to label background proposals.
+
+            # Loss parameters:
+            focal_loss_alpha (float): focal_loss_alpha
+            focal_loss_gamma (float): focal_loss_gamma
+            smooth_l1_beta (float): smooth_l1_beta
+            box_reg_loss_type (str): Options are "smooth_l1", "giou", "diou", "ciou"
+
+            # Inference parameters:
+            test_score_thresh (float): Inference cls score threshold, only anchors with
+                score > INFERENCE_TH are considered for inference (to improve speed)
+            test_topk_candidates (int): Select topk candidates before NMS
+            test_nms_thresh (float): Overlap threshold used for non-maximum suppression
+                (suppress boxes with IoU >= this threshold)
+            max_detections_per_image (int):
+                Maximum number of detections to return per image during inference
+                (100 is based on the limit established for the COCO dataset).
+
+            pixel_mean, pixel_std: see :class:`DenseDetector`.
+        """
+        super().__init__(
+            backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
+        )
+        self.num_classes = num_classes
+
+        # Anchors
+        self.anchor_generator = anchor_generator
+        self.box2box_transform = box2box_transform
+        self.anchor_matcher = anchor_matcher
+
+        # Loss parameters:
+        self.focal_loss_alpha = focal_loss_alpha
+        self.focal_loss_gamma = focal_loss_gamma
+        self.smooth_l1_beta = smooth_l1_beta
+        self.box_reg_loss_type = box_reg_loss_type
+        # Inference parameters:
+        self.test_score_thresh = test_score_thresh
+        self.test_topk_candidates = test_topk_candidates
+        self.test_nms_thresh = test_nms_thresh
+        self.max_detections_per_image = max_detections_per_image
+        # Vis parameters
+        self.vis_period = vis_period
+        self.input_format = input_format
+
+    @classmethod
+    def from_config(cls, cfg):
+        backbone = build_backbone(cfg)
+        backbone_shape = backbone.output_shape()
+        feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES]
+        head = RetinaNetHead(cfg, feature_shapes)
+        anchor_generator = build_anchor_generator(cfg, feature_shapes)
+        return {
+            "backbone": backbone,
+            "head": head,
+            "anchor_generator": anchor_generator,
+            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS),
+            "anchor_matcher": Matcher(
+                cfg.MODEL.RETINANET.IOU_THRESHOLDS,
+                cfg.MODEL.RETINANET.IOU_LABELS,
+                allow_low_quality_matches=True,
+            ),
+            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
+            "pixel_std": cfg.MODEL.PIXEL_STD,
+            "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
+            "head_in_features": cfg.MODEL.RETINANET.IN_FEATURES,
+            # Loss parameters:
+            "focal_loss_alpha": cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA,
+            "focal_loss_gamma": cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA,
+            "smooth_l1_beta": cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA,
+            "box_reg_loss_type": cfg.MODEL.RETINANET.BBOX_REG_LOSS_TYPE,
+            # Inference parameters:
+            "test_score_thresh": cfg.MODEL.RETINANET.SCORE_THRESH_TEST,
+            "test_topk_candidates": cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST,
+            "test_nms_thresh": cfg.MODEL.RETINANET.NMS_THRESH_TEST,
+            "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
+            # Vis parameters
+            "vis_period": cfg.VIS_PERIOD,
+            "input_format": cfg.INPUT.FORMAT,
+        }
+
+    def forward_training(self, images, features, predictions, gt_instances):
+        # Transpose the Hi*Wi*A dimension to the middle:
+        pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
+            predictions, [self.num_classes, 4]
+        )
+        anchors = self.anchor_generator(features)
+        gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
+        return self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes)
+
+    def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
+        """
+        Args:
+            anchors (list[Boxes]): a list of #feature level Boxes
+            gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
+                Their shapes are (N, R) and (N, R, 4), respectively, where R is
+                the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
+            pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
+                list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
+                Where K is the number of classes used in `pred_logits`.
+
+        Returns:
+            dict[str, Tensor]:
+                mapping from a named loss to a scalar tensor storing the loss.
+                Used during training only. The dict keys are: "loss_cls" and "loss_box_reg"
+        """
+        num_images = len(gt_labels)
+        gt_labels = torch.stack(gt_labels)  # (N, R)
+
+        valid_mask = gt_labels >= 0
+        pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
+        num_pos_anchors = pos_mask.sum().item()
+        get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
+        normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 100)
+
+        # classification and regression loss
+        gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
+            :, :-1
+        ]  # no loss for the last (background) class
+        loss_cls = sigmoid_focal_loss_jit(
+            cat(pred_logits, dim=1)[valid_mask],
+            gt_labels_target.to(pred_logits[0].dtype),
+            alpha=self.focal_loss_alpha,
+            gamma=self.focal_loss_gamma,
+            reduction="sum",
+        )
+
+        loss_box_reg = _dense_box_regression_loss(
+            anchors,
+            self.box2box_transform,
+            pred_anchor_deltas,
+            gt_boxes,
+            pos_mask,
+            box_reg_loss_type=self.box_reg_loss_type,
+            smooth_l1_beta=self.smooth_l1_beta,
+        )
+
+        return {
+            "loss_cls": loss_cls / normalizer,
+            "loss_box_reg": loss_box_reg / normalizer,
+        }
+
+    @torch.no_grad()
+    def label_anchors(self, anchors, gt_instances):
+        """
+        Args:
+            anchors (list[Boxes]): A list of #feature level Boxes.
+                The Boxes contains anchors of this image on the specific feature level.
+            gt_instances (list[Instances]): a list of N `Instances`s. The i-th
+                `Instances` contains the ground-truth per-instance annotations
+                for the i-th input image.
+
+        Returns:
+            list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is
+            the total number of anchors across all feature maps (sum(Hi * Wi * A)).
+            Label values are in {-1, 0, ..., K}, with -1 means ignore, and K means background.
+
+            list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of anchors
+            across feature maps. The values are the matched gt boxes for each anchor.
+            Values are undefined for those anchors not labeled as foreground.
+        """
+        anchors = Boxes.cat(anchors)  # Rx4
+
+        gt_labels = []
+        matched_gt_boxes = []
+        for gt_per_image in gt_instances:
+            match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
+            matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix)
+            del match_quality_matrix
+
+            if len(gt_per_image) > 0:
+                matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]
+
+                gt_labels_i = gt_per_image.gt_classes[matched_idxs]
+                # Anchors with label 0 are treated as background.
+                gt_labels_i[anchor_labels == 0] = self.num_classes
+                # Anchors with label -1 are ignored.
+                gt_labels_i[anchor_labels == -1] = -1
+            else:
+                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
+                gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes
+
+            gt_labels.append(gt_labels_i)
+            matched_gt_boxes.append(matched_gt_boxes_i)
+
+        return gt_labels, matched_gt_boxes
+
+    def forward_inference(
+        self, images: ImageList, features: List[Tensor], predictions: List[List[Tensor]]
+    ):
+        pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
+            predictions, [self.num_classes, 4]
+        )
+        anchors = self.anchor_generator(features)
+
+        results: List[Instances] = []
+        for img_idx, image_size in enumerate(images.image_sizes):
+            scores_per_image = [x[img_idx].sigmoid_() for x in pred_logits]
+            deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
+            results_per_image = self.inference_single_image(
+                anchors, scores_per_image, deltas_per_image, image_size
+            )
+            results.append(results_per_image)
+        return results
+
+    def inference_single_image(
+        self,
+        anchors: List[Boxes],
+        box_cls: List[Tensor],
+        box_delta: List[Tensor],
+        image_size: Tuple[int, int],
+    ):
+        """
+        Single-image inference. Return bounding-box detection results by thresholding
+        on scores and applying non-maximum suppression (NMS).
+
+        Arguments:
+            anchors (list[Boxes]): list of #feature levels. Each entry contains
+                a Boxes object, which contains all the anchors in that feature level.
+            box_cls (list[Tensor]): list of #feature levels. Each entry contains
+                tensor of size (H x W x A, K)
+            box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
+            image_size (tuple(H, W)): a tuple of the image height and width.
+
+        Returns:
+            Same as `inference`, but for only one image.
+        """
+        pred = self._decode_multi_level_predictions(
+            anchors,
+            box_cls,
+            box_delta,
+            self.test_score_thresh,
+            self.test_topk_candidates,
+            image_size,
+        )
+        keep = batched_nms(  # per-class NMS
+            pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
+        )
+        return pred[keep[: self.max_detections_per_image]]
+
+
+class RetinaNetHead(nn.Module):
+    """
+    The head used in RetinaNet for object classification and box regression.
+    It has two subnets for the two tasks, with a common structure but separate parameters.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        input_shape: List[ShapeSpec],
+        num_classes,
+        num_anchors,
+        conv_dims: List[int],
+        norm="",
+        prior_prob=0.01,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape (List[ShapeSpec]): input shape
+            num_classes (int): number of classes. Used to label background proposals.
+            num_anchors (int): number of generated anchors
+            conv_dims (List[int]): dimensions for each convolution layer
+            norm (str or callable):
+                Normalization for conv layers except for the two output layers.
+                See :func:`detectron2.layers.get_norm` for supported types.
+            prior_prob (float): Prior weight for computing bias
+        """
+        super().__init__()
+
+        self._num_features = len(input_shape)
+        if norm == "BN" or norm == "SyncBN":
+            logger.info(
+                f"Using domain-specific {norm} in RetinaNetHead with len={self._num_features}."
+            )
+            bn_class = nn.BatchNorm2d if norm == "BN" else nn.SyncBatchNorm
+
+            def norm(c):
+                return CycleBatchNormList(
+                    length=self._num_features, bn_class=bn_class, num_features=c
+                )
+
+        else:
+            norm_name = str(type(get_norm(norm, 32)))
+            if "BN" in norm_name:
+                logger.warning(
+                    f"Shared BatchNorm (type={norm_name}) may not work well in RetinaNetHead."
+                )
+
+        cls_subnet = []
+        bbox_subnet = []
+        for in_channels, out_channels in zip(
+            [input_shape[0].channels] + list(conv_dims), conv_dims
+        ):
+            cls_subnet.append(
+                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+            )
+            if norm:
+                cls_subnet.append(get_norm(norm, out_channels))
+            cls_subnet.append(nn.ReLU())
+            bbox_subnet.append(
+                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+            )
+            if norm:
+                bbox_subnet.append(get_norm(norm, out_channels))
+            bbox_subnet.append(nn.ReLU())
+
+        self.cls_subnet = nn.Sequential(*cls_subnet)
+        self.bbox_subnet = nn.Sequential(*bbox_subnet)
+        self.cls_score = nn.Conv2d(
+            conv_dims[-1], num_anchors * num_classes, kernel_size=3, stride=1, padding=1
+        )
+        self.bbox_pred = nn.Conv2d(
+            conv_dims[-1], num_anchors * 4, kernel_size=3, stride=1, padding=1
+        )
+
+        # Initialization
+        for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
+            for layer in modules.modules():
+                if isinstance(layer, nn.Conv2d):
+                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
+                    torch.nn.init.constant_(layer.bias, 0)
+
+        # Use prior in model initialization to improve stability
+        bias_value = -(math.log((1 - prior_prob) / prior_prob))
+        torch.nn.init.constant_(self.cls_score.bias, bias_value)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
+        num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
+        assert (
+            len(set(num_anchors)) == 1
+        ), "Using different number of anchors between levels is not currently supported!"
+        num_anchors = num_anchors[0]
+
+        return {
+            "input_shape": input_shape,
+            "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
+            "conv_dims": [input_shape[0].channels] * cfg.MODEL.RETINANET.NUM_CONVS,
+            "prior_prob": cfg.MODEL.RETINANET.PRIOR_PROB,
+            "norm": cfg.MODEL.RETINANET.NORM,
+            "num_anchors": num_anchors,
+        }
+
+    def forward(self, features: List[Tensor]):
+        """
+        Arguments:
+            features (list[Tensor]): FPN feature map tensors in high to low resolution.
+                Each tensor in the list correspond to different feature levels.
+
+        Returns:
+            logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
+                The tensor predicts the classification probability
+                at each spatial position for each of the A anchors and K object
+                classes.
+            bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
+                The tensor predicts 4-vector (dx,dy,dw,dh) box
+                regression values for every anchor. These values are the
+                relative offset between the anchor and the ground truth box.
+        """
+        assert len(features) == self._num_features
+        logits = []
+        bbox_reg = []
+        for feature in features:
+            logits.append(self.cls_score(self.cls_subnet(feature)))
+            bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
+        return logits, bbox_reg
diff --git a/detectron2/modeling/meta_arch/semantic_seg.py b/detectron2/modeling/meta_arch/semantic_seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..fefbecfb4f9ca84c4cf62c246cdcbf946016f0e6
--- /dev/null
+++ b/detectron2/modeling/meta_arch/semantic_seg.py
@@ -0,0 +1,267 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from typing import Callable, Dict, Optional, Tuple, Union
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.structures import ImageList
+from detectron2.utils.registry import Registry
+
+from ..backbone import Backbone, build_backbone
+from ..postprocessing import sem_seg_postprocess
+from .build import META_ARCH_REGISTRY
+
+__all__ = [
+    "SemanticSegmentor",
+    "SEM_SEG_HEADS_REGISTRY",
+    "SemSegFPNHead",
+    "build_sem_seg_head",
+]
+
+
+SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS")
+SEM_SEG_HEADS_REGISTRY.__doc__ = """
+Registry for semantic segmentation heads, which make semantic segmentation predictions
+from feature maps.
+"""
+
+
+@META_ARCH_REGISTRY.register()
+class SemanticSegmentor(nn.Module):
+    """
+    Main class for semantic segmentation architectures.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        backbone: Backbone,
+        sem_seg_head: nn.Module,
+        pixel_mean: Tuple[float],
+        pixel_std: Tuple[float],
+    ):
+        """
+        Args:
+            backbone: a backbone module, must follow detectron2's backbone interface
+            sem_seg_head: a module that predicts semantic segmentation from backbone features
+            pixel_mean, pixel_std: list or tuple with #channels element, representing
+                the per-channel mean and std to be used to normalize the input image
+        """
+        super().__init__()
+        self.backbone = backbone
+        self.sem_seg_head = sem_seg_head
+        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
+        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
+
+    @classmethod
+    def from_config(cls, cfg):
+        backbone = build_backbone(cfg)
+        sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
+        return {
+            "backbone": backbone,
+            "sem_seg_head": sem_seg_head,
+            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
+            "pixel_std": cfg.MODEL.PIXEL_STD,
+        }
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+
+                For now, each item in the list is a dict that contains:
+
+                   * "image": Tensor, image in (C, H, W) format.
+                   * "sem_seg": semantic segmentation ground truth
+                   * Other information that's included in the original dicts, such as:
+                     "height", "width" (int): the output resolution of the model (may be different
+                     from input resolution), used in inference.
+
+
+        Returns:
+            list[dict]:
+              Each dict is the output for one input image.
+              The dict contains one key "sem_seg" whose value is a
+              Tensor that represents the
+              per-pixel segmentation prediced by the head.
+              The prediction has shape KxHxW that represents the logits of
+              each class for each pixel.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(
+            images,
+            self.backbone.size_divisibility,
+            padding_constraints=self.backbone.padding_constraints,
+        )
+
+        features = self.backbone(images.tensor)
+
+        if "sem_seg" in batched_inputs[0]:
+            targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
+            targets = ImageList.from_tensors(
+                targets,
+                self.backbone.size_divisibility,
+                self.sem_seg_head.ignore_value,
+                self.backbone.padding_constraints,
+            ).tensor
+        else:
+            targets = None
+        results, losses = self.sem_seg_head(features, targets)
+
+        if self.training:
+            return losses
+
+        processed_results = []
+        for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes):
+            height = input_per_image.get("height", image_size[0])
+            width = input_per_image.get("width", image_size[1])
+            r = sem_seg_postprocess(result, image_size, height, width)
+            processed_results.append({"sem_seg": r})
+        return processed_results
+
+
+def build_sem_seg_head(cfg, input_shape):
+    """
+    Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`.
+    """
+    name = cfg.MODEL.SEM_SEG_HEAD.NAME
+    return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
+
+
+@SEM_SEG_HEADS_REGISTRY.register()
+class SemSegFPNHead(nn.Module):
+    """
+    A semantic segmentation head described in :paper:`PanopticFPN`.
+    It takes a list of FPN features as input, and applies a sequence of
+    3x3 convs and upsampling to scale all of them to the stride defined by
+    ``common_stride``. Then these features are added and used to make final
+    predictions by another 1x1 conv layer.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        input_shape: Dict[str, ShapeSpec],
+        *,
+        num_classes: int,
+        conv_dims: int,
+        common_stride: int,
+        loss_weight: float = 1.0,
+        norm: Optional[Union[str, Callable]] = None,
+        ignore_value: int = -1,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            num_classes: number of classes to predict
+            conv_dims: number of output channels for the intermediate conv layers.
+            common_stride: the common stride that all features will be upscaled to
+            loss_weight: loss weight
+            norm (str or callable): normalization for all conv layers
+            ignore_value: category id to be ignored during training.
+        """
+        super().__init__()
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
+        if not len(input_shape):
+            raise ValueError("SemSegFPNHead(input_shape=) cannot be empty!")
+        self.in_features = [k for k, v in input_shape]
+        feature_strides = [v.stride for k, v in input_shape]
+        feature_channels = [v.channels for k, v in input_shape]
+
+        self.ignore_value = ignore_value
+        self.common_stride = common_stride
+        self.loss_weight = loss_weight
+
+        self.scale_heads = []
+        for in_feature, stride, channels in zip(
+            self.in_features, feature_strides, feature_channels
+        ):
+            head_ops = []
+            head_length = max(1, int(np.log2(stride) - np.log2(self.common_stride)))
+            for k in range(head_length):
+                norm_module = get_norm(norm, conv_dims)
+                conv = Conv2d(
+                    channels if k == 0 else conv_dims,
+                    conv_dims,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=not norm,
+                    norm=norm_module,
+                    activation=F.relu,
+                )
+                weight_init.c2_msra_fill(conv)
+                head_ops.append(conv)
+                if stride != self.common_stride:
+                    head_ops.append(
+                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+                    )
+            self.scale_heads.append(nn.Sequential(*head_ops))
+            self.add_module(in_feature, self.scale_heads[-1])
+        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
+        weight_init.c2_msra_fill(self.predictor)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        return {
+            "input_shape": {
+                k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+            },
+            "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+            "conv_dims": cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM,
+            "common_stride": cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE,
+            "norm": cfg.MODEL.SEM_SEG_HEAD.NORM,
+            "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
+        }
+
+    def forward(self, features, targets=None):
+        """
+        Returns:
+            In training, returns (None, dict of losses)
+            In inference, returns (CxHxW logits, {})
+        """
+        x = self.layers(features)
+        if self.training:
+            return None, self.losses(x, targets)
+        else:
+            x = F.interpolate(
+                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
+            )
+            return x, {}
+
+    def layers(self, features):
+        for i, f in enumerate(self.in_features):
+            if i == 0:
+                x = self.scale_heads[i](features[f])
+            else:
+                x = x + self.scale_heads[i](features[f])
+        x = self.predictor(x)
+        return x
+
+    def losses(self, predictions, targets):
+        predictions = predictions.float()  # https://github.com/pytorch/pytorch/issues/48163
+        predictions = F.interpolate(
+            predictions,
+            scale_factor=self.common_stride,
+            mode="bilinear",
+            align_corners=False,
+        )
+        loss = F.cross_entropy(
+            predictions, targets, reduction="mean", ignore_index=self.ignore_value
+        )
+        losses = {"loss_sem_seg": loss * self.loss_weight}
+        return losses
diff --git a/detectron2/modeling/mmdet_wrapper.py b/detectron2/modeling/mmdet_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..293b3e9faf34c48456cd3fff37b966af9042fe4e
--- /dev/null
+++ b/detectron2/modeling/mmdet_wrapper.py
@@ -0,0 +1,273 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+import numpy as np
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Dict, List, Optional, Tuple, Union
+import torch
+from omegaconf import DictConfig, OmegaConf
+from torch import Tensor, nn
+
+from detectron2.layers import ShapeSpec
+from detectron2.structures import BitMasks, Boxes, ImageList, Instances
+from detectron2.utils.events import get_event_storage
+
+from .backbone import Backbone
+
+logger = logging.getLogger(__name__)
+
+
+def _to_container(cfg):
+    """
+    mmdet will assert the type of dict/list.
+    So convert omegaconf objects to dict/list.
+    """
+    if isinstance(cfg, DictConfig):
+        cfg = OmegaConf.to_container(cfg, resolve=True)
+    from mmcv.utils import ConfigDict
+
+    return ConfigDict(cfg)
+
+
+class MMDetBackbone(Backbone):
+    """
+    Wrapper of mmdetection backbones to use in detectron2.
+
+    mmdet backbones produce list/tuple of tensors, while detectron2 backbones
+    produce a dict of tensors. This class wraps the given backbone to produce
+    output in detectron2's convention, so it can be used in place of detectron2
+    backbones.
+    """
+
+    def __init__(
+        self,
+        backbone: Union[nn.Module, Mapping],
+        neck: Union[nn.Module, Mapping, None] = None,
+        *,
+        output_shapes: List[ShapeSpec],
+        output_names: Optional[List[str]] = None,
+    ):
+        """
+        Args:
+            backbone: either a backbone module or a mmdet config dict that defines a
+                backbone. The backbone takes a 4D image tensor and returns a
+                sequence of tensors.
+            neck: either a backbone module or a mmdet config dict that defines a
+                neck. The neck takes outputs of backbone and returns a
+                sequence of tensors. If None, no neck is used.
+            output_shapes: shape for every output of the backbone (or neck, if given).
+                stride and channels are often needed.
+            output_names: names for every output of the backbone (or neck, if given).
+                By default, will use "out0", "out1", ...
+        """
+        super().__init__()
+        if isinstance(backbone, Mapping):
+            from mmdet.models import build_backbone
+
+            backbone = build_backbone(_to_container(backbone))
+        self.backbone = backbone
+
+        if isinstance(neck, Mapping):
+            from mmdet.models import build_neck
+
+            neck = build_neck(_to_container(neck))
+        self.neck = neck
+
+        # "Neck" weights, if any, are part of neck itself. This is the interface
+        # of mmdet so we follow it. Reference:
+        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
+        logger.info("Initializing mmdet backbone weights...")
+        self.backbone.init_weights()
+        # train() in mmdet modules is non-trivial, and has to be explicitly
+        # called. Reference:
+        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py
+        self.backbone.train()
+        if self.neck is not None:
+            logger.info("Initializing mmdet neck weights ...")
+            if isinstance(self.neck, nn.Sequential):
+                for m in self.neck:
+                    m.init_weights()
+            else:
+                self.neck.init_weights()
+            self.neck.train()
+
+        self._output_shapes = output_shapes
+        if not output_names:
+            output_names = [f"out{i}" for i in range(len(output_shapes))]
+        self._output_names = output_names
+
+    def forward(self, x) -> Dict[str, Tensor]:
+        outs = self.backbone(x)
+        if self.neck is not None:
+            outs = self.neck(outs)
+        assert isinstance(
+            outs, (list, tuple)
+        ), "mmdet backbone should return a list/tuple of tensors!"
+        if len(outs) != len(self._output_shapes):
+            raise ValueError(
+                "Length of output_shapes does not match outputs from the mmdet backbone: "
+                f"{len(outs)} != {len(self._output_shapes)}"
+            )
+        return {k: v for k, v in zip(self._output_names, outs)}
+
+    def output_shape(self) -> Dict[str, ShapeSpec]:
+        return {k: v for k, v in zip(self._output_names, self._output_shapes)}
+
+
+class MMDetDetector(nn.Module):
+    """
+    Wrapper of a mmdetection detector model, for detection and instance segmentation.
+    Input/output formats of this class follow detectron2's convention, so a
+    mmdetection model can be trained and evaluated in detectron2.
+    """
+
+    def __init__(
+        self,
+        detector: Union[nn.Module, Mapping],
+        *,
+        # Default is 32 regardless of model:
+        # https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets
+        size_divisibility=32,
+        pixel_mean: Tuple[float],
+        pixel_std: Tuple[float],
+    ):
+        """
+        Args:
+            detector: a mmdet detector, or a mmdet config dict that defines a detector.
+            size_divisibility: pad input images to multiple of this number
+            pixel_mean: per-channel mean to normalize input image
+            pixel_std: per-channel stddev to normalize input image
+        """
+        super().__init__()
+        if isinstance(detector, Mapping):
+            from mmdet.models import build_detector
+
+            detector = build_detector(_to_container(detector))
+        self.detector = detector
+        self.detector.init_weights()
+        self.size_divisibility = size_divisibility
+
+        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
+        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
+        assert (
+            self.pixel_mean.shape == self.pixel_std.shape
+        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
+
+    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, size_divisibility=self.size_divisibility).tensor
+        metas = []
+        rescale = {"height" in x for x in batched_inputs}
+        if len(rescale) != 1:
+            raise ValueError("Some inputs have original height/width, but some don't!")
+        rescale = list(rescale)[0]
+        output_shapes = []
+        for input in batched_inputs:
+            meta = {}
+            c, h, w = input["image"].shape
+            meta["img_shape"] = meta["ori_shape"] = (h, w, c)
+            if rescale:
+                scale_factor = np.array(
+                    [w / input["width"], h / input["height"]] * 2, dtype="float32"
+                )
+                ori_shape = (input["height"], input["width"])
+                output_shapes.append(ori_shape)
+                meta["ori_shape"] = ori_shape + (c,)
+            else:
+                scale_factor = 1.0
+                output_shapes.append((h, w))
+            meta["scale_factor"] = scale_factor
+            meta["flip"] = False
+            padh, padw = images.shape[-2:]
+            meta["pad_shape"] = (padh, padw, c)
+            metas.append(meta)
+
+        if self.training:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+            if gt_instances[0].has("gt_masks"):
+                from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks
+
+                def convert_mask(m, shape):
+                    # mmdet mask format
+                    if isinstance(m, BitMasks):
+                        return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
+                    else:
+                        return mm_PolygonMasks(m.polygons, shape[0], shape[1])
+
+                gt_masks = [convert_mask(x.gt_masks, x.image_size) for x in gt_instances]
+                losses_and_metrics = self.detector.forward_train(
+                    images,
+                    metas,
+                    [x.gt_boxes.tensor for x in gt_instances],
+                    [x.gt_classes for x in gt_instances],
+                    gt_masks=gt_masks,
+                )
+            else:
+                losses_and_metrics = self.detector.forward_train(
+                    images,
+                    metas,
+                    [x.gt_boxes.tensor for x in gt_instances],
+                    [x.gt_classes for x in gt_instances],
+                )
+            return _parse_losses(losses_and_metrics)
+        else:
+            results = self.detector.simple_test(images, metas, rescale=rescale)
+            results = [
+                {"instances": _convert_mmdet_result(r, shape)}
+                for r, shape in zip(results, output_shapes)
+            ]
+            return results
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+
+# Reference: show_result() in
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
+def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances:
+    if isinstance(result, tuple):
+        bbox_result, segm_result = result
+        if isinstance(segm_result, tuple):
+            segm_result = segm_result[0]
+    else:
+        bbox_result, segm_result = result, None
+
+    bboxes = torch.from_numpy(np.vstack(bbox_result))  # Nx5
+    bboxes, scores = bboxes[:, :4], bboxes[:, -1]
+    labels = [
+        torch.full((bbox.shape[0],), i, dtype=torch.int32) for i, bbox in enumerate(bbox_result)
+    ]
+    labels = torch.cat(labels)
+    inst = Instances(shape)
+    inst.pred_boxes = Boxes(bboxes)
+    inst.scores = scores
+    inst.pred_classes = labels
+
+    if segm_result is not None and len(labels) > 0:
+        segm_result = list(itertools.chain(*segm_result))
+        segm_result = [torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in segm_result]
+        segm_result = torch.stack(segm_result, dim=0)
+        inst.pred_masks = segm_result
+    return inst
+
+
+# reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
+def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    log_vars = OrderedDict()
+    for loss_name, loss_value in losses.items():
+        if isinstance(loss_value, torch.Tensor):
+            log_vars[loss_name] = loss_value.mean()
+        elif isinstance(loss_value, list):
+            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
+        else:
+            raise TypeError(f"{loss_name} is not a tensor or list of tensors")
+
+        if "loss" not in loss_name:
+            # put metrics to storage; don't return them
+            storage = get_event_storage()
+            value = log_vars.pop(loss_name).cpu().item()
+            storage.put_scalar(loss_name, value)
+    return log_vars
diff --git a/detectron2/modeling/poolers.py b/detectron2/modeling/poolers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3393794507c6504bf6ac1bfae7a1c80a0d81725e
--- /dev/null
+++ b/detectron2/modeling/poolers.py
@@ -0,0 +1,263 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import math
+from typing import List, Optional
+import torch
+from torch import nn
+from torchvision.ops import RoIPool
+
+from detectron2.layers import ROIAlign, ROIAlignRotated, cat, nonzero_tuple, shapes_to_tensor
+from detectron2.structures import Boxes
+from detectron2.utils.tracing import assert_fx_safe, is_fx_tracing
+
+"""
+To export ROIPooler to torchscript, in this file, variables that should be annotated with
+`Union[List[Boxes], List[RotatedBoxes]]` are only annotated with `List[Boxes]`.
+
+TODO: Correct these annotations when torchscript support `Union`.
+https://github.com/pytorch/pytorch/issues/41412
+"""
+
+__all__ = ["ROIPooler"]
+
+
+def assign_boxes_to_levels(
+    box_lists: List[Boxes],
+    min_level: int,
+    max_level: int,
+    canonical_box_size: int,
+    canonical_level: int,
+):
+    """
+    Map each box in `box_lists` to a feature map level index and return the assignment
+    vector.
+
+    Args:
+        box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
+            where N is the number of images in the batch.
+        min_level (int): Smallest feature map level index. The input is considered index 0,
+            the output of stage 1 is index 1, and so.
+        max_level (int): Largest feature map level index.
+        canonical_box_size (int): A canonical box size in pixels (sqrt(box area)).
+        canonical_level (int): The feature map level index on which a canonically-sized box
+            should be placed.
+
+    Returns:
+        A tensor of length M, where M is the total number of boxes aggregated over all
+            N batch images. The memory layout corresponds to the concatenation of boxes
+            from all images. Each element is the feature map index, as an offset from
+            `self.min_level`, for the corresponding box (so value i means the box is at
+            `self.min_level + i`).
+    """
+    box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists]))
+    # Eqn.(1) in FPN paper
+    level_assignments = torch.floor(
+        canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8)
+    )
+    # clamp level to (min, max), in case the box size is too large or too small
+    # for the available feature maps
+    level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level)
+    return level_assignments.to(torch.int64) - min_level
+
+
+# script the module to avoid hardcoded device type
+@torch.jit.script_if_tracing
+def _convert_boxes_to_pooler_format(boxes: torch.Tensor, sizes: torch.Tensor) -> torch.Tensor:
+    sizes = sizes.to(device=boxes.device)
+    indices = torch.repeat_interleave(
+        torch.arange(len(sizes), dtype=boxes.dtype, device=boxes.device), sizes
+    )
+    return cat([indices[:, None], boxes], dim=1)
+
+
+def convert_boxes_to_pooler_format(box_lists: List[Boxes]):
+    """
+    Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops
+    (see description under Returns).
+
+    Args:
+        box_lists (list[Boxes] | list[RotatedBoxes]):
+            A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
+
+    Returns:
+        When input is list[Boxes]:
+            A tensor of shape (M, 5), where M is the total number of boxes aggregated over all
+            N batch images.
+            The 5 columns are (batch index, x0, y0, x1, y1), where batch index
+            is the index in [0, N) identifying which batch image the box with corners at
+            (x0, y0, x1, y1) comes from.
+        When input is list[RotatedBoxes]:
+            A tensor of shape (M, 6), where M is the total number of boxes aggregated over all
+            N batch images.
+            The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees),
+            where batch index is the index in [0, N) identifying which batch image the
+            rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from.
+    """
+    boxes = torch.cat([x.tensor for x in box_lists], dim=0)
+    # __len__ returns Tensor in tracing.
+    sizes = shapes_to_tensor([x.__len__() for x in box_lists])
+    return _convert_boxes_to_pooler_format(boxes, sizes)
+
+
+@torch.jit.script_if_tracing
+def _create_zeros(
+    batch_target: Optional[torch.Tensor],
+    channels: int,
+    height: int,
+    width: int,
+    like_tensor: torch.Tensor,
+) -> torch.Tensor:
+    batches = batch_target.shape[0] if batch_target is not None else 0
+    sizes = (batches, channels, height, width)
+    return torch.zeros(sizes, dtype=like_tensor.dtype, device=like_tensor.device)
+
+
+class ROIPooler(nn.Module):
+    """
+    Region of interest feature map pooler that supports pooling from one or more
+    feature maps.
+    """
+
+    def __init__(
+        self,
+        output_size,
+        scales,
+        sampling_ratio,
+        pooler_type,
+        canonical_box_size=224,
+        canonical_level=4,
+    ):
+        """
+        Args:
+            output_size (int, tuple[int] or list[int]): output size of the pooled region,
+                e.g., 14 x 14. If tuple or list is given, the length must be 2.
+            scales (list[float]): The scale for each low-level pooling op relative to
+                the input image. For a feature map with stride s relative to the input
+                image, scale is defined as 1/s. The stride must be power of 2.
+                When there are multiple scales, they must form a pyramid, i.e. they must be
+                a monotically decreasing geometric sequence with a factor of 1/2.
+            sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op.
+            pooler_type (string): Name of the type of pooling operation that should be applied.
+                For instance, "ROIPool" or "ROIAlignV2".
+            canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default
+                is heuristically defined as 224 pixels in the FPN paper (based on ImageNet
+                pre-training).
+            canonical_level (int): The feature map level index from which a canonically-sized box
+                should be placed. The default is defined as level 4 (stride=16) in the FPN paper,
+                i.e., a box of size 224x224 will be placed on the feature with stride=16.
+                The box placement for all boxes will be determined from their sizes w.r.t
+                canonical_box_size. For example, a box whose area is 4x that of a canonical box
+                should be used to pool features from feature level ``canonical_level+1``.
+
+                Note that the actual input feature maps given to this module may not have
+                sufficiently many levels for the input boxes. If the boxes are too large or too
+                small for the input feature maps, the closest level will be used.
+        """
+        super().__init__()
+
+        if isinstance(output_size, int):
+            output_size = (output_size, output_size)
+        assert len(output_size) == 2
+        assert isinstance(output_size[0], int) and isinstance(output_size[1], int)
+        self.output_size = output_size
+
+        if pooler_type == "ROIAlign":
+            self.level_poolers = nn.ModuleList(
+                ROIAlign(
+                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False
+                )
+                for scale in scales
+            )
+        elif pooler_type == "ROIAlignV2":
+            self.level_poolers = nn.ModuleList(
+                ROIAlign(
+                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True
+                )
+                for scale in scales
+            )
+        elif pooler_type == "ROIPool":
+            self.level_poolers = nn.ModuleList(
+                RoIPool(output_size, spatial_scale=scale) for scale in scales
+            )
+        elif pooler_type == "ROIAlignRotated":
+            self.level_poolers = nn.ModuleList(
+                ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio)
+                for scale in scales
+            )
+        else:
+            raise ValueError("Unknown pooler type: {}".format(pooler_type))
+
+        # Map scale (defined as 1 / stride) to its feature map level under the
+        # assumption that stride is a power of 2.
+        min_level = -(math.log2(scales[0]))
+        max_level = -(math.log2(scales[-1]))
+        assert math.isclose(min_level, int(min_level)) and math.isclose(
+            max_level, int(max_level)
+        ), "Featuremap stride is not power of 2!"
+        self.min_level = int(min_level)
+        self.max_level = int(max_level)
+        assert (
+            len(scales) == self.max_level - self.min_level + 1
+        ), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!"
+        assert 0 <= self.min_level and self.min_level <= self.max_level
+        self.canonical_level = canonical_level
+        assert canonical_box_size > 0
+        self.canonical_box_size = canonical_box_size
+
+    def forward(self, x: List[torch.Tensor], box_lists: List[Boxes]):
+        """
+        Args:
+            x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those
+                used to construct this module.
+            box_lists (list[Boxes] | list[RotatedBoxes]):
+                A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
+                The box coordinates are defined on the original image and
+                will be scaled by the `scales` argument of :class:`ROIPooler`.
+
+        Returns:
+            Tensor:
+                A tensor of shape (M, C, output_size, output_size) where M is the total number of
+                boxes aggregated over all N batch images and C is the number of channels in `x`.
+        """
+        num_level_assignments = len(self.level_poolers)
+
+        if not is_fx_tracing():
+            torch._assert(
+                isinstance(x, list) and isinstance(box_lists, list),
+                "Arguments to pooler must be lists",
+            )
+        assert_fx_safe(
+            len(x) == num_level_assignments,
+            "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
+                num_level_assignments, len(x)
+            ),
+        )
+        assert_fx_safe(
+            len(box_lists) == x[0].size(0),
+            "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
+                x[0].size(0), len(box_lists)
+            ),
+        )
+        if len(box_lists) == 0:
+            return _create_zeros(None, x[0].shape[1], *self.output_size, x[0])
+
+        pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists)
+
+        if num_level_assignments == 1:
+            return self.level_poolers[0](x[0], pooler_fmt_boxes)
+
+        level_assignments = assign_boxes_to_levels(
+            box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level
+        )
+
+        num_channels = x[0].shape[1]
+        output_size = self.output_size[0]
+
+        output = _create_zeros(pooler_fmt_boxes, num_channels, output_size, output_size, x[0])
+
+        for level, pooler in enumerate(self.level_poolers):
+            inds = nonzero_tuple(level_assignments == level)[0]
+            pooler_fmt_boxes_level = pooler_fmt_boxes[inds]
+            # Use index_put_ instead of advance indexing, to avoid pytorch/issues/49852
+            output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level))
+
+        return output
diff --git a/detectron2/modeling/postprocessing.py b/detectron2/modeling/postprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..84512606a43d6991df0ae1f046164eb3c70d751a
--- /dev/null
+++ b/detectron2/modeling/postprocessing.py
@@ -0,0 +1,100 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures import Instances, ROIMasks
+
+
+# perhaps should rename to "resize_instance"
+def detector_postprocess(
+    results: Instances, output_height: int, output_width: int, mask_threshold: float = 0.5
+):
+    """
+    Resize the output instances.
+    The input images are often resized when entering an object detector.
+    As a result, we often need the outputs of the detector in a different
+    resolution from its inputs.
+
+    This function will resize the raw outputs of an R-CNN detector
+    to produce outputs according to the desired output resolution.
+
+    Args:
+        results (Instances): the raw outputs from the detector.
+            `results.image_size` contains the input image resolution the detector sees.
+            This object might be modified in-place.
+        output_height, output_width: the desired output resolution.
+    Returns:
+        Instances: the resized output from the model, based on the output resolution
+    """
+    if isinstance(output_width, torch.Tensor):
+        # This shape might (but not necessarily) be tensors during tracing.
+        # Converts integer tensors to float temporaries to ensure true
+        # division is performed when computing scale_x and scale_y.
+        output_width_tmp = output_width.float()
+        output_height_tmp = output_height.float()
+        new_size = torch.stack([output_height, output_width])
+    else:
+        new_size = (output_height, output_width)
+        output_width_tmp = output_width
+        output_height_tmp = output_height
+
+    scale_x, scale_y = (
+        output_width_tmp / results.image_size[1],
+        output_height_tmp / results.image_size[0],
+    )
+    results = Instances(new_size, **results.get_fields())
+
+    if results.has("pred_boxes"):
+        output_boxes = results.pred_boxes
+    elif results.has("proposal_boxes"):
+        output_boxes = results.proposal_boxes
+    else:
+        output_boxes = None
+    assert output_boxes is not None, "Predictions must contain boxes!"
+
+    output_boxes.scale(scale_x, scale_y)
+    output_boxes.clip(results.image_size)
+
+    results = results[output_boxes.nonempty()]
+
+    if results.has("pred_masks"):
+        if isinstance(results.pred_masks, ROIMasks):
+            roi_masks = results.pred_masks
+        else:
+            # pred_masks is a tensor of shape (N, 1, M, M)
+            roi_masks = ROIMasks(results.pred_masks[:, 0, :, :])
+        results.pred_masks = roi_masks.to_bitmasks(
+            results.pred_boxes, output_height, output_width, mask_threshold
+        ).tensor  # TODO return ROIMasks/BitMask object in the future
+
+    if results.has("pred_keypoints"):
+        results.pred_keypoints[:, :, 0] *= scale_x
+        results.pred_keypoints[:, :, 1] *= scale_y
+
+    return results
+
+
+def sem_seg_postprocess(result, img_size, output_height, output_width):
+    """
+    Return semantic segmentation predictions in the original resolution.
+
+    The input images are often resized when entering semantic segmentor. Moreover, in same
+    cases, they also padded inside segmentor to be divisible by maximum network stride.
+    As a result, we often need the predictions of the segmentor in a different
+    resolution from its inputs.
+
+    Args:
+        result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
+            where C is the number of classes, and H, W are the height and width of the prediction.
+        img_size (tuple): image size that segmentor is taking as input.
+        output_height, output_width: the desired output resolution.
+
+    Returns:
+        semantic segmentation prediction (Tensor): A tensor of the shape
+            (C, output_height, output_width) that contains per-pixel soft predictions.
+    """
+    result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
+    result = F.interpolate(
+        result, size=(output_height, output_width), mode="bilinear", align_corners=False
+    )[0]
+    return result
diff --git a/detectron2/modeling/proposal_generator/__init__.py b/detectron2/modeling/proposal_generator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f4e4df7645c67b7a013295207b98fe70b2e574c
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator
+from .rpn import RPN_HEAD_REGISTRY, build_rpn_head, RPN, StandardRPNHead
+
+__all__ = list(globals().keys())
diff --git a/detectron2/modeling/proposal_generator/__pycache__/__init__.cpython-310.pyc b/detectron2/modeling/proposal_generator/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2b421e6f2255eb5b6f9cfbaa6875612a30d735d
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/modeling/proposal_generator/__pycache__/__init__.cpython-39.pyc b/detectron2/modeling/proposal_generator/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae5d21c0bc5c8cce264a2f82723ab4f4169d388d
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/modeling/proposal_generator/__pycache__/build.cpython-310.pyc b/detectron2/modeling/proposal_generator/__pycache__/build.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9070f49b2d38b9c332abcdb3aefc924819b065b
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/build.cpython-310.pyc differ
diff --git a/detectron2/modeling/proposal_generator/__pycache__/build.cpython-39.pyc b/detectron2/modeling/proposal_generator/__pycache__/build.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a1ff701128e088371b000cc014a17e586b63614
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/build.cpython-39.pyc differ
diff --git a/detectron2/modeling/proposal_generator/__pycache__/proposal_utils.cpython-310.pyc b/detectron2/modeling/proposal_generator/__pycache__/proposal_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eab4d29fecc510781eb96b26386796dc686b7d23
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/proposal_utils.cpython-310.pyc differ
diff --git a/detectron2/modeling/proposal_generator/__pycache__/proposal_utils.cpython-39.pyc b/detectron2/modeling/proposal_generator/__pycache__/proposal_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5858978bb8f9e9dc0437f6fb40f9b99267f3ff61
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/proposal_utils.cpython-39.pyc differ
diff --git a/detectron2/modeling/proposal_generator/__pycache__/rpn.cpython-310.pyc b/detectron2/modeling/proposal_generator/__pycache__/rpn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5e7162f2ab540952693be4d886c3b17eec41412
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/rpn.cpython-310.pyc differ
diff --git a/detectron2/modeling/proposal_generator/__pycache__/rpn.cpython-39.pyc b/detectron2/modeling/proposal_generator/__pycache__/rpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a0c38c2725db99688ba7d52d1fdb4c7df44104f
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/rpn.cpython-39.pyc differ
diff --git a/detectron2/modeling/proposal_generator/__pycache__/rrpn.cpython-310.pyc b/detectron2/modeling/proposal_generator/__pycache__/rrpn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a439f5a76bc4b5ef067c8e3f1e871650a90b8dd
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/rrpn.cpython-310.pyc differ
diff --git a/detectron2/modeling/proposal_generator/__pycache__/rrpn.cpython-39.pyc b/detectron2/modeling/proposal_generator/__pycache__/rrpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50abda199d12e82740ee6ce6edea78041076da0a
Binary files /dev/null and b/detectron2/modeling/proposal_generator/__pycache__/rrpn.cpython-39.pyc differ
diff --git a/detectron2/modeling/proposal_generator/build.py b/detectron2/modeling/proposal_generator/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..34eb12d00d94ff905b796e75e2c4c5845257c8e9
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/build.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from detectron2.utils.registry import Registry
+
+PROPOSAL_GENERATOR_REGISTRY = Registry("PROPOSAL_GENERATOR")
+PROPOSAL_GENERATOR_REGISTRY.__doc__ = """
+Registry for proposal generator, which produces object proposals from feature maps.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+The call should return a `nn.Module` object.
+"""
+
+from . import rpn, rrpn  # noqa F401 isort:skip
+
+
+def build_proposal_generator(cfg, input_shape):
+    """
+    Build a proposal generator from `cfg.MODEL.PROPOSAL_GENERATOR.NAME`.
+    The name can be "PrecomputedProposals" to use no proposal generator.
+    """
+    name = cfg.MODEL.PROPOSAL_GENERATOR.NAME
+    if name == "PrecomputedProposals":
+        return None
+
+    return PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape)
diff --git a/detectron2/modeling/proposal_generator/proposal_utils.py b/detectron2/modeling/proposal_generator/proposal_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fdf5dc15c125163c124ab3d04c13bd5b4261588
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/proposal_utils.py
@@ -0,0 +1,205 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import math
+from typing import List, Tuple, Union
+import torch
+
+from detectron2.layers import batched_nms, cat, move_device_like
+from detectron2.structures import Boxes, Instances
+
+logger = logging.getLogger(__name__)
+
+
+def _is_tracing():
+    # (fixed in TORCH_VERSION >= 1.9)
+    if torch.jit.is_scripting():
+        # https://github.com/pytorch/pytorch/issues/47379
+        return False
+    else:
+        return torch.jit.is_tracing()
+
+
+def find_top_rpn_proposals(
+    proposals: List[torch.Tensor],
+    pred_objectness_logits: List[torch.Tensor],
+    image_sizes: List[Tuple[int, int]],
+    nms_thresh: float,
+    pre_nms_topk: int,
+    post_nms_topk: int,
+    min_box_size: float,
+    training: bool,
+):
+    """
+    For each feature map, select the `pre_nms_topk` highest scoring proposals,
+    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
+    highest scoring proposals among all the feature maps for each image.
+
+    Args:
+        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
+            All proposal predictions on the feature maps.
+        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
+        image_sizes (list[tuple]): sizes (h, w) for each image
+        nms_thresh (float): IoU threshold to use for NMS
+        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
+            When RPN is run on multiple feature maps (as in FPN) this number is per
+            feature map.
+        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
+            When RPN is run on multiple feature maps (as in FPN) this number is total,
+            over all feature maps.
+        min_box_size (float): minimum proposal box side length in pixels (absolute units
+            wrt input images).
+        training (bool): True if proposals are to be used in training, otherwise False.
+            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
+            comment.
+
+    Returns:
+        list[Instances]: list of N Instances. The i-th Instances
+            stores post_nms_topk object proposals for image i, sorted by their
+            objectness score in descending order.
+    """
+    num_images = len(image_sizes)
+    device = (
+        proposals[0].device
+        if torch.jit.is_scripting()
+        else ("cpu" if torch.jit.is_tracing() else proposals[0].device)
+    )
+
+    # 1. Select top-k anchor for every level and every image
+    topk_scores = []  # #lvl Tensor, each of shape N x topk
+    topk_proposals = []
+    level_ids = []  # #lvl Tensor, each of shape (topk,)
+    batch_idx = move_device_like(torch.arange(num_images, device=device), proposals[0])
+    for level_id, (proposals_i, logits_i) in enumerate(zip(proposals, pred_objectness_logits)):
+        Hi_Wi_A = logits_i.shape[1]
+        if isinstance(Hi_Wi_A, torch.Tensor):  # it's a tensor in tracing
+            num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk)
+        else:
+            num_proposals_i = min(Hi_Wi_A, pre_nms_topk)
+
+        topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
+
+        # each is N x topk
+        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4
+
+        topk_proposals.append(topk_proposals_i)
+        topk_scores.append(topk_scores_i)
+        level_ids.append(
+            move_device_like(
+                torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device),
+                proposals[0],
+            )
+        )
+
+    # 2. Concat all levels together
+    topk_scores = cat(topk_scores, dim=1)
+    topk_proposals = cat(topk_proposals, dim=1)
+    level_ids = cat(level_ids, dim=0)
+
+    # 3. For each image, run a per-level NMS, and choose topk results.
+    results: List[Instances] = []
+    for n, image_size in enumerate(image_sizes):
+        boxes = Boxes(topk_proposals[n])
+        scores_per_img = topk_scores[n]
+        lvl = level_ids
+
+        valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
+        if not valid_mask.all():
+            if training:
+                raise FloatingPointError(
+                    "Predicted boxes or scores contain Inf/NaN. Training has diverged."
+                )
+            boxes = boxes[valid_mask]
+            scores_per_img = scores_per_img[valid_mask]
+            lvl = lvl[valid_mask]
+        boxes.clip(image_size)
+
+        # filter empty boxes
+        keep = boxes.nonempty(threshold=min_box_size)
+        if _is_tracing() or keep.sum().item() != len(boxes):
+            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]
+
+        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
+        # In Detectron1, there was different behavior during training vs. testing.
+        # (https://github.com/facebookresearch/Detectron/issues/459)
+        # During training, topk is over the proposals from *all* images in the training batch.
+        # During testing, it is over the proposals for each image separately.
+        # As a result, the training behavior becomes batch-dependent,
+        # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size.
+        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
+        keep = keep[:post_nms_topk]  # keep is already sorted
+
+        res = Instances(image_size)
+        res.proposal_boxes = boxes[keep]
+        res.objectness_logits = scores_per_img[keep]
+        results.append(res)
+    return results
+
+
+def add_ground_truth_to_proposals(
+    gt: Union[List[Instances], List[Boxes]], proposals: List[Instances]
+) -> List[Instances]:
+    """
+    Call `add_ground_truth_to_proposals_single_image` for all images.
+
+    Args:
+        gt(Union[List[Instances], List[Boxes]): list of N elements. Element i is a Instances
+            representing the ground-truth for image i.
+        proposals (list[Instances]): list of N elements. Element i is a Instances
+            representing the proposals for image i.
+
+    Returns:
+        list[Instances]: list of N Instances. Each is the proposals for the image,
+            with field "proposal_boxes" and "objectness_logits".
+    """
+    assert gt is not None
+
+    if len(proposals) != len(gt):
+        raise ValueError("proposals and gt should have the same length as the number of images!")
+    if len(proposals) == 0:
+        return proposals
+
+    return [
+        add_ground_truth_to_proposals_single_image(gt_i, proposals_i)
+        for gt_i, proposals_i in zip(gt, proposals)
+    ]
+
+
+def add_ground_truth_to_proposals_single_image(
+    gt: Union[Instances, Boxes], proposals: Instances
+) -> Instances:
+    """
+    Augment `proposals` with `gt`.
+
+    Args:
+        Same as `add_ground_truth_to_proposals`, but with gt and proposals
+        per image.
+
+    Returns:
+        Same as `add_ground_truth_to_proposals`, but for only one image.
+    """
+    if isinstance(gt, Boxes):
+        # convert Boxes to Instances
+        gt = Instances(proposals.image_size, gt_boxes=gt)
+
+    gt_boxes = gt.gt_boxes
+    device = proposals.objectness_logits.device
+    # Assign all ground-truth boxes an objectness logit corresponding to
+    # P(object) = sigmoid(logit) =~ 1.
+    gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10)))
+    gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device)
+
+    # Concatenating gt_boxes with proposals requires them to have the same fields
+    gt_proposal = Instances(proposals.image_size, **gt.get_fields())
+    gt_proposal.proposal_boxes = gt_boxes
+    gt_proposal.objectness_logits = gt_logits
+
+    for key in proposals.get_fields().keys():
+        assert gt_proposal.has(
+            key
+        ), "The attribute '{}' in `proposals` does not exist in `gt`".format(key)
+
+    # NOTE: Instances.cat only use fields from the first item. Extra fields in latter items
+    # will be thrown away.
+    new_proposals = Instances.cat([proposals, gt_proposal])
+
+    return new_proposals
diff --git a/detectron2/modeling/proposal_generator/rpn.py b/detectron2/modeling/proposal_generator/rpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..99cd536d2f9880d2049390c45f73eb22335e1b82
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/rpn.py
@@ -0,0 +1,533 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, cat
+from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.memory import retry_if_cuda_oom
+from detectron2.utils.registry import Registry
+
+from ..anchor_generator import build_anchor_generator
+from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
+from ..matcher import Matcher
+from ..sampling import subsample_labels
+from .build import PROPOSAL_GENERATOR_REGISTRY
+from .proposal_utils import find_top_rpn_proposals
+
+RPN_HEAD_REGISTRY = Registry("RPN_HEAD")
+RPN_HEAD_REGISTRY.__doc__ = """
+Registry for RPN heads, which take feature maps and perform
+objectness classification and bounding box regression for anchors.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+The call should return a `nn.Module` object.
+"""
+
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    L: number of feature maps per image on which RPN is run
+    A: number of cell anchors (must be the same for all feature maps)
+    Hi, Wi: height and width of the i-th feature map
+    B: size of the box parameterization
+
+Naming convention:
+
+    objectness: refers to the binary classification of an anchor as object vs. not object.
+
+    deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
+    transform (see :class:`box_regression.Box2BoxTransform`), or 5d for rotated boxes.
+
+    pred_objectness_logits: predicted objectness scores in [-inf, +inf]; use
+        sigmoid(pred_objectness_logits) to estimate P(object).
+
+    gt_labels: ground-truth binary classification labels for objectness
+
+    pred_anchor_deltas: predicted box2box transform deltas
+
+    gt_anchor_deltas: ground-truth box2box transform deltas
+"""
+
+
+def build_rpn_head(cfg, input_shape):
+    """
+    Build an RPN head defined by `cfg.MODEL.RPN.HEAD_NAME`.
+    """
+    name = cfg.MODEL.RPN.HEAD_NAME
+    return RPN_HEAD_REGISTRY.get(name)(cfg, input_shape)
+
+
+@RPN_HEAD_REGISTRY.register()
+class StandardRPNHead(nn.Module):
+    """
+    Standard RPN classification and regression heads described in :paper:`Faster R-CNN`.
+    Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts
+    objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas
+    specifying how to deform each anchor into an object proposal.
+    """
+
+    @configurable
+    def __init__(
+        self, *, in_channels: int, num_anchors: int, box_dim: int = 4, conv_dims: List[int] = (-1,)
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            in_channels (int): number of input feature channels. When using multiple
+                input features, they must have the same number of channels.
+            num_anchors (int): number of anchors to predict for *each spatial position*
+                on the feature map. The total number of anchors for each
+                feature map will be `num_anchors * H * W`.
+            box_dim (int): dimension of a box, which is also the number of box regression
+                predictions to make for each anchor. An axis aligned box has
+                box_dim=4, while a rotated box has box_dim=5.
+            conv_dims (list[int]): a list of integers representing the output channels
+                of N conv layers. Set it to -1 to use the same number of output channels
+                as input channels.
+        """
+        super().__init__()
+        cur_channels = in_channels
+        # Keeping the old variable names and structure for backwards compatiblity.
+        # Otherwise the old checkpoints will fail to load.
+        if len(conv_dims) == 1:
+            out_channels = cur_channels if conv_dims[0] == -1 else conv_dims[0]
+            # 3x3 conv for the hidden representation
+            self.conv = self._get_rpn_conv(cur_channels, out_channels)
+            cur_channels = out_channels
+        else:
+            self.conv = nn.Sequential()
+            for k, conv_dim in enumerate(conv_dims):
+                out_channels = cur_channels if conv_dim == -1 else conv_dim
+                if out_channels <= 0:
+                    raise ValueError(
+                        f"Conv output channels should be greater than 0. Got {out_channels}"
+                    )
+                conv = self._get_rpn_conv(cur_channels, out_channels)
+                self.conv.add_module(f"conv{k}", conv)
+                cur_channels = out_channels
+        # 1x1 conv for predicting objectness logits
+        self.objectness_logits = nn.Conv2d(cur_channels, num_anchors, kernel_size=1, stride=1)
+        # 1x1 conv for predicting box2box transform deltas
+        self.anchor_deltas = nn.Conv2d(cur_channels, num_anchors * box_dim, kernel_size=1, stride=1)
+
+        # Keeping the order of weights initialization same for backwards compatiblility.
+        for layer in self.modules():
+            if isinstance(layer, nn.Conv2d):
+                nn.init.normal_(layer.weight, std=0.01)
+                nn.init.constant_(layer.bias, 0)
+
+    def _get_rpn_conv(self, in_channels, out_channels):
+        return Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            activation=nn.ReLU(),
+        )
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        # Standard RPN is shared across levels:
+        in_channels = [s.channels for s in input_shape]
+        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
+        in_channels = in_channels[0]
+
+        # RPNHead should take the same input as anchor generator
+        # NOTE: it assumes that creating an anchor generator does not have unwanted side effect.
+        anchor_generator = build_anchor_generator(cfg, input_shape)
+        num_anchors = anchor_generator.num_anchors
+        box_dim = anchor_generator.box_dim
+        assert (
+            len(set(num_anchors)) == 1
+        ), "Each level must have the same number of anchors per spatial position"
+        return {
+            "in_channels": in_channels,
+            "num_anchors": num_anchors[0],
+            "box_dim": box_dim,
+            "conv_dims": cfg.MODEL.RPN.CONV_DIMS,
+        }
+
+    def forward(self, features: List[torch.Tensor]):
+        """
+        Args:
+            features (list[Tensor]): list of feature maps
+
+        Returns:
+            list[Tensor]: A list of L elements.
+                Element i is a tensor of shape (N, A, Hi, Wi) representing
+                the predicted objectness logits for all anchors. A is the number of cell anchors.
+            list[Tensor]: A list of L elements. Element i is a tensor of shape
+                (N, A*box_dim, Hi, Wi) representing the predicted "deltas" used to transform anchors
+                to proposals.
+        """
+        pred_objectness_logits = []
+        pred_anchor_deltas = []
+        for x in features:
+            t = self.conv(x)
+            pred_objectness_logits.append(self.objectness_logits(t))
+            pred_anchor_deltas.append(self.anchor_deltas(t))
+        return pred_objectness_logits, pred_anchor_deltas
+
+
+@PROPOSAL_GENERATOR_REGISTRY.register()
+class RPN(nn.Module):
+    """
+    Region Proposal Network, introduced by :paper:`Faster R-CNN`.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        in_features: List[str],
+        head: nn.Module,
+        anchor_generator: nn.Module,
+        anchor_matcher: Matcher,
+        box2box_transform: Box2BoxTransform,
+        batch_size_per_image: int,
+        positive_fraction: float,
+        pre_nms_topk: Tuple[float, float],
+        post_nms_topk: Tuple[float, float],
+        nms_thresh: float = 0.7,
+        min_box_size: float = 0.0,
+        anchor_boundary_thresh: float = -1.0,
+        loss_weight: Union[float, Dict[str, float]] = 1.0,
+        box_reg_loss_type: str = "smooth_l1",
+        smooth_l1_beta: float = 0.0,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            in_features (list[str]): list of names of input features to use
+            head (nn.Module): a module that predicts logits and regression deltas
+                for each level from a list of per-level features
+            anchor_generator (nn.Module): a module that creates anchors from a
+                list of features. Usually an instance of :class:`AnchorGenerator`
+            anchor_matcher (Matcher): label the anchors by matching them with ground truth.
+            box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to
+                instance boxes
+            batch_size_per_image (int): number of anchors per image to sample for training
+            positive_fraction (float): fraction of foreground anchors to sample for training
+            pre_nms_topk (tuple[float]): (train, test) that represents the
+                number of top k proposals to select before NMS, in
+                training and testing.
+            post_nms_topk (tuple[float]): (train, test) that represents the
+                number of top k proposals to select after NMS, in
+                training and testing.
+            nms_thresh (float): NMS threshold used to de-duplicate the predicted proposals
+            min_box_size (float): remove proposal boxes with any side smaller than this threshold,
+                in the unit of input image pixels
+            anchor_boundary_thresh (float): legacy option
+            loss_weight (float|dict): weights to use for losses. Can be single float for weighting
+                all rpn losses together, or a dict of individual weightings. Valid dict keys are:
+                    "loss_rpn_cls" - applied to classification loss
+                    "loss_rpn_loc" - applied to box regression loss
+            box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou".
+            smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
+                use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
+        """
+        super().__init__()
+        self.in_features = in_features
+        self.rpn_head = head
+        self.anchor_generator = anchor_generator
+        self.anchor_matcher = anchor_matcher
+        self.box2box_transform = box2box_transform
+        self.batch_size_per_image = batch_size_per_image
+        self.positive_fraction = positive_fraction
+        # Map from self.training state to train/test settings
+        self.pre_nms_topk = {True: pre_nms_topk[0], False: pre_nms_topk[1]}
+        self.post_nms_topk = {True: post_nms_topk[0], False: post_nms_topk[1]}
+        self.nms_thresh = nms_thresh
+        self.min_box_size = float(min_box_size)
+        self.anchor_boundary_thresh = anchor_boundary_thresh
+        if isinstance(loss_weight, float):
+            loss_weight = {"loss_rpn_cls": loss_weight, "loss_rpn_loc": loss_weight}
+        self.loss_weight = loss_weight
+        self.box_reg_loss_type = box_reg_loss_type
+        self.smooth_l1_beta = smooth_l1_beta
+
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        in_features = cfg.MODEL.RPN.IN_FEATURES
+        ret = {
+            "in_features": in_features,
+            "min_box_size": cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE,
+            "nms_thresh": cfg.MODEL.RPN.NMS_THRESH,
+            "batch_size_per_image": cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE,
+            "positive_fraction": cfg.MODEL.RPN.POSITIVE_FRACTION,
+            "loss_weight": {
+                "loss_rpn_cls": cfg.MODEL.RPN.LOSS_WEIGHT,
+                "loss_rpn_loc": cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT,
+            },
+            "anchor_boundary_thresh": cfg.MODEL.RPN.BOUNDARY_THRESH,
+            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS),
+            "box_reg_loss_type": cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE,
+            "smooth_l1_beta": cfg.MODEL.RPN.SMOOTH_L1_BETA,
+        }
+
+        ret["pre_nms_topk"] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN, cfg.MODEL.RPN.PRE_NMS_TOPK_TEST)
+        ret["post_nms_topk"] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN, cfg.MODEL.RPN.POST_NMS_TOPK_TEST)
+
+        ret["anchor_generator"] = build_anchor_generator(cfg, [input_shape[f] for f in in_features])
+        ret["anchor_matcher"] = Matcher(
+            cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, allow_low_quality_matches=True
+        )
+        ret["head"] = build_rpn_head(cfg, [input_shape[f] for f in in_features])
+        return ret
+
+    def _subsample_labels(self, label):
+        """
+        Randomly sample a subset of positive and negative examples, and overwrite
+        the label vector to the ignore value (-1) for all elements that are not
+        included in the sample.
+
+        Args:
+            labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
+        """
+        pos_idx, neg_idx = subsample_labels(
+            label, self.batch_size_per_image, self.positive_fraction, 0
+        )
+        # Fill with the ignore label (-1), then set positive and negative labels
+        label.fill_(-1)
+        label.scatter_(0, pos_idx, 1)
+        label.scatter_(0, neg_idx, 0)
+        return label
+
+    @torch.jit.unused
+    @torch.no_grad()
+    def label_and_sample_anchors(
+        self, anchors: List[Boxes], gt_instances: List[Instances]
+    ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+        """
+        Args:
+            anchors (list[Boxes]): anchors for each feature map.
+            gt_instances: the ground-truth instances for each image.
+
+        Returns:
+            list[Tensor]:
+                List of #img tensors. i-th element is a vector of labels whose length is
+                the total number of anchors across all feature maps R = sum(Hi * Wi * A).
+                Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative
+                class; 1 = positive class.
+            list[Tensor]:
+                i-th element is a Rx4 tensor. The values are the matched gt boxes for each
+                anchor. Values are undefined for those anchors not labeled as 1.
+        """
+        anchors = Boxes.cat(anchors)
+
+        gt_boxes = [x.gt_boxes for x in gt_instances]
+        image_sizes = [x.image_size for x in gt_instances]
+        del gt_instances
+
+        gt_labels = []
+        matched_gt_boxes = []
+        for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes):
+            """
+            image_size_i: (h, w) for the i-th image
+            gt_boxes_i: ground-truth boxes for i-th image
+            """
+
+            match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors)
+            matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
+            # Matching is memory-expensive and may result in CPU tensors. But the result is small
+            gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
+            del match_quality_matrix
+
+            if self.anchor_boundary_thresh >= 0:
+                # Discard anchors that go out of the boundaries of the image
+                # NOTE: This is legacy functionality that is turned off by default in Detectron2
+                anchors_inside_image = anchors.inside_box(image_size_i, self.anchor_boundary_thresh)
+                gt_labels_i[~anchors_inside_image] = -1
+
+            # A vector of labels (-1, 0, 1) for each anchor
+            gt_labels_i = self._subsample_labels(gt_labels_i)
+
+            if len(gt_boxes_i) == 0:
+                # These values won't be used anyway since the anchor is labeled as background
+                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
+            else:
+                # TODO wasted indexing computation for ignored boxes
+                matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor
+
+            gt_labels.append(gt_labels_i)  # N,AHW
+            matched_gt_boxes.append(matched_gt_boxes_i)
+        return gt_labels, matched_gt_boxes
+
+    @torch.jit.unused
+    def losses(
+        self,
+        anchors: List[Boxes],
+        pred_objectness_logits: List[torch.Tensor],
+        gt_labels: List[torch.Tensor],
+        pred_anchor_deltas: List[torch.Tensor],
+        gt_boxes: List[torch.Tensor],
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Return the losses from a set of RPN predictions and their associated ground-truth.
+
+        Args:
+            anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
+                has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
+            pred_objectness_logits (list[Tensor]): A list of L elements.
+                Element i is a tensor of shape (N, Hi*Wi*A) representing
+                the predicted objectness logits for all anchors.
+            gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
+            pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
+                (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors
+                to proposals.
+            gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
+
+        Returns:
+            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
+                Loss names are: `loss_rpn_cls` for objectness classification and
+                `loss_rpn_loc` for proposal localization.
+        """
+        num_images = len(gt_labels)
+        gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))
+
+        # Log the number of positive/negative anchors per-image that's used in training
+        pos_mask = gt_labels == 1
+        num_pos_anchors = pos_mask.sum().item()
+        num_neg_anchors = (gt_labels == 0).sum().item()
+        storage = get_event_storage()
+        storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
+        storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)
+
+        localization_loss = _dense_box_regression_loss(
+            anchors,
+            self.box2box_transform,
+            pred_anchor_deltas,
+            gt_boxes,
+            pos_mask,
+            box_reg_loss_type=self.box_reg_loss_type,
+            smooth_l1_beta=self.smooth_l1_beta,
+        )
+
+        valid_mask = gt_labels >= 0
+        objectness_loss = F.binary_cross_entropy_with_logits(
+            cat(pred_objectness_logits, dim=1)[valid_mask],
+            gt_labels[valid_mask].to(torch.float32),
+            reduction="sum",
+        )
+        normalizer = self.batch_size_per_image * num_images
+        losses = {
+            "loss_rpn_cls": objectness_loss / normalizer,
+            # The original Faster R-CNN paper uses a slightly different normalizer
+            # for loc loss. But it doesn't matter in practice
+            "loss_rpn_loc": localization_loss / normalizer,
+        }
+        losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
+        return losses
+
+    def forward(
+        self,
+        images: ImageList,
+        features: Dict[str, torch.Tensor],
+        gt_instances: Optional[List[Instances]] = None,
+    ):
+        """
+        Args:
+            images (ImageList): input images of length `N`
+            features (dict[str, Tensor]): input data as a mapping from feature
+                map name to tensor. Axis 0 represents the number of images `N` in
+                the input data; axes 1-3 are channels, height, and width, which may
+                vary between feature maps (e.g., if a feature pyramid is used).
+            gt_instances (list[Instances], optional): a length `N` list of `Instances`s.
+                Each `Instances` stores ground-truth instances for the corresponding image.
+
+        Returns:
+            proposals: list[Instances]: contains fields "proposal_boxes", "objectness_logits"
+            loss: dict[Tensor] or None
+        """
+        features = [features[f] for f in self.in_features]
+        anchors = self.anchor_generator(features)
+
+        pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features)
+        # Transpose the Hi*Wi*A dimension to the middle:
+        pred_objectness_logits = [
+            # (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N, Hi*Wi*A)
+            score.permute(0, 2, 3, 1).flatten(1)
+            for score in pred_objectness_logits
+        ]
+        pred_anchor_deltas = [
+            # (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B) -> (N, Hi*Wi*A, B)
+            x.view(x.shape[0], -1, self.anchor_generator.box_dim, x.shape[-2], x.shape[-1])
+            .permute(0, 3, 4, 1, 2)
+            .flatten(1, -2)
+            for x in pred_anchor_deltas
+        ]
+
+        if self.training:
+            assert gt_instances is not None, "RPN requires gt_instances in training!"
+            gt_labels, gt_boxes = self.label_and_sample_anchors(anchors, gt_instances)
+            losses = self.losses(
+                anchors, pred_objectness_logits, gt_labels, pred_anchor_deltas, gt_boxes
+            )
+        else:
+            losses = {}
+        proposals = self.predict_proposals(
+            anchors, pred_objectness_logits, pred_anchor_deltas, images.image_sizes
+        )
+        return proposals, losses
+
+    def predict_proposals(
+        self,
+        anchors: List[Boxes],
+        pred_objectness_logits: List[torch.Tensor],
+        pred_anchor_deltas: List[torch.Tensor],
+        image_sizes: List[Tuple[int, int]],
+    ):
+        """
+        Decode all the predicted box regression deltas to proposals. Find the top proposals
+        by applying NMS and removing boxes that are too small.
+
+        Returns:
+            proposals (list[Instances]): list of N Instances. The i-th Instances
+                stores post_nms_topk object proposals for image i, sorted by their
+                objectness score in descending order.
+        """
+        # The proposals are treated as fixed for joint training with roi heads.
+        # This approach ignores the derivative w.r.t. the proposal boxes’ coordinates that
+        # are also network responses.
+        with torch.no_grad():
+            pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
+            return find_top_rpn_proposals(
+                pred_proposals,
+                pred_objectness_logits,
+                image_sizes,
+                self.nms_thresh,
+                self.pre_nms_topk[self.training],
+                self.post_nms_topk[self.training],
+                self.min_box_size,
+                self.training,
+            )
+
+    def _decode_proposals(self, anchors: List[Boxes], pred_anchor_deltas: List[torch.Tensor]):
+        """
+        Transform anchors into proposals by applying the predicted anchor deltas.
+
+        Returns:
+            proposals (list[Tensor]): A list of L tensors. Tensor i has shape
+                (N, Hi*Wi*A, B)
+        """
+        N = pred_anchor_deltas[0].shape[0]
+        proposals = []
+        # For each feature map
+        for anchors_i, pred_anchor_deltas_i in zip(anchors, pred_anchor_deltas):
+            B = anchors_i.tensor.size(1)
+            pred_anchor_deltas_i = pred_anchor_deltas_i.reshape(-1, B)
+            # Expand anchors to shape (N*Hi*Wi*A, B)
+            anchors_i = anchors_i.tensor.unsqueeze(0).expand(N, -1, -1).reshape(-1, B)
+            proposals_i = self.box2box_transform.apply_deltas(pred_anchor_deltas_i, anchors_i)
+            # Append feature map proposals with shape (N, Hi*Wi*A, B)
+            proposals.append(proposals_i.view(N, -1, B))
+        return proposals
diff --git a/detectron2/modeling/proposal_generator/rrpn.py b/detectron2/modeling/proposal_generator/rrpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a3cd282c2d1ede5c60a7c2c84846cbeed7808f0
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/rrpn.py
@@ -0,0 +1,209 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+from typing import Dict, List
+import torch
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec, batched_nms_rotated, cat
+from detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated
+from detectron2.utils.memory import retry_if_cuda_oom
+
+from ..box_regression import Box2BoxTransformRotated
+from .build import PROPOSAL_GENERATOR_REGISTRY
+from .proposal_utils import _is_tracing
+from .rpn import RPN
+
+logger = logging.getLogger(__name__)
+
+
+def find_top_rrpn_proposals(
+    proposals,
+    pred_objectness_logits,
+    image_sizes,
+    nms_thresh,
+    pre_nms_topk,
+    post_nms_topk,
+    min_box_size,
+    training,
+):
+    """
+    For each feature map, select the `pre_nms_topk` highest scoring proposals,
+    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
+    highest scoring proposals among all the feature maps if `training` is True,
+    otherwise, returns the highest `post_nms_topk` scoring proposals for each
+    feature map.
+
+    Args:
+        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5).
+            All proposal predictions on the feature maps.
+        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
+        image_sizes (list[tuple]): sizes (h, w) for each image
+        nms_thresh (float): IoU threshold to use for NMS
+        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
+            When RRPN is run on multiple feature maps (as in FPN) this number is per
+            feature map.
+        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
+            When RRPN is run on multiple feature maps (as in FPN) this number is total,
+            over all feature maps.
+        min_box_size(float): minimum proposal box side length in pixels (absolute units wrt
+            input images).
+        training (bool): True if proposals are to be used in training, otherwise False.
+            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
+            comment.
+
+    Returns:
+        proposals (list[Instances]): list of N Instances. The i-th Instances
+            stores post_nms_topk object proposals for image i.
+    """
+    num_images = len(image_sizes)
+    device = proposals[0].device
+
+    # 1. Select top-k anchor for every level and every image
+    topk_scores = []  # #lvl Tensor, each of shape N x topk
+    topk_proposals = []
+    level_ids = []  # #lvl Tensor, each of shape (topk,)
+    batch_idx = torch.arange(num_images, device=device)
+    for level_id, proposals_i, logits_i in zip(
+        itertools.count(), proposals, pred_objectness_logits
+    ):
+        Hi_Wi_A = logits_i.shape[1]
+        if isinstance(Hi_Wi_A, torch.Tensor):  # it's a tensor in tracing
+            num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk)
+        else:
+            num_proposals_i = min(Hi_Wi_A, pre_nms_topk)
+
+        topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
+
+        # each is N x topk
+        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 5
+
+        topk_proposals.append(topk_proposals_i)
+        topk_scores.append(topk_scores_i)
+        level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))
+
+    # 2. Concat all levels together
+    topk_scores = cat(topk_scores, dim=1)
+    topk_proposals = cat(topk_proposals, dim=1)
+    level_ids = cat(level_ids, dim=0)
+
+    # 3. For each image, run a per-level NMS, and choose topk results.
+    results = []
+    for n, image_size in enumerate(image_sizes):
+        boxes = RotatedBoxes(topk_proposals[n])
+        scores_per_img = topk_scores[n]
+        lvl = level_ids
+
+        valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
+        if not valid_mask.all():
+            if training:
+                raise FloatingPointError(
+                    "Predicted boxes or scores contain Inf/NaN. Training has diverged."
+                )
+            boxes = boxes[valid_mask]
+            scores_per_img = scores_per_img[valid_mask]
+            lvl = lvl[valid_mask]
+        boxes.clip(image_size)
+
+        # filter empty boxes
+        keep = boxes.nonempty(threshold=min_box_size)
+        if _is_tracing() or keep.sum().item() != len(boxes):
+            boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep], lvl[keep])
+
+        keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh)
+        # In Detectron1, there was different behavior during training vs. testing.
+        # (https://github.com/facebookresearch/Detectron/issues/459)
+        # During training, topk is over the proposals from *all* images in the training batch.
+        # During testing, it is over the proposals for each image separately.
+        # As a result, the training behavior becomes batch-dependent,
+        # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size.
+        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
+        keep = keep[:post_nms_topk]
+
+        res = Instances(image_size)
+        res.proposal_boxes = boxes[keep]
+        res.objectness_logits = scores_per_img[keep]
+        results.append(res)
+    return results
+
+
+@PROPOSAL_GENERATOR_REGISTRY.register()
+class RRPN(RPN):
+    """
+    Rotated Region Proposal Network described in :paper:`RRPN`.
+    """
+
+    @configurable
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.anchor_boundary_thresh >= 0:
+            raise NotImplementedError(
+                "anchor_boundary_thresh is a legacy option not implemented for RRPN."
+            )
+
+    @classmethod
+    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
+        ret = super().from_config(cfg, input_shape)
+        ret["box2box_transform"] = Box2BoxTransformRotated(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
+        return ret
+
+    @torch.no_grad()
+    def label_and_sample_anchors(self, anchors: List[RotatedBoxes], gt_instances: List[Instances]):
+        """
+        Args:
+            anchors (list[RotatedBoxes]): anchors for each feature map.
+            gt_instances: the ground-truth instances for each image.
+
+        Returns:
+            list[Tensor]:
+                List of #img tensors. i-th element is a vector of labels whose length is
+                the total number of anchors across feature maps. Label values are in {-1, 0, 1},
+                with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
+            list[Tensor]:
+                i-th element is a Nx5 tensor, where N is the total number of anchors across
+                feature maps.  The values are the matched gt boxes for each anchor.
+                Values are undefined for those anchors not labeled as 1.
+        """
+        anchors = RotatedBoxes.cat(anchors)
+
+        gt_boxes = [x.gt_boxes for x in gt_instances]
+        del gt_instances
+
+        gt_labels = []
+        matched_gt_boxes = []
+        for gt_boxes_i in gt_boxes:
+            """
+            gt_boxes_i: ground-truth boxes for i-th image
+            """
+            match_quality_matrix = retry_if_cuda_oom(pairwise_iou_rotated)(gt_boxes_i, anchors)
+            matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
+            # Matching is memory-expensive and may result in CPU tensors. But the result is small
+            gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
+
+            # A vector of labels (-1, 0, 1) for each anchor
+            gt_labels_i = self._subsample_labels(gt_labels_i)
+
+            if len(gt_boxes_i) == 0:
+                # These values won't be used anyway since the anchor is labeled as background
+                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
+            else:
+                # TODO wasted indexing computation for ignored boxes
+                matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor
+
+            gt_labels.append(gt_labels_i)  # N,AHW
+            matched_gt_boxes.append(matched_gt_boxes_i)
+        return gt_labels, matched_gt_boxes
+
+    @torch.no_grad()
+    def predict_proposals(self, anchors, pred_objectness_logits, pred_anchor_deltas, image_sizes):
+        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
+        return find_top_rrpn_proposals(
+            pred_proposals,
+            pred_objectness_logits,
+            image_sizes,
+            self.nms_thresh,
+            self.pre_nms_topk[self.training],
+            self.post_nms_topk[self.training],
+            self.min_box_size,
+            self.training,
+        )
diff --git a/detectron2/modeling/roi_heads/__init__.py b/detectron2/modeling/roi_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d13e9c57235b982f3e0645bc316de2b75755dfda
--- /dev/null
+++ b/detectron2/modeling/roi_heads/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .box_head import ROI_BOX_HEAD_REGISTRY, build_box_head, FastRCNNConvFCHead
+from .keypoint_head import (
+    ROI_KEYPOINT_HEAD_REGISTRY,
+    build_keypoint_head,
+    BaseKeypointRCNNHead,
+    KRCNNConvDeconvUpsampleHead,
+)
+from .mask_head import (
+    ROI_MASK_HEAD_REGISTRY,
+    build_mask_head,
+    BaseMaskRCNNHead,
+    MaskRCNNConvUpsampleHead,
+)
+from .roi_heads import (
+    ROI_HEADS_REGISTRY,
+    ROIHeads,
+    Res5ROIHeads,
+    StandardROIHeads,
+    build_roi_heads,
+    select_foreground_proposals,
+)
+from .cascade_rcnn import CascadeROIHeads
+from .rotated_fast_rcnn import RROIHeads
+from .fast_rcnn import FastRCNNOutputLayers
+
+from . import cascade_rcnn  # isort:skip
+
+__all__ = list(globals().keys())
diff --git a/detectron2/modeling/roi_heads/__pycache__/__init__.cpython-310.pyc b/detectron2/modeling/roi_heads/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b32196760592bc7da58cf78fb6226b2565d4363a
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/__init__.cpython-39.pyc b/detectron2/modeling/roi_heads/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a5fca8c801fa6c74eec2b38841699554f20ef98
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/box_head.cpython-310.pyc b/detectron2/modeling/roi_heads/__pycache__/box_head.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4733a8ed78c3269082fa541aa806e4bd58e3725b
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/box_head.cpython-310.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/box_head.cpython-39.pyc b/detectron2/modeling/roi_heads/__pycache__/box_head.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0198c151e1cbe96b8b0f147e0b4eb835496384a0
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/box_head.cpython-39.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/cascade_rcnn.cpython-310.pyc b/detectron2/modeling/roi_heads/__pycache__/cascade_rcnn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d833aecb4278e11b7f2249facb10e0ed4704598
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/cascade_rcnn.cpython-310.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/cascade_rcnn.cpython-39.pyc b/detectron2/modeling/roi_heads/__pycache__/cascade_rcnn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86e34ab99faec03e62e6f5f7b9ad93ba94e249db
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/cascade_rcnn.cpython-39.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/fast_rcnn.cpython-310.pyc b/detectron2/modeling/roi_heads/__pycache__/fast_rcnn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44cd688463b433592899fae705957322d0e51997
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/fast_rcnn.cpython-310.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/fast_rcnn.cpython-39.pyc b/detectron2/modeling/roi_heads/__pycache__/fast_rcnn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2b3243ed2ad54c00ae95b704dc1df95b33ef8c5
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/fast_rcnn.cpython-39.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/keypoint_head.cpython-310.pyc b/detectron2/modeling/roi_heads/__pycache__/keypoint_head.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19c1a047a267e8cb564f2c15401201708dd3eb80
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/keypoint_head.cpython-310.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/keypoint_head.cpython-39.pyc b/detectron2/modeling/roi_heads/__pycache__/keypoint_head.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..874b705991d3660b5a20d90f54f35815164013cc
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/keypoint_head.cpython-39.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/mask_head.cpython-310.pyc b/detectron2/modeling/roi_heads/__pycache__/mask_head.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57f6eadf57ff1bdf02f83590d48e028c788ca852
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/mask_head.cpython-310.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/mask_head.cpython-39.pyc b/detectron2/modeling/roi_heads/__pycache__/mask_head.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..18f8e04a0735cb538ba25000499bcbbeb0e5d45e
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/mask_head.cpython-39.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/roi_heads.cpython-310.pyc b/detectron2/modeling/roi_heads/__pycache__/roi_heads.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..829e271f7c25e9043a8e591e5959d9358ce3ef5a
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/roi_heads.cpython-310.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/roi_heads.cpython-39.pyc b/detectron2/modeling/roi_heads/__pycache__/roi_heads.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aea3d0e51cb7afc49dba3f0b1eaada8869cc8b96
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/roi_heads.cpython-39.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/rotated_fast_rcnn.cpython-310.pyc b/detectron2/modeling/roi_heads/__pycache__/rotated_fast_rcnn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2dd6d5be4c3ca8437d3d68cfaf2a693c3eab9756
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/rotated_fast_rcnn.cpython-310.pyc differ
diff --git a/detectron2/modeling/roi_heads/__pycache__/rotated_fast_rcnn.cpython-39.pyc b/detectron2/modeling/roi_heads/__pycache__/rotated_fast_rcnn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d2fdaf4f5fd8c44023814d39e0e294e0ad6cc97
Binary files /dev/null and b/detectron2/modeling/roi_heads/__pycache__/rotated_fast_rcnn.cpython-39.pyc differ
diff --git a/detectron2/modeling/roi_heads/box_head.py b/detectron2/modeling/roi_heads/box_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d0370b0400d9268f13c905e4096a84ce42e9bfd
--- /dev/null
+++ b/detectron2/modeling/roi_heads/box_head.py
@@ -0,0 +1,118 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from typing import List
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.utils.registry import Registry
+
+__all__ = ["FastRCNNConvFCHead", "build_box_head", "ROI_BOX_HEAD_REGISTRY"]
+
+ROI_BOX_HEAD_REGISTRY = Registry("ROI_BOX_HEAD")
+ROI_BOX_HEAD_REGISTRY.__doc__ = """
+Registry for box heads, which make box predictions from per-region features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+# To get torchscript support, we make the head a subclass of `nn.Sequential`.
+# Therefore, to add new layers in this head class, please make sure they are
+# added in the order they will be used in forward().
+@ROI_BOX_HEAD_REGISTRY.register()
+class FastRCNNConvFCHead(nn.Sequential):
+    """
+    A head with several 3x3 conv layers (each followed by norm & relu) and then
+    several fc layers (each followed by relu).
+    """
+
+    @configurable
+    def __init__(
+        self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm=""
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape (ShapeSpec): shape of the input feature.
+            conv_dims (list[int]): the output dimensions of the conv layers
+            fc_dims (list[int]): the output dimensions of the fc layers
+            conv_norm (str or callable): normalization for the conv layers.
+                See :func:`detectron2.layers.get_norm` for supported types.
+        """
+        super().__init__()
+        assert len(conv_dims) + len(fc_dims) > 0
+
+        self._output_size = (input_shape.channels, input_shape.height, input_shape.width)
+
+        self.conv_norm_relus = []
+        for k, conv_dim in enumerate(conv_dims):
+            conv = Conv2d(
+                self._output_size[0],
+                conv_dim,
+                kernel_size=3,
+                padding=1,
+                bias=not conv_norm,
+                norm=get_norm(conv_norm, conv_dim),
+                activation=nn.ReLU(),
+            )
+            self.add_module("conv{}".format(k + 1), conv)
+            self.conv_norm_relus.append(conv)
+            self._output_size = (conv_dim, self._output_size[1], self._output_size[2])
+
+        self.fcs = []
+        for k, fc_dim in enumerate(fc_dims):
+            if k == 0:
+                self.add_module("flatten", nn.Flatten())
+            fc = nn.Linear(int(np.prod(self._output_size)), fc_dim)
+            self.add_module("fc{}".format(k + 1), fc)
+            self.add_module("fc_relu{}".format(k + 1), nn.ReLU())
+            self.fcs.append(fc)
+            self._output_size = fc_dim
+
+        for layer in self.conv_norm_relus:
+            weight_init.c2_msra_fill(layer)
+        for layer in self.fcs:
+            weight_init.c2_xavier_fill(layer)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV
+        conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM
+        num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC
+        fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM
+        return {
+            "input_shape": input_shape,
+            "conv_dims": [conv_dim] * num_conv,
+            "fc_dims": [fc_dim] * num_fc,
+            "conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM,
+        }
+
+    def forward(self, x):
+        for layer in self:
+            x = layer(x)
+        return x
+
+    @property
+    @torch.jit.unused
+    def output_shape(self):
+        """
+        Returns:
+            ShapeSpec: the output feature shape
+        """
+        o = self._output_size
+        if isinstance(o, int):
+            return ShapeSpec(channels=o)
+        else:
+            return ShapeSpec(channels=o[0], height=o[1], width=o[2])
+
+
+def build_box_head(cfg, input_shape):
+    """
+    Build a box head defined by `cfg.MODEL.ROI_BOX_HEAD.NAME`.
+    """
+    name = cfg.MODEL.ROI_BOX_HEAD.NAME
+    return ROI_BOX_HEAD_REGISTRY.get(name)(cfg, input_shape)
diff --git a/detectron2/modeling/roi_heads/cascade_rcnn.py b/detectron2/modeling/roi_heads/cascade_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0ca70fe23a1d406ee9bed6204a987d7e0708b91
--- /dev/null
+++ b/detectron2/modeling/roi_heads/cascade_rcnn.py
@@ -0,0 +1,299 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import List
+import torch
+from torch import nn
+from torch.autograd.function import Function
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+
+from ..box_regression import Box2BoxTransform
+from ..matcher import Matcher
+from ..poolers import ROIPooler
+from .box_head import build_box_head
+from .fast_rcnn import FastRCNNOutputLayers, fast_rcnn_inference
+from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
+
+
+class _ScaleGradient(Function):
+    @staticmethod
+    def forward(ctx, input, scale):
+        ctx.scale = scale
+        return input
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output * ctx.scale, None
+
+
+@ROI_HEADS_REGISTRY.register()
+class CascadeROIHeads(StandardROIHeads):
+    """
+    The ROI heads that implement :paper:`Cascade R-CNN`.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        box_in_features: List[str],
+        box_pooler: ROIPooler,
+        box_heads: List[nn.Module],
+        box_predictors: List[nn.Module],
+        proposal_matchers: List[Matcher],
+        **kwargs,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            box_pooler (ROIPooler): pooler that extracts region features from given boxes
+            box_heads (list[nn.Module]): box head for each cascade stage
+            box_predictors (list[nn.Module]): box predictor for each cascade stage
+            proposal_matchers (list[Matcher]): matcher with different IoU thresholds to
+                match boxes with ground truth for each stage. The first matcher matches
+                RPN proposals with ground truth, the other matchers use boxes predicted
+                by the previous stage as proposals and match them with ground truth.
+        """
+        assert "proposal_matcher" not in kwargs, (
+            "CascadeROIHeads takes 'proposal_matchers=' for each stage instead "
+            "of one 'proposal_matcher='."
+        )
+        # The first matcher matches RPN proposals with ground truth, done in the base class
+        kwargs["proposal_matcher"] = proposal_matchers[0]
+        num_stages = self.num_cascade_stages = len(box_heads)
+        box_heads = nn.ModuleList(box_heads)
+        box_predictors = nn.ModuleList(box_predictors)
+        assert len(box_predictors) == num_stages, f"{len(box_predictors)} != {num_stages}!"
+        assert len(proposal_matchers) == num_stages, f"{len(proposal_matchers)} != {num_stages}!"
+        super().__init__(
+            box_in_features=box_in_features,
+            box_pooler=box_pooler,
+            box_head=box_heads,
+            box_predictor=box_predictors,
+            **kwargs,
+        )
+        self.proposal_matchers = proposal_matchers
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg, input_shape)
+        ret.pop("proposal_matcher")
+        return ret
+
+    @classmethod
+    def _init_box_head(cls, cfg, input_shape):
+        # fmt: off
+        in_features              = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution        = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales            = tuple(1.0 / input_shape[k].stride for k in in_features)
+        sampling_ratio           = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type              = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
+        cascade_ious             = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS
+        assert len(cascade_bbox_reg_weights) == len(cascade_ious)
+        assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,  \
+            "CascadeROIHeads only support class-agnostic regression now!"
+        assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0]
+        # fmt: on
+
+        in_channels = [input_shape[f].channels for f in in_features]
+        # Check all channel counts are equal
+        assert len(set(in_channels)) == 1, in_channels
+        in_channels = in_channels[0]
+
+        box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        pooled_shape = ShapeSpec(
+            channels=in_channels, width=pooler_resolution, height=pooler_resolution
+        )
+
+        box_heads, box_predictors, proposal_matchers = [], [], []
+        for match_iou, bbox_reg_weights in zip(cascade_ious, cascade_bbox_reg_weights):
+            box_head = build_box_head(cfg, pooled_shape)
+            box_heads.append(box_head)
+            box_predictors.append(
+                FastRCNNOutputLayers(
+                    cfg,
+                    box_head.output_shape,
+                    box2box_transform=Box2BoxTransform(weights=bbox_reg_weights),
+                )
+            )
+            proposal_matchers.append(Matcher([match_iou], [0, 1], allow_low_quality_matches=False))
+        return {
+            "box_in_features": in_features,
+            "box_pooler": box_pooler,
+            "box_heads": box_heads,
+            "box_predictors": box_predictors,
+            "proposal_matchers": proposal_matchers,
+        }
+
+    def forward(self, images, features, proposals, targets=None):
+        del images
+        if self.training:
+            proposals = self.label_and_sample_proposals(proposals, targets)
+
+        if self.training:
+            # Need targets to box head
+            losses = self._forward_box(features, proposals, targets)
+            losses.update(self._forward_mask(features, proposals))
+            losses.update(self._forward_keypoint(features, proposals))
+            return proposals, losses
+        else:
+            pred_instances = self._forward_box(features, proposals)
+            pred_instances = self.forward_with_given_boxes(features, pred_instances)
+            return pred_instances, {}
+
+    def _forward_box(self, features, proposals, targets=None):
+        """
+        Args:
+            features, targets: the same as in
+                Same as in :meth:`ROIHeads.forward`.
+            proposals (list[Instances]): the per-image object proposals with
+                their matching ground truth.
+                Each has fields "proposal_boxes", and "objectness_logits",
+                "gt_classes", "gt_boxes".
+        """
+        features = [features[f] for f in self.box_in_features]
+        head_outputs = []  # (predictor, predictions, proposals)
+        prev_pred_boxes = None
+        image_sizes = [x.image_size for x in proposals]
+        for k in range(self.num_cascade_stages):
+            if k > 0:
+                # The output boxes of the previous stage are used to create the input
+                # proposals of the next stage.
+                proposals = self._create_proposals_from_boxes(prev_pred_boxes, image_sizes)
+                if self.training:
+                    proposals = self._match_and_label_boxes(proposals, k, targets)
+            predictions = self._run_stage(features, proposals, k)
+            prev_pred_boxes = self.box_predictor[k].predict_boxes(predictions, proposals)
+            head_outputs.append((self.box_predictor[k], predictions, proposals))
+
+        if self.training:
+            losses = {}
+            storage = get_event_storage()
+            for stage, (predictor, predictions, proposals) in enumerate(head_outputs):
+                with storage.name_scope("stage{}".format(stage)):
+                    stage_losses = predictor.losses(predictions, proposals)
+                losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()})
+            return losses
+        else:
+            # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
+            scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs]
+
+            # Average the scores across heads
+            scores = [
+                sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
+                for scores_per_image in zip(*scores_per_stage)
+            ]
+            # Use the boxes of the last head
+            predictor, predictions, proposals = head_outputs[-1]
+            boxes = predictor.predict_boxes(predictions, proposals)
+            pred_instances, _ = fast_rcnn_inference(
+                boxes,
+                scores,
+                image_sizes,
+                predictor.test_score_thresh,
+                predictor.test_nms_thresh,
+                predictor.test_topk_per_image,
+            )
+            return pred_instances
+
+    @torch.no_grad()
+    def _match_and_label_boxes(self, proposals, stage, targets):
+        """
+        Match proposals with groundtruth using the matcher at the given stage.
+        Label the proposals as foreground or background based on the match.
+
+        Args:
+            proposals (list[Instances]): One Instances for each image, with
+                the field "proposal_boxes".
+            stage (int): the current stage
+            targets (list[Instances]): the ground truth instances
+
+        Returns:
+            list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes"
+        """
+        num_fg_samples, num_bg_samples = [], []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            match_quality_matrix = pairwise_iou(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            # proposal_labels are 0 or 1
+            matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix)
+            if len(targets_per_image) > 0:
+                gt_classes = targets_per_image.gt_classes[matched_idxs]
+                # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
+                gt_classes[proposal_labels == 0] = self.num_classes
+                gt_boxes = targets_per_image.gt_boxes[matched_idxs]
+            else:
+                gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
+                gt_boxes = Boxes(
+                    targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4))
+                )
+            proposals_per_image.gt_classes = gt_classes
+            proposals_per_image.gt_boxes = gt_boxes
+
+            num_fg_samples.append((proposal_labels == 1).sum().item())
+            num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])
+
+        # Log the number of fg/bg samples in each stage
+        storage = get_event_storage()
+        storage.put_scalar(
+            "stage{}/roi_head/num_fg_samples".format(stage),
+            sum(num_fg_samples) / len(num_fg_samples),
+        )
+        storage.put_scalar(
+            "stage{}/roi_head/num_bg_samples".format(stage),
+            sum(num_bg_samples) / len(num_bg_samples),
+        )
+        return proposals
+
+    def _run_stage(self, features, proposals, stage):
+        """
+        Args:
+            features (list[Tensor]): #lvl input features to ROIHeads
+            proposals (list[Instances]): #image Instances, with the field "proposal_boxes"
+            stage (int): the current stage
+
+        Returns:
+            Same output as `FastRCNNOutputLayers.forward()`.
+        """
+        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
+        # The original implementation averages the losses among heads,
+        # but scale up the parameter gradients of the heads.
+        # This is equivalent to adding the losses among heads,
+        # but scale down the gradients on features.
+        if self.training:
+            box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages)
+        box_features = self.box_head[stage](box_features)
+        return self.box_predictor[stage](box_features)
+
+    def _create_proposals_from_boxes(self, boxes, image_sizes):
+        """
+        Args:
+            boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4
+            image_sizes (list[tuple]): list of image shapes in (h, w)
+
+        Returns:
+            list[Instances]: per-image proposals with the given boxes.
+        """
+        # Just like RPN, the proposals should not have gradients
+        boxes = [Boxes(b.detach()) for b in boxes]
+        proposals = []
+        for boxes_per_image, image_size in zip(boxes, image_sizes):
+            boxes_per_image.clip(image_size)
+            if self.training:
+                # do not filter empty boxes at inference time,
+                # because the scores from each stage need to be aligned and added later
+                boxes_per_image = boxes_per_image[boxes_per_image.nonempty()]
+            prop = Instances(image_size)
+            prop.proposal_boxes = boxes_per_image
+            proposals.append(prop)
+        return proposals
diff --git a/detectron2/modeling/roi_heads/fast_rcnn.py b/detectron2/modeling/roi_heads/fast_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..039e2490fae27d6e837b57492a230bc556da845f
--- /dev/null
+++ b/detectron2/modeling/roi_heads/fast_rcnn.py
@@ -0,0 +1,569 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from typing import Callable, Dict, List, Optional, Tuple, Union
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import configurable
+from detectron2.data.detection_utils import get_fed_loss_cls_weights
+from detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple
+from detectron2.modeling.box_regression import Box2BoxTransform, _dense_box_regression_loss
+from detectron2.structures import Boxes, Instances
+from detectron2.utils.events import get_event_storage
+
+__all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers"]
+
+
+logger = logging.getLogger(__name__)
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    R: number of ROIs, combined over all images, in the minibatch
+    Ri: number of ROIs in image i
+    K: number of foreground classes. E.g.,there are 80 foreground classes in COCO.
+
+Naming convention:
+
+    deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
+    transform (see :class:`box_regression.Box2BoxTransform`).
+
+    pred_class_logits: predicted class scores in [-inf, +inf]; use
+        softmax(pred_class_logits) to estimate P(class).
+
+    gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
+        foreground object classes and K represents the background class.
+
+    pred_proposal_deltas: predicted box2box transform deltas for transforming proposals
+        to detection box predictions.
+
+    gt_proposal_deltas: ground-truth box2box transform deltas
+"""
+
+
+def fast_rcnn_inference(
+    boxes: List[torch.Tensor],
+    scores: List[torch.Tensor],
+    image_shapes: List[Tuple[int, int]],
+    score_thresh: float,
+    nms_thresh: float,
+    topk_per_image: int,
+):
+    """
+    Call `fast_rcnn_inference_single_image` for all images.
+
+    Args:
+        boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
+            boxes for each image. Element i has shape (Ri, K * 4) if doing
+            class-specific regression, or (Ri, 4) if doing class-agnostic
+            regression, where Ri is the number of predicted objects for image i.
+            This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`.
+        scores (list[Tensor]): A list of Tensors of predicted class scores for each image.
+            Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
+            for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`.
+        image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch.
+        score_thresh (float): Only return detections with a confidence score exceeding this
+            threshold.
+        nms_thresh (float):  The threshold to use for box non-maximum suppression. Value in [0, 1].
+        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
+            all detections.
+
+    Returns:
+        instances: (list[Instances]): A list of N instances, one for each image in the batch,
+            that stores the topk most confidence detections.
+        kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates
+            the corresponding boxes/scores index in [0, Ri) from the input, for image i.
+    """
+    result_per_image = [
+        fast_rcnn_inference_single_image(
+            boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image
+        )
+        for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
+    ]
+    return [x[0] for x in result_per_image], [x[1] for x in result_per_image]
+
+
+def _log_classification_stats(pred_logits, gt_classes, prefix="fast_rcnn"):
+    """
+    Log the classification metrics to EventStorage.
+
+    Args:
+        pred_logits: Rx(K+1) logits. The last column is for background class.
+        gt_classes: R labels
+    """
+    num_instances = gt_classes.numel()
+    if num_instances == 0:
+        return
+    pred_classes = pred_logits.argmax(dim=1)
+    bg_class_ind = pred_logits.shape[1] - 1
+
+    fg_inds = (gt_classes >= 0) & (gt_classes < bg_class_ind)
+    num_fg = fg_inds.nonzero().numel()
+    fg_gt_classes = gt_classes[fg_inds]
+    fg_pred_classes = pred_classes[fg_inds]
+
+    num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel()
+    num_accurate = (pred_classes == gt_classes).nonzero().numel()
+    fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()
+
+    storage = get_event_storage()
+    storage.put_scalar(f"{prefix}/cls_accuracy", num_accurate / num_instances)
+    if num_fg > 0:
+        storage.put_scalar(f"{prefix}/fg_cls_accuracy", fg_num_accurate / num_fg)
+        storage.put_scalar(f"{prefix}/false_negative", num_false_negative / num_fg)
+
+
+def fast_rcnn_inference_single_image(
+    boxes,
+    scores,
+    image_shape: Tuple[int, int],
+    score_thresh: float,
+    nms_thresh: float,
+    topk_per_image: int,
+):
+    """
+    Single-image inference. Return bounding-box detection results by thresholding
+    on scores and applying non-maximum suppression (NMS).
+
+    Args:
+        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
+        per image.
+
+    Returns:
+        Same as `fast_rcnn_inference`, but for only one image.
+    """
+    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
+    if not valid_mask.all():
+        boxes = boxes[valid_mask]
+        scores = scores[valid_mask]
+
+    scores = scores[:, :-1]
+    num_bbox_reg_classes = boxes.shape[1] // 4
+    # Convert to Boxes to use the `clip` function ...
+    boxes = Boxes(boxes.reshape(-1, 4))
+    boxes.clip(image_shape)
+    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4
+
+    # 1. Filter results based on detection scores. It can make NMS more efficient
+    #    by filtering out low-confidence detections.
+    filter_mask = scores > score_thresh  # R x K
+    # R' x 2. First column contains indices of the R predictions;
+    # Second column contains indices of classes.
+    filter_inds = filter_mask.nonzero()
+    if num_bbox_reg_classes == 1:
+        boxes = boxes[filter_inds[:, 0], 0]
+    else:
+        boxes = boxes[filter_mask]
+    scores = scores[filter_mask]
+
+    # 2. Apply NMS for each class independently.
+    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
+    if topk_per_image >= 0:
+        keep = keep[:topk_per_image]
+    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]
+
+    result = Instances(image_shape)
+    result.pred_boxes = Boxes(boxes)
+    result.scores = scores
+    result.pred_classes = filter_inds[:, 1]
+    return result, filter_inds[:, 0]
+
+
+class FastRCNNOutputLayers(nn.Module):
+    """
+    Two linear layers for predicting Fast R-CNN outputs:
+
+    1. proposal-to-detection box regression deltas
+    2. classification scores
+    """
+
+    @configurable
+    def __init__(
+        self,
+        input_shape: ShapeSpec,
+        *,
+        box2box_transform,
+        num_classes: int,
+        test_score_thresh: float = 0.0,
+        test_nms_thresh: float = 0.5,
+        test_topk_per_image: int = 100,
+        cls_agnostic_bbox_reg: bool = False,
+        smooth_l1_beta: float = 0.0,
+        box_reg_loss_type: str = "smooth_l1",
+        loss_weight: Union[float, Dict[str, float]] = 1.0,
+        use_fed_loss: bool = False,
+        use_sigmoid_ce: bool = False,
+        get_fed_loss_cls_weights: Optional[Callable] = None,
+        fed_loss_num_classes: int = 50,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape (ShapeSpec): shape of the input feature to this module
+            box2box_transform (Box2BoxTransform or Box2BoxTransformRotated):
+            num_classes (int): number of foreground classes
+            test_score_thresh (float): threshold to filter predictions results.
+            test_nms_thresh (float): NMS threshold for prediction results.
+            test_topk_per_image (int): number of top predictions to produce per image.
+            cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression
+            smooth_l1_beta (float): transition point from L1 to L2 loss. Only used if
+                `box_reg_loss_type` is "smooth_l1"
+            box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou",
+                "diou", "ciou"
+            loss_weight (float|dict): weights to use for losses. Can be single float for weighting
+                all losses, or a dict of individual weightings. Valid dict keys are:
+                    * "loss_cls": applied to classification loss
+                    * "loss_box_reg": applied to box regression loss
+            use_fed_loss (bool): whether to use federated loss which samples additional negative
+                classes to calculate the loss
+            use_sigmoid_ce (bool): whether to calculate the loss using weighted average of binary
+                cross entropy with logits. This could be used together with federated loss
+            get_fed_loss_cls_weights (Callable): a callable which takes dataset name and frequency
+                weight power, and returns the probabilities to sample negative classes for
+                federated loss. The implementation can be found in
+                detectron2/data/detection_utils.py
+            fed_loss_num_classes (int): number of federated classes to keep in total
+        """
+        super().__init__()
+        if isinstance(input_shape, int):  # some backward compatibility
+            input_shape = ShapeSpec(channels=input_shape)
+        self.num_classes = num_classes
+        input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
+        # prediction layer for num_classes foreground classes and one background class (hence + 1)
+        self.cls_score = nn.Linear(input_size, num_classes + 1)
+        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
+        box_dim = len(box2box_transform.weights)
+        self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)
+
+        nn.init.normal_(self.cls_score.weight, std=0.01)
+        nn.init.normal_(self.bbox_pred.weight, std=0.001)
+        for l in [self.cls_score, self.bbox_pred]:
+            nn.init.constant_(l.bias, 0)
+
+        self.box2box_transform = box2box_transform
+        self.smooth_l1_beta = smooth_l1_beta
+        self.test_score_thresh = test_score_thresh
+        self.test_nms_thresh = test_nms_thresh
+        self.test_topk_per_image = test_topk_per_image
+        self.box_reg_loss_type = box_reg_loss_type
+        if isinstance(loss_weight, float):
+            loss_weight = {"loss_cls": loss_weight, "loss_box_reg": loss_weight}
+        self.loss_weight = loss_weight
+        self.use_fed_loss = use_fed_loss
+        self.use_sigmoid_ce = use_sigmoid_ce
+        self.fed_loss_num_classes = fed_loss_num_classes
+
+        if self.use_fed_loss:
+            assert self.use_sigmoid_ce, "Please use sigmoid cross entropy loss with federated loss"
+            fed_loss_cls_weights = get_fed_loss_cls_weights()
+            assert (
+                len(fed_loss_cls_weights) == self.num_classes
+            ), "Please check the provided fed_loss_cls_weights. Their size should match num_classes"
+            self.register_buffer("fed_loss_cls_weights", fed_loss_cls_weights)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {
+            "input_shape": input_shape,
+            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS),
+            # fmt: off
+            "num_classes"               : cfg.MODEL.ROI_HEADS.NUM_CLASSES,
+            "cls_agnostic_bbox_reg"     : cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,
+            "smooth_l1_beta"            : cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA,
+            "test_score_thresh"         : cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST,
+            "test_nms_thresh"           : cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
+            "test_topk_per_image"       : cfg.TEST.DETECTIONS_PER_IMAGE,
+            "box_reg_loss_type"         : cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE,
+            "loss_weight"               : {"loss_box_reg": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT},  # noqa
+            "use_fed_loss"              : cfg.MODEL.ROI_BOX_HEAD.USE_FED_LOSS,
+            "use_sigmoid_ce"            : cfg.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE,
+            "get_fed_loss_cls_weights"  : lambda: get_fed_loss_cls_weights(dataset_names=cfg.DATASETS.TRAIN, freq_weight_power=cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER),  # noqa
+            "fed_loss_num_classes"      : cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES,
+            # fmt: on
+        }
+
+    def forward(self, x):
+        """
+        Args:
+            x: per-region features of shape (N, ...) for N bounding boxes to predict.
+
+        Returns:
+            (Tensor, Tensor):
+            First tensor: shape (N,K+1), scores for each of the N box. Each row contains the
+            scores for K object categories and 1 background class.
+
+            Second tensor: bounding box regression deltas for each box. Shape is shape (N,Kx4),
+            or (N,4) for class-agnostic regression.
+        """
+        if x.dim() > 2:
+            x = torch.flatten(x, start_dim=1)
+        scores = self.cls_score(x)
+        proposal_deltas = self.bbox_pred(x)
+        return scores, proposal_deltas
+
+    def losses(self, predictions, proposals):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were used
+                to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
+                ``gt_classes`` are expected.
+
+        Returns:
+            Dict[str, Tensor]: dict of losses
+        """
+        scores, proposal_deltas = predictions
+
+        # parse classification outputs
+        gt_classes = (
+            cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)
+        )
+        _log_classification_stats(scores, gt_classes)
+
+        # parse box regression outputs
+        if len(proposals):
+            proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
+            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
+            # If "gt_boxes" does not exist, the proposals must be all negative and
+            # should not be included in regression loss computation.
+            # Here we just use proposal_boxes as an arbitrary placeholder because its
+            # value won't be used in self.box_reg_loss().
+            gt_boxes = cat(
+                [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
+                dim=0,
+            )
+        else:
+            proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)
+
+        if self.use_sigmoid_ce:
+            loss_cls = self.sigmoid_cross_entropy_loss(scores, gt_classes)
+        else:
+            loss_cls = cross_entropy(scores, gt_classes, reduction="mean")
+
+        losses = {
+            "loss_cls": loss_cls,
+            "loss_box_reg": self.box_reg_loss(
+                proposal_boxes, gt_boxes, proposal_deltas, gt_classes
+            ),
+        }
+        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
+
+    # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/fed_loss.py  # noqa
+    # with slight modifications
+    def get_fed_loss_classes(self, gt_classes, num_fed_loss_classes, num_classes, weight):
+        """
+        Args:
+            gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
+            num_fed_loss_classes: minimum number of classes to keep when calculating federated loss.
+            Will sample negative classes if number of unique gt_classes is smaller than this value.
+            num_classes: number of foreground classes
+            weight: probabilities used to sample negative classes
+
+        Returns:
+            Tensor:
+                classes to keep when calculating the federated loss, including both unique gt
+                classes and sampled negative classes.
+        """
+        unique_gt_classes = torch.unique(gt_classes)
+        prob = unique_gt_classes.new_ones(num_classes + 1).float()
+        prob[-1] = 0
+        if len(unique_gt_classes) < num_fed_loss_classes:
+            prob[:num_classes] = weight.float().clone()
+            prob[unique_gt_classes] = 0
+            sampled_negative_classes = torch.multinomial(
+                prob, num_fed_loss_classes - len(unique_gt_classes), replacement=False
+            )
+            fed_loss_classes = torch.cat([unique_gt_classes, sampled_negative_classes])
+        else:
+            fed_loss_classes = unique_gt_classes
+        return fed_loss_classes
+
+    # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py#L113  # noqa
+    # with slight modifications
+    def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes):
+        """
+        Args:
+            pred_class_logits: shape (N, K+1), scores for each of the N box. Each row contains the
+            scores for K object categories and 1 background class
+            gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
+        """
+        if pred_class_logits.numel() == 0:
+            return pred_class_logits.new_zeros([1])[0]
+
+        N = pred_class_logits.shape[0]
+        K = pred_class_logits.shape[1] - 1
+
+        target = pred_class_logits.new_zeros(N, K + 1)
+        target[range(len(gt_classes)), gt_classes] = 1
+        target = target[:, :K]
+
+        cls_loss = F.binary_cross_entropy_with_logits(
+            pred_class_logits[:, :-1], target, reduction="none"
+        )
+
+        if self.use_fed_loss:
+            fed_loss_classes = self.get_fed_loss_classes(
+                gt_classes,
+                num_fed_loss_classes=self.fed_loss_num_classes,
+                num_classes=K,
+                weight=self.fed_loss_cls_weights,
+            )
+            fed_loss_classes_mask = fed_loss_classes.new_zeros(K + 1)
+            fed_loss_classes_mask[fed_loss_classes] = 1
+            fed_loss_classes_mask = fed_loss_classes_mask[:K]
+            weight = fed_loss_classes_mask.view(1, K).expand(N, K).float()
+        else:
+            weight = 1
+
+        loss = torch.sum(cls_loss * weight) / N
+        return loss
+
+    def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
+        """
+        Args:
+            proposal_boxes/gt_boxes are tensors with the same shape (R, 4 or 5).
+            pred_deltas has shape (R, 4 or 5), or (R, num_classes * (4 or 5)).
+            gt_classes is a long tensor of shape R, the gt class label of each proposal.
+            R shall be the number of proposals.
+        """
+        box_dim = proposal_boxes.shape[1]  # 4 or 5
+        # Regression loss is only computed for foreground proposals (those matched to a GT)
+        fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
+        if pred_deltas.shape[1] == box_dim:  # cls-agnostic regression
+            fg_pred_deltas = pred_deltas[fg_inds]
+        else:
+            fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
+                fg_inds, gt_classes[fg_inds]
+            ]
+
+        loss_box_reg = _dense_box_regression_loss(
+            [proposal_boxes[fg_inds]],
+            self.box2box_transform,
+            [fg_pred_deltas.unsqueeze(0)],
+            [gt_boxes[fg_inds]],
+            ...,
+            self.box_reg_loss_type,
+            self.smooth_l1_beta,
+        )
+
+        # The reg loss is normalized using the total number of regions (R), not the number
+        # of foreground regions even though the box regression loss is only defined on
+        # foreground regions. Why? Because doing so gives equal training influence to
+        # each foreground example. To see how, consider two different minibatches:
+        #  (1) Contains a single foreground region
+        #  (2) Contains 100 foreground regions
+        # If we normalize by the number of foreground regions, the single example in
+        # minibatch (1) will be given 100 times as much influence as each foreground
+        # example in minibatch (2). Normalizing by the total number of regions, R,
+        # means that the single example in minibatch (1) and each of the 100 examples
+        # in minibatch (2) are given equal influence.
+        return loss_box_reg / max(gt_classes.numel(), 1.0)  # return 0 if empty
+
+    def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were
+                used to compute predictions. The ``proposal_boxes`` field is expected.
+
+        Returns:
+            list[Instances]: same as `fast_rcnn_inference`.
+            list[Tensor]: same as `fast_rcnn_inference`.
+        """
+        boxes = self.predict_boxes(predictions, proposals)
+        scores = self.predict_probs(predictions, proposals)
+        image_shapes = [x.image_size for x in proposals]
+        return fast_rcnn_inference(
+            boxes,
+            scores,
+            image_shapes,
+            self.test_score_thresh,
+            self.test_nms_thresh,
+            self.test_topk_per_image,
+        )
+
+    def predict_boxes_for_gt_classes(self, predictions, proposals):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were used
+                to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected.
+
+        Returns:
+            list[Tensor]:
+                A list of Tensors of predicted boxes for GT classes in case of
+                class-specific box head. Element i of the list has shape (Ri, B), where Ri is
+                the number of proposals for image i and B is the box dimension (4 or 5)
+        """
+        if not len(proposals):
+            return []
+        scores, proposal_deltas = predictions
+        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
+        N, B = proposal_boxes.shape
+        predict_boxes = self.box2box_transform.apply_deltas(
+            proposal_deltas, proposal_boxes
+        )  # Nx(KxB)
+
+        K = predict_boxes.shape[1] // B
+        if K > 1:
+            gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0)
+            # Some proposals are ignored or have a background class. Their gt_classes
+            # cannot be used as index.
+            gt_classes = gt_classes.clamp_(0, K - 1)
+
+            predict_boxes = predict_boxes.view(N, K, B)[
+                torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes
+            ]
+        num_prop_per_image = [len(p) for p in proposals]
+        return predict_boxes.split(num_prop_per_image)
+
+    def predict_boxes(
+        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
+    ):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were
+                used to compute predictions. The ``proposal_boxes`` field is expected.
+
+        Returns:
+            list[Tensor]:
+                A list of Tensors of predicted class-specific or class-agnostic boxes
+                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
+                the number of proposals for image i and B is the box dimension (4 or 5)
+        """
+        if not len(proposals):
+            return []
+        _, proposal_deltas = predictions
+        num_prop_per_image = [len(p) for p in proposals]
+        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
+        predict_boxes = self.box2box_transform.apply_deltas(
+            proposal_deltas,
+            proposal_boxes,
+        )  # Nx(KxB)
+        return predict_boxes.split(num_prop_per_image)
+
+    def predict_probs(
+        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
+    ):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were
+                used to compute predictions.
+
+        Returns:
+            list[Tensor]:
+                A list of Tensors of predicted class probabilities for each image.
+                Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i.
+        """
+        scores, _ = predictions
+        num_inst_per_image = [len(p) for p in proposals]
+        if self.use_sigmoid_ce:
+            probs = scores.sigmoid()
+        else:
+            probs = F.softmax(scores, dim=-1)
+        return probs.split(num_inst_per_image, dim=0)
diff --git a/detectron2/modeling/roi_heads/keypoint_head.py b/detectron2/modeling/roi_heads/keypoint_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0acc138e72fcb188e4ffb3d156358b8ca59babf
--- /dev/null
+++ b/detectron2/modeling/roi_heads/keypoint_head.py
@@ -0,0 +1,272 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ConvTranspose2d, cat, interpolate
+from detectron2.structures import Instances, heatmaps_to_keypoints
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+_TOTAL_SKIPPED = 0
+
+
+__all__ = [
+    "ROI_KEYPOINT_HEAD_REGISTRY",
+    "build_keypoint_head",
+    "BaseKeypointRCNNHead",
+    "KRCNNConvDeconvUpsampleHead",
+]
+
+
+ROI_KEYPOINT_HEAD_REGISTRY = Registry("ROI_KEYPOINT_HEAD")
+ROI_KEYPOINT_HEAD_REGISTRY.__doc__ = """
+Registry for keypoint heads, which make keypoint predictions from per-region features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+def build_keypoint_head(cfg, input_shape):
+    """
+    Build a keypoint head from `cfg.MODEL.ROI_KEYPOINT_HEAD.NAME`.
+    """
+    name = cfg.MODEL.ROI_KEYPOINT_HEAD.NAME
+    return ROI_KEYPOINT_HEAD_REGISTRY.get(name)(cfg, input_shape)
+
+
+def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer):
+    """
+    Arguments:
+        pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number
+            of instances in the batch, K is the number of keypoints, and S is the side length
+            of the keypoint heatmap. The values are spatial logits.
+        instances (list[Instances]): A list of M Instances, where M is the batch size.
+            These instances are predictions from the model
+            that are in 1:1 correspondence with pred_keypoint_logits.
+            Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoint`
+            instance.
+        normalizer (float): Normalize the loss by this amount.
+            If not specified, we normalize by the number of visible keypoints in the minibatch.
+
+    Returns a scalar tensor containing the loss.
+    """
+    heatmaps = []
+    valid = []
+
+    keypoint_side_len = pred_keypoint_logits.shape[2]
+    for instances_per_image in instances:
+        if len(instances_per_image) == 0:
+            continue
+        keypoints = instances_per_image.gt_keypoints
+        heatmaps_per_image, valid_per_image = keypoints.to_heatmap(
+            instances_per_image.proposal_boxes.tensor, keypoint_side_len
+        )
+        heatmaps.append(heatmaps_per_image.view(-1))
+        valid.append(valid_per_image.view(-1))
+
+    if len(heatmaps):
+        keypoint_targets = cat(heatmaps, dim=0)
+        valid = cat(valid, dim=0).to(dtype=torch.uint8)
+        valid = torch.nonzero(valid).squeeze(1)
+
+    # torch.mean (in binary_cross_entropy_with_logits) doesn't
+    # accept empty tensors, so handle it separately
+    if len(heatmaps) == 0 or valid.numel() == 0:
+        global _TOTAL_SKIPPED
+        _TOTAL_SKIPPED += 1
+        storage = get_event_storage()
+        storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False)
+        return pred_keypoint_logits.sum() * 0
+
+    N, K, H, W = pred_keypoint_logits.shape
+    pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W)
+
+    keypoint_loss = F.cross_entropy(
+        pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum"
+    )
+
+    # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch
+    if normalizer is None:
+        normalizer = valid.numel()
+    keypoint_loss /= normalizer
+
+    return keypoint_loss
+
+
+def keypoint_rcnn_inference(pred_keypoint_logits: torch.Tensor, pred_instances: List[Instances]):
+    """
+    Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score)
+        and add it to the `pred_instances` as a `pred_keypoints` field.
+
+    Args:
+        pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number
+           of instances in the batch, K is the number of keypoints, and S is the side length of
+           the keypoint heatmap. The values are spatial logits.
+        pred_instances (list[Instances]): A list of N Instances, where N is the number of images.
+
+    Returns:
+        None. Each element in pred_instances will contain extra "pred_keypoints" and
+            "pred_keypoint_heatmaps" fields. "pred_keypoints" is a tensor of shape
+            (#instance, K, 3) where the last dimension corresponds to (x, y, score).
+            The scores are larger than 0. "pred_keypoint_heatmaps" contains the raw
+            keypoint logits as passed to this function.
+    """
+    # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor)
+    bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0)
+
+    pred_keypoint_logits = pred_keypoint_logits.detach()
+    keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits, bboxes_flat.detach())
+    num_instances_per_image = [len(i) for i in pred_instances]
+    keypoint_results = keypoint_results[:, :, [0, 1, 3]].split(num_instances_per_image, dim=0)
+    heatmap_results = pred_keypoint_logits.split(num_instances_per_image, dim=0)
+
+    for keypoint_results_per_image, heatmap_results_per_image, instances_per_image in zip(
+        keypoint_results, heatmap_results, pred_instances
+    ):
+        # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score)
+        # heatmap_results_per_image is (num instances)x(num keypoints)x(side)x(side)
+        instances_per_image.pred_keypoints = keypoint_results_per_image
+        instances_per_image.pred_keypoint_heatmaps = heatmap_results_per_image
+
+
+class BaseKeypointRCNNHead(nn.Module):
+    """
+    Implement the basic Keypoint R-CNN losses and inference logic described in
+    Sec. 5 of :paper:`Mask R-CNN`.
+    """
+
+    @configurable
+    def __init__(self, *, num_keypoints, loss_weight=1.0, loss_normalizer=1.0):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            num_keypoints (int): number of keypoints to predict
+            loss_weight (float): weight to multiple on the keypoint loss
+            loss_normalizer (float or str):
+                If float, divide the loss by `loss_normalizer * #images`.
+                If 'visible', the loss is normalized by the total number of
+                visible keypoints across images.
+        """
+        super().__init__()
+        self.num_keypoints = num_keypoints
+        self.loss_weight = loss_weight
+        assert loss_normalizer == "visible" or isinstance(loss_normalizer, float), loss_normalizer
+        self.loss_normalizer = loss_normalizer
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = {
+            "loss_weight": cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT,
+            "num_keypoints": cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS,
+        }
+        normalize_by_visible = (
+            cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS
+        )  # noqa
+        if not normalize_by_visible:
+            batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE
+            positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
+            ret["loss_normalizer"] = (
+                ret["num_keypoints"] * batch_size_per_image * positive_sample_fraction
+            )
+        else:
+            ret["loss_normalizer"] = "visible"
+        return ret
+
+    def forward(self, x, instances: List[Instances]):
+        """
+        Args:
+            x: input 4D region feature(s) provided by :class:`ROIHeads`.
+            instances (list[Instances]): contains the boxes & labels corresponding
+                to the input features.
+                Exact format is up to its caller to decide.
+                Typically, this is the foreground instances in training, with
+                "proposal_boxes" field and other gt annotations.
+                In inference, it contains boxes that are already predicted.
+
+        Returns:
+            A dict of losses if in training. The predicted "instances" if in inference.
+        """
+        x = self.layers(x)
+        if self.training:
+            num_images = len(instances)
+            normalizer = (
+                None if self.loss_normalizer == "visible" else num_images * self.loss_normalizer
+            )
+            return {
+                "loss_keypoint": keypoint_rcnn_loss(x, instances, normalizer=normalizer)
+                * self.loss_weight
+            }
+        else:
+            keypoint_rcnn_inference(x, instances)
+            return instances
+
+    def layers(self, x):
+        """
+        Neural network layers that makes predictions from regional input features.
+        """
+        raise NotImplementedError
+
+
+# To get torchscript support, we make the head a subclass of `nn.Sequential`.
+# Therefore, to add new layers in this head class, please make sure they are
+# added in the order they will be used in forward().
+@ROI_KEYPOINT_HEAD_REGISTRY.register()
+class KRCNNConvDeconvUpsampleHead(BaseKeypointRCNNHead, nn.Sequential):
+    """
+    A standard keypoint head containing a series of 3x3 convs, followed by
+    a transpose convolution and bilinear interpolation for upsampling.
+    It is described in Sec. 5 of :paper:`Mask R-CNN`.
+    """
+
+    @configurable
+    def __init__(self, input_shape, *, num_keypoints, conv_dims, **kwargs):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape (ShapeSpec): shape of the input feature
+            conv_dims: an iterable of output channel counts for each conv in the head
+                         e.g. (512, 512, 512) for three convs outputting 512 channels.
+        """
+        super().__init__(num_keypoints=num_keypoints, **kwargs)
+
+        # default up_scale to 2.0 (this can be made an option)
+        up_scale = 2.0
+        in_channels = input_shape.channels
+
+        for idx, layer_channels in enumerate(conv_dims, 1):
+            module = Conv2d(in_channels, layer_channels, 3, stride=1, padding=1)
+            self.add_module("conv_fcn{}".format(idx), module)
+            self.add_module("conv_fcn_relu{}".format(idx), nn.ReLU())
+            in_channels = layer_channels
+
+        deconv_kernel = 4
+        self.score_lowres = ConvTranspose2d(
+            in_channels, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1
+        )
+        self.up_scale = up_scale
+
+        for name, param in self.named_parameters():
+            if "bias" in name:
+                nn.init.constant_(param, 0)
+            elif "weight" in name:
+                # Caffe2 implementation uses MSRAFill, which in fact
+                # corresponds to kaiming_normal_ in PyTorch
+                nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg, input_shape)
+        ret["input_shape"] = input_shape
+        ret["conv_dims"] = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS
+        return ret
+
+    def layers(self, x):
+        for layer in self:
+            x = layer(x)
+        x = interpolate(x, scale_factor=self.up_scale, mode="bilinear", align_corners=False)
+        return x
diff --git a/detectron2/modeling/roi_heads/mask_head.py b/detectron2/modeling/roi_heads/mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eff8f7916111546f9413cb6004cadcea01ba950
--- /dev/null
+++ b/detectron2/modeling/roi_heads/mask_head.py
@@ -0,0 +1,298 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import List
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ConvTranspose2d, ShapeSpec, cat, get_norm
+from detectron2.layers.wrappers import move_device_like
+from detectron2.structures import Instances
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+__all__ = [
+    "BaseMaskRCNNHead",
+    "MaskRCNNConvUpsampleHead",
+    "build_mask_head",
+    "ROI_MASK_HEAD_REGISTRY",
+]
+
+
+ROI_MASK_HEAD_REGISTRY = Registry("ROI_MASK_HEAD")
+ROI_MASK_HEAD_REGISTRY.__doc__ = """
+Registry for mask heads, which predicts instance masks given
+per-region features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+@torch.jit.unused
+def mask_rcnn_loss(pred_mask_logits: torch.Tensor, instances: List[Instances], vis_period: int = 0):
+    """
+    Compute the mask prediction loss defined in the Mask R-CNN paper.
+
+    Args:
+        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
+            for class-specific or class-agnostic, where B is the total number of predicted masks
+            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
+            and width of the mask predictions. The values are logits.
+        instances (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. These instances are in 1:1
+            correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask,
+            ...) associated with each instance are stored in fields.
+        vis_period (int): the period (in steps) to dump visualization.
+
+    Returns:
+        mask_loss (Tensor): A scalar tensor containing the loss.
+    """
+    cls_agnostic_mask = pred_mask_logits.size(1) == 1
+    total_num_masks = pred_mask_logits.size(0)
+    mask_side_len = pred_mask_logits.size(2)
+    assert pred_mask_logits.size(2) == pred_mask_logits.size(3), "Mask prediction must be square!"
+
+    gt_classes = []
+    gt_masks = []
+    for instances_per_image in instances:
+        if len(instances_per_image) == 0:
+            continue
+        if not cls_agnostic_mask:
+            gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
+            gt_classes.append(gt_classes_per_image)
+
+        gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
+            instances_per_image.proposal_boxes.tensor, mask_side_len
+        ).to(device=pred_mask_logits.device)
+        # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
+        gt_masks.append(gt_masks_per_image)
+
+    if len(gt_masks) == 0:
+        return pred_mask_logits.sum() * 0
+
+    gt_masks = cat(gt_masks, dim=0)
+
+    if cls_agnostic_mask:
+        pred_mask_logits = pred_mask_logits[:, 0]
+    else:
+        indices = torch.arange(total_num_masks)
+        gt_classes = cat(gt_classes, dim=0)
+        pred_mask_logits = pred_mask_logits[indices, gt_classes]
+
+    if gt_masks.dtype == torch.bool:
+        gt_masks_bool = gt_masks
+    else:
+        # Here we allow gt_masks to be float as well (depend on the implementation of rasterize())
+        gt_masks_bool = gt_masks > 0.5
+    gt_masks = gt_masks.to(dtype=torch.float32)
+
+    # Log the training accuracy (using gt classes and sigmoid(0.0) == 0.5 threshold)
+    mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
+    mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0))
+    num_positive = gt_masks_bool.sum().item()
+    false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
+        gt_masks_bool.numel() - num_positive, 1.0
+    )
+    false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(num_positive, 1.0)
+
+    storage = get_event_storage()
+    storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
+    storage.put_scalar("mask_rcnn/false_positive", false_positive)
+    storage.put_scalar("mask_rcnn/false_negative", false_negative)
+    if vis_period > 0 and storage.iter % vis_period == 0:
+        pred_masks = pred_mask_logits.sigmoid()
+        vis_masks = torch.cat([pred_masks, gt_masks], axis=2)
+        name = "Left: mask prediction;   Right: mask GT"
+        for idx, vis_mask in enumerate(vis_masks):
+            vis_mask = torch.stack([vis_mask] * 3, axis=0)
+            storage.put_image(name + f" ({idx})", vis_mask)
+
+    mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits, gt_masks, reduction="mean")
+    return mask_loss
+
+
+def mask_rcnn_inference(pred_mask_logits: torch.Tensor, pred_instances: List[Instances]):
+    """
+    Convert pred_mask_logits to estimated foreground probability masks while also
+    extracting only the masks for the predicted classes in pred_instances. For each
+    predicted box, the mask of the same class is attached to the instance by adding a
+    new "pred_masks" field to pred_instances.
+
+    Args:
+        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
+            for class-specific or class-agnostic, where B is the total number of predicted masks
+            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
+            and width of the mask predictions. The values are logits.
+        pred_instances (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. Each Instances must have field "pred_classes".
+
+    Returns:
+        None. pred_instances will contain an extra "pred_masks" field storing a mask of size (Hmask,
+            Wmask) for predicted class. Note that the masks are returned as a soft (non-quantized)
+            masks the resolution predicted by the network; post-processing steps, such as resizing
+            the predicted masks to the original image resolution and/or binarizing them, is left
+            to the caller.
+    """
+    cls_agnostic_mask = pred_mask_logits.size(1) == 1
+
+    if cls_agnostic_mask:
+        mask_probs_pred = pred_mask_logits.sigmoid()
+    else:
+        # Select masks corresponding to the predicted classes
+        num_masks = pred_mask_logits.shape[0]
+        class_pred = cat([i.pred_classes for i in pred_instances])
+        device = (
+            class_pred.device
+            if torch.jit.is_scripting()
+            else ("cpu" if torch.jit.is_tracing() else class_pred.device)
+        )
+        indices = move_device_like(torch.arange(num_masks, device=device), class_pred)
+        mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid()
+    # mask_probs_pred.shape: (B, 1, Hmask, Wmask)
+
+    num_boxes_per_image = [len(i) for i in pred_instances]
+    mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0)
+
+    for prob, instances in zip(mask_probs_pred, pred_instances):
+        instances.pred_masks = prob  # (1, Hmask, Wmask)
+
+
+class BaseMaskRCNNHead(nn.Module):
+    """
+    Implement the basic Mask R-CNN losses and inference logic described in :paper:`Mask R-CNN`
+    """
+
+    @configurable
+    def __init__(self, *, loss_weight: float = 1.0, vis_period: int = 0):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            loss_weight (float): multiplier of the loss
+            vis_period (int): visualization period
+        """
+        super().__init__()
+        self.vis_period = vis_period
+        self.loss_weight = loss_weight
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {"vis_period": cfg.VIS_PERIOD}
+
+    def forward(self, x, instances: List[Instances]):
+        """
+        Args:
+            x: input region feature(s) provided by :class:`ROIHeads`.
+            instances (list[Instances]): contains the boxes & labels corresponding
+                to the input features.
+                Exact format is up to its caller to decide.
+                Typically, this is the foreground instances in training, with
+                "proposal_boxes" field and other gt annotations.
+                In inference, it contains boxes that are already predicted.
+
+        Returns:
+            A dict of losses in training. The predicted "instances" in inference.
+        """
+        x = self.layers(x)
+        if self.training:
+            return {"loss_mask": mask_rcnn_loss(x, instances, self.vis_period) * self.loss_weight}
+        else:
+            mask_rcnn_inference(x, instances)
+            return instances
+
+    def layers(self, x):
+        """
+        Neural network layers that makes predictions from input features.
+        """
+        raise NotImplementedError
+
+
+# To get torchscript support, we make the head a subclass of `nn.Sequential`.
+# Therefore, to add new layers in this head class, please make sure they are
+# added in the order they will be used in forward().
+@ROI_MASK_HEAD_REGISTRY.register()
+class MaskRCNNConvUpsampleHead(BaseMaskRCNNHead, nn.Sequential):
+    """
+    A mask head with several conv layers, plus an upsample layer (with `ConvTranspose2d`).
+    Predictions are made with a final 1x1 conv layer.
+    """
+
+    @configurable
+    def __init__(self, input_shape: ShapeSpec, *, num_classes, conv_dims, conv_norm="", **kwargs):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape (ShapeSpec): shape of the input feature
+            num_classes (int): the number of foreground classes (i.e. background is not
+                included). 1 if using class agnostic prediction.
+            conv_dims (list[int]): a list of N>0 integers representing the output dimensions
+                of N-1 conv layers and the last upsample layer.
+            conv_norm (str or callable): normalization for the conv layers.
+                See :func:`detectron2.layers.get_norm` for supported types.
+        """
+        super().__init__(**kwargs)
+        assert len(conv_dims) >= 1, "conv_dims have to be non-empty!"
+
+        self.conv_norm_relus = []
+
+        cur_channels = input_shape.channels
+        for k, conv_dim in enumerate(conv_dims[:-1]):
+            conv = Conv2d(
+                cur_channels,
+                conv_dim,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=not conv_norm,
+                norm=get_norm(conv_norm, conv_dim),
+                activation=nn.ReLU(),
+            )
+            self.add_module("mask_fcn{}".format(k + 1), conv)
+            self.conv_norm_relus.append(conv)
+            cur_channels = conv_dim
+
+        self.deconv = ConvTranspose2d(
+            cur_channels, conv_dims[-1], kernel_size=2, stride=2, padding=0
+        )
+        self.add_module("deconv_relu", nn.ReLU())
+        cur_channels = conv_dims[-1]
+
+        self.predictor = Conv2d(cur_channels, num_classes, kernel_size=1, stride=1, padding=0)
+
+        for layer in self.conv_norm_relus + [self.deconv]:
+            weight_init.c2_msra_fill(layer)
+        # use normal distribution initialization for mask prediction layer
+        nn.init.normal_(self.predictor.weight, std=0.001)
+        if self.predictor.bias is not None:
+            nn.init.constant_(self.predictor.bias, 0)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg, input_shape)
+        conv_dim = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
+        num_conv = cfg.MODEL.ROI_MASK_HEAD.NUM_CONV
+        ret.update(
+            conv_dims=[conv_dim] * (num_conv + 1),  # +1 for ConvTranspose
+            conv_norm=cfg.MODEL.ROI_MASK_HEAD.NORM,
+            input_shape=input_shape,
+        )
+        if cfg.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK:
+            ret["num_classes"] = 1
+        else:
+            ret["num_classes"] = cfg.MODEL.ROI_HEADS.NUM_CLASSES
+        return ret
+
+    def layers(self, x):
+        for layer in self:
+            x = layer(x)
+        return x
+
+
+def build_mask_head(cfg, input_shape):
+    """
+    Build a mask head defined by `cfg.MODEL.ROI_MASK_HEAD.NAME`.
+    """
+    name = cfg.MODEL.ROI_MASK_HEAD.NAME
+    return ROI_MASK_HEAD_REGISTRY.get(name)(cfg, input_shape)
diff --git a/detectron2/modeling/roi_heads/roi_heads.py b/detectron2/modeling/roi_heads/roi_heads.py
new file mode 100644
index 0000000000000000000000000000000000000000..13dd57a0478917001841f6c6299f380e1198e63a
--- /dev/null
+++ b/detectron2/modeling/roi_heads/roi_heads.py
@@ -0,0 +1,877 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import inspect
+import logging
+import numpy as np
+from typing import Dict, List, Optional, Tuple
+import torch
+from torch import nn
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec, nonzero_tuple
+from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+from ..backbone.resnet import BottleneckBlock, ResNet
+from ..matcher import Matcher
+from ..poolers import ROIPooler
+from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals
+from ..sampling import subsample_labels
+from .box_head import build_box_head
+from .fast_rcnn import FastRCNNOutputLayers
+from .keypoint_head import build_keypoint_head
+from .mask_head import build_mask_head
+
+ROI_HEADS_REGISTRY = Registry("ROI_HEADS")
+ROI_HEADS_REGISTRY.__doc__ = """
+Registry for ROI heads in a generalized R-CNN model.
+ROIHeads take feature maps and region proposals, and
+perform per-region computation.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+The call is expected to return an :class:`ROIHeads`.
+"""
+
+logger = logging.getLogger(__name__)
+
+
+def build_roi_heads(cfg, input_shape):
+    """
+    Build ROIHeads defined by `cfg.MODEL.ROI_HEADS.NAME`.
+    """
+    name = cfg.MODEL.ROI_HEADS.NAME
+    return ROI_HEADS_REGISTRY.get(name)(cfg, input_shape)
+
+
+def select_foreground_proposals(
+    proposals: List[Instances], bg_label: int
+) -> Tuple[List[Instances], List[torch.Tensor]]:
+    """
+    Given a list of N Instances (for N images), each containing a `gt_classes` field,
+    return a list of Instances that contain only instances with `gt_classes != -1 &&
+    gt_classes != bg_label`.
+
+    Args:
+        proposals (list[Instances]): A list of N Instances, where N is the number of
+            images in the batch.
+        bg_label: label index of background class.
+
+    Returns:
+        list[Instances]: N Instances, each contains only the selected foreground instances.
+        list[Tensor]: N boolean vector, correspond to the selection mask of
+            each Instances object. True for selected instances.
+    """
+    assert isinstance(proposals, (list, tuple))
+    assert isinstance(proposals[0], Instances)
+    assert proposals[0].has("gt_classes")
+    fg_proposals = []
+    fg_selection_masks = []
+    for proposals_per_image in proposals:
+        gt_classes = proposals_per_image.gt_classes
+        fg_selection_mask = (gt_classes != -1) & (gt_classes != bg_label)
+        fg_idxs = fg_selection_mask.nonzero().squeeze(1)
+        fg_proposals.append(proposals_per_image[fg_idxs])
+        fg_selection_masks.append(fg_selection_mask)
+    return fg_proposals, fg_selection_masks
+
+
+def select_proposals_with_visible_keypoints(proposals: List[Instances]) -> List[Instances]:
+    """
+    Args:
+        proposals (list[Instances]): a list of N Instances, where N is the
+            number of images.
+
+    Returns:
+        proposals: only contains proposals with at least one visible keypoint.
+
+    Note that this is still slightly different from Detectron.
+    In Detectron, proposals for training keypoint head are re-sampled from
+    all the proposals with IOU>threshold & >=1 visible keypoint.
+
+    Here, the proposals are first sampled from all proposals with
+    IOU>threshold, then proposals with no visible keypoint are filtered out.
+    This strategy seems to make no difference on Detectron and is easier to implement.
+    """
+    ret = []
+    all_num_fg = []
+    for proposals_per_image in proposals:
+        # If empty/unannotated image (hard negatives), skip filtering for train
+        if len(proposals_per_image) == 0:
+            ret.append(proposals_per_image)
+            continue
+        gt_keypoints = proposals_per_image.gt_keypoints.tensor
+        # #fg x K x 3
+        vis_mask = gt_keypoints[:, :, 2] >= 1
+        xs, ys = gt_keypoints[:, :, 0], gt_keypoints[:, :, 1]
+        proposal_boxes = proposals_per_image.proposal_boxes.tensor.unsqueeze(dim=1)  # #fg x 1 x 4
+        kp_in_box = (
+            (xs >= proposal_boxes[:, :, 0])
+            & (xs <= proposal_boxes[:, :, 2])
+            & (ys >= proposal_boxes[:, :, 1])
+            & (ys <= proposal_boxes[:, :, 3])
+        )
+        selection = (kp_in_box & vis_mask).any(dim=1)
+        selection_idxs = nonzero_tuple(selection)[0]
+        all_num_fg.append(selection_idxs.numel())
+        ret.append(proposals_per_image[selection_idxs])
+
+    storage = get_event_storage()
+    storage.put_scalar("keypoint_head/num_fg_samples", np.mean(all_num_fg))
+    return ret
+
+
+class ROIHeads(torch.nn.Module):
+    """
+    ROIHeads perform all per-region computation in an R-CNN.
+
+    It typically contains logic to
+
+    1. (in training only) match proposals with ground truth and sample them
+    2. crop the regions and extract per-region features using proposals
+    3. make per-region predictions with different heads
+
+    It can have many variants, implemented as subclasses of this class.
+    This base class contains the logic to match/sample proposals.
+    But it is not necessary to inherit this class if the sampling logic is not needed.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        num_classes,
+        batch_size_per_image,
+        positive_fraction,
+        proposal_matcher,
+        proposal_append_gt=True,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            num_classes (int): number of foreground classes (i.e. background is not included)
+            batch_size_per_image (int): number of proposals to sample for training
+            positive_fraction (float): fraction of positive (foreground) proposals
+                to sample for training.
+            proposal_matcher (Matcher): matcher that matches proposals and ground truth
+            proposal_append_gt (bool): whether to include ground truth as proposals as well
+        """
+        super().__init__()
+        self.batch_size_per_image = batch_size_per_image
+        self.positive_fraction = positive_fraction
+        self.num_classes = num_classes
+        self.proposal_matcher = proposal_matcher
+        self.proposal_append_gt = proposal_append_gt
+
+    @classmethod
+    def from_config(cls, cfg):
+        return {
+            "batch_size_per_image": cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE,
+            "positive_fraction": cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION,
+            "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES,
+            "proposal_append_gt": cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT,
+            # Matcher to assign box proposals to gt boxes
+            "proposal_matcher": Matcher(
+                cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS,
+                cfg.MODEL.ROI_HEADS.IOU_LABELS,
+                allow_low_quality_matches=False,
+            ),
+        }
+
+    def _sample_proposals(
+        self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Based on the matching between N proposals and M groundtruth,
+        sample the proposals and set their classification labels.
+
+        Args:
+            matched_idxs (Tensor): a vector of length N, each is the best-matched
+                gt index in [0, M) for each proposal.
+            matched_labels (Tensor): a vector of length N, the matcher's label
+                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
+            gt_classes (Tensor): a vector of length M.
+
+        Returns:
+            Tensor: a vector of indices of sampled proposals. Each is in [0, N).
+            Tensor: a vector of the same length, the classification label for
+                each sampled proposal. Each sample is labeled as either a category in
+                [0, num_classes) or the background (num_classes).
+        """
+        has_gt = gt_classes.numel() > 0
+        # Get the corresponding GT for each proposal
+        if has_gt:
+            gt_classes = gt_classes[matched_idxs]
+            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
+            gt_classes[matched_labels == 0] = self.num_classes
+            # Label ignore proposals (-1 label)
+            gt_classes[matched_labels == -1] = -1
+        else:
+            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
+
+        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
+            gt_classes, self.batch_size_per_image, self.positive_fraction, self.num_classes
+        )
+
+        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
+        return sampled_idxs, gt_classes[sampled_idxs]
+
+    @torch.no_grad()
+    def label_and_sample_proposals(
+        self, proposals: List[Instances], targets: List[Instances]
+    ) -> List[Instances]:
+        """
+        Prepare some proposals to be used to train the ROI heads.
+        It performs box matching between `proposals` and `targets`, and assigns
+        training labels to the proposals.
+        It returns ``self.batch_size_per_image`` random samples from proposals and groundtruth
+        boxes, with a fraction of positives that is no larger than
+        ``self.positive_fraction``.
+
+        Args:
+            See :meth:`ROIHeads.forward`
+
+        Returns:
+            list[Instances]:
+                length `N` list of `Instances`s containing the proposals
+                sampled for training. Each `Instances` has the following fields:
+
+                - proposal_boxes: the proposal boxes
+                - gt_boxes: the ground-truth box that the proposal is assigned to
+                  (this is only meaningful if the proposal has a label > 0; if label = 0
+                  then the ground-truth box is random)
+
+                Other fields such as "gt_classes", "gt_masks", that's included in `targets`.
+        """
+        # Augment proposals with ground-truth boxes.
+        # In the case of learned proposals (e.g., RPN), when training starts
+        # the proposals will be low quality due to random initialization.
+        # It's possible that none of these initial
+        # proposals have high enough overlap with the gt objects to be used
+        # as positive examples for the second stage components (box head,
+        # cls head, mask head). Adding the gt boxes to the set of proposals
+        # ensures that the second stage components will have some positive
+        # examples from the start of training. For RPN, this augmentation improves
+        # convergence and empirically improves box AP on COCO by about 0.5
+        # points (under one tested configuration).
+        if self.proposal_append_gt:
+            proposals = add_ground_truth_to_proposals(targets, proposals)
+
+        proposals_with_gt = []
+
+        num_fg_samples = []
+        num_bg_samples = []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            has_gt = len(targets_per_image) > 0
+            match_quality_matrix = pairwise_iou(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
+            sampled_idxs, gt_classes = self._sample_proposals(
+                matched_idxs, matched_labels, targets_per_image.gt_classes
+            )
+
+            # Set target attributes of the sampled proposals:
+            proposals_per_image = proposals_per_image[sampled_idxs]
+            proposals_per_image.gt_classes = gt_classes
+
+            if has_gt:
+                sampled_targets = matched_idxs[sampled_idxs]
+                # We index all the attributes of targets that start with "gt_"
+                # and have not been added to proposals yet (="gt_classes").
+                # NOTE: here the indexing waste some compute, because heads
+                # like masks, keypoints, etc, will filter the proposals again,
+                # (by foreground/background, or number of keypoints in the image, etc)
+                # so we essentially index the data twice.
+                for (trg_name, trg_value) in targets_per_image.get_fields().items():
+                    if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
+                        proposals_per_image.set(trg_name, trg_value[sampled_targets])
+            # If no GT is given in the image, we don't know what a dummy gt value can be.
+            # Therefore the returned proposals won't have any gt_* fields, except for a
+            # gt_classes full of background label.
+
+            num_bg_samples.append((gt_classes == self.num_classes).sum().item())
+            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
+            proposals_with_gt.append(proposals_per_image)
+
+        # Log the number of fg/bg samples that are selected for training ROI heads
+        storage = get_event_storage()
+        storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
+        storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
+
+        return proposals_with_gt
+
+    def forward(
+        self,
+        images: ImageList,
+        features: Dict[str, torch.Tensor],
+        proposals: List[Instances],
+        targets: Optional[List[Instances]] = None,
+    ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]:
+        """
+        Args:
+            images (ImageList):
+            features (dict[str,Tensor]): input data as a mapping from feature
+                map name to tensor. Axis 0 represents the number of images `N` in
+                the input data; axes 1-3 are channels, height, and width, which may
+                vary between feature maps (e.g., if a feature pyramid is used).
+            proposals (list[Instances]): length `N` list of `Instances`. The i-th
+                `Instances` contains object proposals for the i-th input image,
+                with fields "proposal_boxes" and "objectness_logits".
+            targets (list[Instances], optional): length `N` list of `Instances`. The i-th
+                `Instances` contains the ground-truth per-instance annotations
+                for the i-th input image.  Specify `targets` during training only.
+                It may have the following fields:
+
+                - gt_boxes: the bounding box of each instance.
+                - gt_classes: the label for each instance with a category ranging in [0, #class].
+                - gt_masks: PolygonMasks or BitMasks, the ground-truth masks of each instance.
+                - gt_keypoints: NxKx3, the groud-truth keypoints for each instance.
+
+        Returns:
+            list[Instances]: length `N` list of `Instances` containing the
+            detected instances. Returned during inference only; may be [] during training.
+
+            dict[str->Tensor]:
+            mapping from a named loss to a tensor storing the loss. Used during training only.
+        """
+        raise NotImplementedError()
+
+
+@ROI_HEADS_REGISTRY.register()
+class Res5ROIHeads(ROIHeads):
+    """
+    The ROIHeads in a typical "C4" R-CNN model, where
+    the box and mask head share the cropping and
+    the per-region feature computation by a Res5 block.
+    See :paper:`ResNet` Appendix A.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        in_features: List[str],
+        pooler: ROIPooler,
+        res5: nn.Module,
+        box_predictor: nn.Module,
+        mask_head: Optional[nn.Module] = None,
+        **kwargs,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            in_features (list[str]): list of backbone feature map names to use for
+                feature extraction
+            pooler (ROIPooler): pooler to extra region features from backbone
+            res5 (nn.Sequential): a CNN to compute per-region features, to be used by
+                ``box_predictor`` and ``mask_head``. Typically this is a "res5"
+                block from a ResNet.
+            box_predictor (nn.Module): make box predictions from the feature.
+                Should have the same interface as :class:`FastRCNNOutputLayers`.
+            mask_head (nn.Module): transform features to make mask predictions
+        """
+        super().__init__(**kwargs)
+        self.in_features = in_features
+        self.pooler = pooler
+        if isinstance(res5, (list, tuple)):
+            res5 = nn.Sequential(*res5)
+        self.res5 = res5
+        self.box_predictor = box_predictor
+        self.mask_on = mask_head is not None
+        if self.mask_on:
+            self.mask_head = mask_head
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        # fmt: off
+        ret = super().from_config(cfg)
+        in_features = ret["in_features"] = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        pooler_scales     = (1.0 / input_shape[in_features[0]].stride, )
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        mask_on           = cfg.MODEL.MASK_ON
+        # fmt: on
+        assert not cfg.MODEL.KEYPOINT_ON
+        assert len(in_features) == 1
+
+        ret["pooler"] = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+
+        # Compatbility with old moco code. Might be useful.
+        # See notes in StandardROIHeads.from_config
+        if not inspect.ismethod(cls._build_res5_block):
+            logger.warning(
+                "The behavior of _build_res5_block may change. "
+                "Please do not depend on private methods."
+            )
+            cls._build_res5_block = classmethod(cls._build_res5_block)
+
+        ret["res5"], out_channels = cls._build_res5_block(cfg)
+        ret["box_predictor"] = FastRCNNOutputLayers(
+            cfg, ShapeSpec(channels=out_channels, height=1, width=1)
+        )
+
+        if mask_on:
+            ret["mask_head"] = build_mask_head(
+                cfg,
+                ShapeSpec(channels=out_channels, width=pooler_resolution, height=pooler_resolution),
+            )
+        return ret
+
+    @classmethod
+    def _build_res5_block(cls, cfg):
+        # fmt: off
+        stage_channel_factor = 2 ** 3  # res5 is 8x res2
+        num_groups           = cfg.MODEL.RESNETS.NUM_GROUPS
+        width_per_group      = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
+        bottleneck_channels  = num_groups * width_per_group * stage_channel_factor
+        out_channels         = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * stage_channel_factor
+        stride_in_1x1        = cfg.MODEL.RESNETS.STRIDE_IN_1X1
+        norm                 = cfg.MODEL.RESNETS.NORM
+        assert not cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE[-1], \
+            "Deformable conv is not yet supported in res5 head."
+        # fmt: on
+
+        blocks = ResNet.make_stage(
+            BottleneckBlock,
+            3,
+            stride_per_block=[2, 1, 1],
+            in_channels=out_channels // 2,
+            bottleneck_channels=bottleneck_channels,
+            out_channels=out_channels,
+            num_groups=num_groups,
+            norm=norm,
+            stride_in_1x1=stride_in_1x1,
+        )
+        return nn.Sequential(*blocks), out_channels
+
+    def _shared_roi_transform(self, features: List[torch.Tensor], boxes: List[Boxes]):
+        x = self.pooler(features, boxes)
+        return self.res5(x)
+
+    def forward(
+        self,
+        images: ImageList,
+        features: Dict[str, torch.Tensor],
+        proposals: List[Instances],
+        targets: Optional[List[Instances]] = None,
+    ):
+        """
+        See :meth:`ROIHeads.forward`.
+        """
+        del images
+
+        if self.training:
+            assert targets
+            proposals = self.label_and_sample_proposals(proposals, targets)
+        del targets
+
+        proposal_boxes = [x.proposal_boxes for x in proposals]
+        box_features = self._shared_roi_transform(
+            [features[f] for f in self.in_features], proposal_boxes
+        )
+        predictions = self.box_predictor(box_features.mean(dim=[2, 3]))
+
+        if self.training:
+            del features
+            losses = self.box_predictor.losses(predictions, proposals)
+            if self.mask_on:
+                proposals, fg_selection_masks = select_foreground_proposals(
+                    proposals, self.num_classes
+                )
+                # Since the ROI feature transform is shared between boxes and masks,
+                # we don't need to recompute features. The mask loss is only defined
+                # on foreground proposals, so we need to select out the foreground
+                # features.
+                mask_features = box_features[torch.cat(fg_selection_masks, dim=0)]
+                del box_features
+                losses.update(self.mask_head(mask_features, proposals))
+            return [], losses
+        else:
+            pred_instances, _ = self.box_predictor.inference(predictions, proposals)
+            pred_instances = self.forward_with_given_boxes(features, pred_instances)
+            return pred_instances, {}
+
+    def forward_with_given_boxes(
+        self, features: Dict[str, torch.Tensor], instances: List[Instances]
+    ) -> List[Instances]:
+        """
+        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
+
+        Args:
+            features: same as in `forward()`
+            instances (list[Instances]): instances to predict other outputs. Expect the keys
+                "pred_boxes" and "pred_classes" to exist.
+
+        Returns:
+            instances (Instances):
+                the same `Instances` object, with extra
+                fields such as `pred_masks` or `pred_keypoints`.
+        """
+        assert not self.training
+        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
+
+        if self.mask_on:
+            feature_list = [features[f] for f in self.in_features]
+            x = self._shared_roi_transform(feature_list, [x.pred_boxes for x in instances])
+            return self.mask_head(x, instances)
+        else:
+            return instances
+
+
+@ROI_HEADS_REGISTRY.register()
+class StandardROIHeads(ROIHeads):
+    """
+    It's "standard" in a sense that there is no ROI transform sharing
+    or feature sharing between tasks.
+    Each head independently processes the input features by each head's
+    own pooler and head.
+
+    This class is used by most models, such as FPN and C5.
+    To implement more models, you can subclass it and implement a different
+    :meth:`forward()` or a head.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        box_in_features: List[str],
+        box_pooler: ROIPooler,
+        box_head: nn.Module,
+        box_predictor: nn.Module,
+        mask_in_features: Optional[List[str]] = None,
+        mask_pooler: Optional[ROIPooler] = None,
+        mask_head: Optional[nn.Module] = None,
+        keypoint_in_features: Optional[List[str]] = None,
+        keypoint_pooler: Optional[ROIPooler] = None,
+        keypoint_head: Optional[nn.Module] = None,
+        train_on_pred_boxes: bool = False,
+        **kwargs,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            box_in_features (list[str]): list of feature names to use for the box head.
+            box_pooler (ROIPooler): pooler to extra region features for box head
+            box_head (nn.Module): transform features to make box predictions
+            box_predictor (nn.Module): make box predictions from the feature.
+                Should have the same interface as :class:`FastRCNNOutputLayers`.
+            mask_in_features (list[str]): list of feature names to use for the mask
+                pooler or mask head. None if not using mask head.
+            mask_pooler (ROIPooler): pooler to extract region features from image features.
+                The mask head will then take region features to make predictions.
+                If None, the mask head will directly take the dict of image features
+                defined by `mask_in_features`
+            mask_head (nn.Module): transform features to make mask predictions
+            keypoint_in_features, keypoint_pooler, keypoint_head: similar to ``mask_*``.
+            train_on_pred_boxes (bool): whether to use proposal boxes or
+                predicted boxes from the box head to train other heads.
+        """
+        super().__init__(**kwargs)
+        # keep self.in_features for backward compatibility
+        self.in_features = self.box_in_features = box_in_features
+        self.box_pooler = box_pooler
+        self.box_head = box_head
+        self.box_predictor = box_predictor
+
+        self.mask_on = mask_in_features is not None
+        if self.mask_on:
+            self.mask_in_features = mask_in_features
+            self.mask_pooler = mask_pooler
+            self.mask_head = mask_head
+
+        self.keypoint_on = keypoint_in_features is not None
+        if self.keypoint_on:
+            self.keypoint_in_features = keypoint_in_features
+            self.keypoint_pooler = keypoint_pooler
+            self.keypoint_head = keypoint_head
+
+        self.train_on_pred_boxes = train_on_pred_boxes
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg)
+        ret["train_on_pred_boxes"] = cfg.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES
+        # Subclasses that have not been updated to use from_config style construction
+        # may have overridden _init_*_head methods. In this case, those overridden methods
+        # will not be classmethods and we need to avoid trying to call them here.
+        # We test for this with ismethod which only returns True for bound methods of cls.
+        # Such subclasses will need to handle calling their overridden _init_*_head methods.
+        if inspect.ismethod(cls._init_box_head):
+            ret.update(cls._init_box_head(cfg, input_shape))
+        if inspect.ismethod(cls._init_mask_head):
+            ret.update(cls._init_mask_head(cfg, input_shape))
+        if inspect.ismethod(cls._init_keypoint_head):
+            ret.update(cls._init_keypoint_head(cfg, input_shape))
+        return ret
+
+    @classmethod
+    def _init_box_head(cls, cfg, input_shape):
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        # fmt: on
+
+        # If StandardROIHeads is applied on multiple feature maps (as in FPN),
+        # then we share the same predictors and therefore the channel counts must be the same
+        in_channels = [input_shape[f].channels for f in in_features]
+        # Check all channel counts are equal
+        assert len(set(in_channels)) == 1, in_channels
+        in_channels = in_channels[0]
+
+        box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        # Here we split "box head" and "box predictor", which is mainly due to historical reasons.
+        # They are used together so the "box predictor" layers should be part of the "box head".
+        # New subclasses of ROIHeads do not need "box predictor"s.
+        box_head = build_box_head(
+            cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution)
+        )
+        box_predictor = FastRCNNOutputLayers(cfg, box_head.output_shape)
+        return {
+            "box_in_features": in_features,
+            "box_pooler": box_pooler,
+            "box_head": box_head,
+            "box_predictor": box_predictor,
+        }
+
+    @classmethod
+    def _init_mask_head(cls, cfg, input_shape):
+        if not cfg.MODEL.MASK_ON:
+            return {}
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
+        sampling_ratio    = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE
+        # fmt: on
+
+        in_channels = [input_shape[f].channels for f in in_features][0]
+
+        ret = {"mask_in_features": in_features}
+        ret["mask_pooler"] = (
+            ROIPooler(
+                output_size=pooler_resolution,
+                scales=pooler_scales,
+                sampling_ratio=sampling_ratio,
+                pooler_type=pooler_type,
+            )
+            if pooler_type
+            else None
+        )
+        if pooler_type:
+            shape = ShapeSpec(
+                channels=in_channels, width=pooler_resolution, height=pooler_resolution
+            )
+        else:
+            shape = {f: input_shape[f] for f in in_features}
+        ret["mask_head"] = build_mask_head(cfg, shape)
+        return ret
+
+    @classmethod
+    def _init_keypoint_head(cls, cfg, input_shape):
+        if not cfg.MODEL.KEYPOINT_ON:
+            return {}
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)  # noqa
+        sampling_ratio    = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE
+        # fmt: on
+
+        in_channels = [input_shape[f].channels for f in in_features][0]
+
+        ret = {"keypoint_in_features": in_features}
+        ret["keypoint_pooler"] = (
+            ROIPooler(
+                output_size=pooler_resolution,
+                scales=pooler_scales,
+                sampling_ratio=sampling_ratio,
+                pooler_type=pooler_type,
+            )
+            if pooler_type
+            else None
+        )
+        if pooler_type:
+            shape = ShapeSpec(
+                channels=in_channels, width=pooler_resolution, height=pooler_resolution
+            )
+        else:
+            shape = {f: input_shape[f] for f in in_features}
+        ret["keypoint_head"] = build_keypoint_head(cfg, shape)
+        return ret
+
+    def forward(
+        self,
+        images: ImageList,
+        features: Dict[str, torch.Tensor],
+        proposals: List[Instances],
+        targets: Optional[List[Instances]] = None,
+    ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]:
+        """
+        See :class:`ROIHeads.forward`.
+        """
+        del images
+        if self.training:
+            assert targets, "'targets' argument is required during training"
+            proposals = self.label_and_sample_proposals(proposals, targets)
+        del targets
+
+        if self.training:
+            losses = self._forward_box(features, proposals)
+            # Usually the original proposals used by the box head are used by the mask, keypoint
+            # heads. But when `self.train_on_pred_boxes is True`, proposals will contain boxes
+            # predicted by the box head.
+            losses.update(self._forward_mask(features, proposals))
+            losses.update(self._forward_keypoint(features, proposals))
+            return proposals, losses
+        else:
+            pred_instances = self._forward_box(features, proposals)
+            # During inference cascaded prediction is used: the mask and keypoints heads are only
+            # applied to the top scoring box detections.
+            pred_instances = self.forward_with_given_boxes(features, pred_instances)
+            return pred_instances, {}
+
+    def forward_with_given_boxes(
+        self, features: Dict[str, torch.Tensor], instances: List[Instances]
+    ) -> List[Instances]:
+        """
+        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
+
+        This is useful for downstream tasks where a box is known, but need to obtain
+        other attributes (outputs of other heads).
+        Test-time augmentation also uses this.
+
+        Args:
+            features: same as in `forward()`
+            instances (list[Instances]): instances to predict other outputs. Expect the keys
+                "pred_boxes" and "pred_classes" to exist.
+
+        Returns:
+            list[Instances]:
+                the same `Instances` objects, with extra
+                fields such as `pred_masks` or `pred_keypoints`.
+        """
+        assert not self.training
+        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
+
+        instances = self._forward_mask(features, instances)
+        instances = self._forward_keypoint(features, instances)
+        return instances
+
+    def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances]):
+        """
+        Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`,
+            the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument.
+
+        Args:
+            features (dict[str, Tensor]): mapping from feature map names to tensor.
+                Same as in :meth:`ROIHeads.forward`.
+            proposals (list[Instances]): the per-image object proposals with
+                their matching ground truth.
+                Each has fields "proposal_boxes", and "objectness_logits",
+                "gt_classes", "gt_boxes".
+
+        Returns:
+            In training, a dict of losses.
+            In inference, a list of `Instances`, the predicted instances.
+        """
+        features = [features[f] for f in self.box_in_features]
+        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
+        box_features = self.box_head(box_features)
+        predictions = self.box_predictor(box_features)
+        del box_features
+
+        if self.training:
+            losses = self.box_predictor.losses(predictions, proposals)
+            # proposals is modified in-place below, so losses must be computed first.
+            if self.train_on_pred_boxes:
+                with torch.no_grad():
+                    pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
+                        predictions, proposals
+                    )
+                    for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
+                        proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image)
+            return losses
+        else:
+            pred_instances, _ = self.box_predictor.inference(predictions, proposals)
+            return pred_instances
+
+    def _forward_mask(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
+        """
+        Forward logic of the mask prediction branch.
+
+        Args:
+            features (dict[str, Tensor]): mapping from feature map names to tensor.
+                Same as in :meth:`ROIHeads.forward`.
+            instances (list[Instances]): the per-image instances to train/predict masks.
+                In training, they can be the proposals.
+                In inference, they can be the boxes predicted by R-CNN box head.
+
+        Returns:
+            In training, a dict of losses.
+            In inference, update `instances` with new fields "pred_masks" and return it.
+        """
+        if not self.mask_on:
+            return {} if self.training else instances
+
+        if self.training:
+            # head is only trained on positive proposals.
+            instances, _ = select_foreground_proposals(instances, self.num_classes)
+
+        if self.mask_pooler is not None:
+            features = [features[f] for f in self.mask_in_features]
+            boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in instances]
+            features = self.mask_pooler(features, boxes)
+        else:
+            features = {f: features[f] for f in self.mask_in_features}
+        return self.mask_head(features, instances)
+
+    def _forward_keypoint(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
+        """
+        Forward logic of the keypoint prediction branch.
+
+        Args:
+            features (dict[str, Tensor]): mapping from feature map names to tensor.
+                Same as in :meth:`ROIHeads.forward`.
+            instances (list[Instances]): the per-image instances to train/predict keypoints.
+                In training, they can be the proposals.
+                In inference, they can be the boxes predicted by R-CNN box head.
+
+        Returns:
+            In training, a dict of losses.
+            In inference, update `instances` with new fields "pred_keypoints" and return it.
+        """
+        if not self.keypoint_on:
+            return {} if self.training else instances
+
+        if self.training:
+            # head is only trained on positive proposals with >=1 visible keypoints.
+            instances, _ = select_foreground_proposals(instances, self.num_classes)
+            instances = select_proposals_with_visible_keypoints(instances)
+
+        if self.keypoint_pooler is not None:
+            features = [features[f] for f in self.keypoint_in_features]
+            boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in instances]
+            features = self.keypoint_pooler(features, boxes)
+        else:
+            features = {f: features[f] for f in self.keypoint_in_features}
+        return self.keypoint_head(features, instances)
diff --git a/detectron2/modeling/roi_heads/rotated_fast_rcnn.py b/detectron2/modeling/roi_heads/rotated_fast_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e7bfabdedff5c5a826d8d4f551ea96b541f2cb6
--- /dev/null
+++ b/detectron2/modeling/roi_heads/rotated_fast_rcnn.py
@@ -0,0 +1,271 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import numpy as np
+import torch
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec, batched_nms_rotated
+from detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated
+from detectron2.utils.events import get_event_storage
+
+from ..box_regression import Box2BoxTransformRotated
+from ..poolers import ROIPooler
+from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals
+from .box_head import build_box_head
+from .fast_rcnn import FastRCNNOutputLayers
+from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
+
+logger = logging.getLogger(__name__)
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    R: number of ROIs, combined over all images, in the minibatch
+    Ri: number of ROIs in image i
+    K: number of foreground classes. E.g.,there are 80 foreground classes in COCO.
+
+Naming convention:
+
+    deltas: refers to the 5-d (dx, dy, dw, dh, da) deltas that parameterize the box2box
+    transform (see :class:`box_regression.Box2BoxTransformRotated`).
+
+    pred_class_logits: predicted class scores in [-inf, +inf]; use
+        softmax(pred_class_logits) to estimate P(class).
+
+    gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
+        foreground object classes and K represents the background class.
+
+    pred_proposal_deltas: predicted rotated box2box transform deltas for transforming proposals
+        to detection box predictions.
+
+    gt_proposal_deltas: ground-truth rotated box2box transform deltas
+"""
+
+
+def fast_rcnn_inference_rotated(
+    boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image
+):
+    """
+    Call `fast_rcnn_inference_single_image_rotated` for all images.
+
+    Args:
+        boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
+            boxes for each image. Element i has shape (Ri, K * 5) if doing
+            class-specific regression, or (Ri, 5) if doing class-agnostic
+            regression, where Ri is the number of predicted objects for image i.
+            This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`.
+        scores (list[Tensor]): A list of Tensors of predicted class scores for each image.
+            Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
+            for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`.
+        image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch.
+        score_thresh (float): Only return detections with a confidence score exceeding this
+            threshold.
+        nms_thresh (float):  The threshold to use for box non-maximum suppression. Value in [0, 1].
+        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
+            all detections.
+
+    Returns:
+        instances: (list[Instances]): A list of N instances, one for each image in the batch,
+            that stores the topk most confidence detections.
+        kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates
+            the corresponding boxes/scores index in [0, Ri) from the input, for image i.
+    """
+    result_per_image = [
+        fast_rcnn_inference_single_image_rotated(
+            boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image
+        )
+        for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
+    ]
+    return [x[0] for x in result_per_image], [x[1] for x in result_per_image]
+
+
+@torch.no_grad()
+def fast_rcnn_inference_single_image_rotated(
+    boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image
+):
+    """
+    Single-image inference. Return rotated bounding-box detection results by thresholding
+    on scores and applying rotated non-maximum suppression (Rotated NMS).
+
+    Args:
+        Same as `fast_rcnn_inference_rotated`, but with rotated boxes, scores, and image shapes
+        per image.
+
+    Returns:
+        Same as `fast_rcnn_inference_rotated`, but for only one image.
+    """
+    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
+    if not valid_mask.all():
+        boxes = boxes[valid_mask]
+        scores = scores[valid_mask]
+
+    B = 5  # box dimension
+    scores = scores[:, :-1]
+    num_bbox_reg_classes = boxes.shape[1] // B
+    # Convert to Boxes to use the `clip` function ...
+    boxes = RotatedBoxes(boxes.reshape(-1, B))
+    boxes.clip(image_shape)
+    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, B)  # R x C x B
+    # Filter results based on detection scores
+    filter_mask = scores > score_thresh  # R x K
+    # R' x 2. First column contains indices of the R predictions;
+    # Second column contains indices of classes.
+    filter_inds = filter_mask.nonzero()
+    if num_bbox_reg_classes == 1:
+        boxes = boxes[filter_inds[:, 0], 0]
+    else:
+        boxes = boxes[filter_mask]
+    scores = scores[filter_mask]
+
+    # Apply per-class Rotated NMS
+    keep = batched_nms_rotated(boxes, scores, filter_inds[:, 1], nms_thresh)
+    if topk_per_image >= 0:
+        keep = keep[:topk_per_image]
+    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]
+
+    result = Instances(image_shape)
+    result.pred_boxes = RotatedBoxes(boxes)
+    result.scores = scores
+    result.pred_classes = filter_inds[:, 1]
+
+    return result, filter_inds[:, 0]
+
+
+class RotatedFastRCNNOutputLayers(FastRCNNOutputLayers):
+    """
+    Two linear layers for predicting Rotated Fast R-CNN outputs.
+    """
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        args = super().from_config(cfg, input_shape)
+        args["box2box_transform"] = Box2BoxTransformRotated(
+            weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS
+        )
+        return args
+
+    def inference(self, predictions, proposals):
+        """
+        Returns:
+            list[Instances]: same as `fast_rcnn_inference_rotated`.
+            list[Tensor]: same as `fast_rcnn_inference_rotated`.
+        """
+        boxes = self.predict_boxes(predictions, proposals)
+        scores = self.predict_probs(predictions, proposals)
+        image_shapes = [x.image_size for x in proposals]
+
+        return fast_rcnn_inference_rotated(
+            boxes,
+            scores,
+            image_shapes,
+            self.test_score_thresh,
+            self.test_nms_thresh,
+            self.test_topk_per_image,
+        )
+
+
+@ROI_HEADS_REGISTRY.register()
+class RROIHeads(StandardROIHeads):
+    """
+    This class is used by Rotated Fast R-CNN to detect rotated boxes.
+    For now, it only supports box predictions but not mask or keypoints.
+    """
+
+    @configurable
+    def __init__(self, **kwargs):
+        """
+        NOTE: this interface is experimental.
+        """
+        super().__init__(**kwargs)
+        assert (
+            not self.mask_on and not self.keypoint_on
+        ), "Mask/Keypoints not supported in Rotated ROIHeads."
+        assert not self.train_on_pred_boxes, "train_on_pred_boxes not implemented for RROIHeads!"
+
+    @classmethod
+    def _init_box_head(cls, cfg, input_shape):
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        # fmt: on
+        assert pooler_type in ["ROIAlignRotated"], pooler_type
+        # assume all channel counts are equal
+        in_channels = [input_shape[f].channels for f in in_features][0]
+
+        box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        box_head = build_box_head(
+            cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution)
+        )
+        # This line is the only difference v.s. StandardROIHeads
+        box_predictor = RotatedFastRCNNOutputLayers(cfg, box_head.output_shape)
+        return {
+            "box_in_features": in_features,
+            "box_pooler": box_pooler,
+            "box_head": box_head,
+            "box_predictor": box_predictor,
+        }
+
+    @torch.no_grad()
+    def label_and_sample_proposals(self, proposals, targets):
+        """
+        Prepare some proposals to be used to train the RROI heads.
+        It performs box matching between `proposals` and `targets`, and assigns
+        training labels to the proposals.
+        It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes,
+        with a fraction of positives that is no larger than `self.positive_sample_fraction.
+
+        Args:
+            See :meth:`StandardROIHeads.forward`
+
+        Returns:
+            list[Instances]: length `N` list of `Instances`s containing the proposals
+                sampled for training. Each `Instances` has the following fields:
+                - proposal_boxes: the rotated proposal boxes
+                - gt_boxes: the ground-truth rotated boxes that the proposal is assigned to
+                  (this is only meaningful if the proposal has a label > 0; if label = 0
+                   then the ground-truth box is random)
+                - gt_classes: the ground-truth classification lable for each proposal
+        """
+        if self.proposal_append_gt:
+            proposals = add_ground_truth_to_proposals(targets, proposals)
+
+        proposals_with_gt = []
+
+        num_fg_samples = []
+        num_bg_samples = []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            has_gt = len(targets_per_image) > 0
+            match_quality_matrix = pairwise_iou_rotated(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
+            sampled_idxs, gt_classes = self._sample_proposals(
+                matched_idxs, matched_labels, targets_per_image.gt_classes
+            )
+
+            proposals_per_image = proposals_per_image[sampled_idxs]
+            proposals_per_image.gt_classes = gt_classes
+
+            if has_gt:
+                sampled_targets = matched_idxs[sampled_idxs]
+                proposals_per_image.gt_boxes = targets_per_image.gt_boxes[sampled_targets]
+
+            num_bg_samples.append((gt_classes == self.num_classes).sum().item())
+            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
+            proposals_with_gt.append(proposals_per_image)
+
+        # Log the number of fg/bg samples that are selected for training ROI heads
+        storage = get_event_storage()
+        storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
+        storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
+
+        return proposals_with_gt
diff --git a/detectron2/modeling/sampling.py b/detectron2/modeling/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2d0f6648b349c5ea39fd29785b77c961a58fa22
--- /dev/null
+++ b/detectron2/modeling/sampling.py
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import torch
+
+from detectron2.layers import nonzero_tuple
+
+__all__ = ["subsample_labels"]
+
+
+def subsample_labels(
+    labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int
+):
+    """
+    Return `num_samples` (or fewer, if not enough found)
+    random samples from `labels` which is a mixture of positives & negatives.
+    It will try to return as many positives as possible without
+    exceeding `positive_fraction * num_samples`, and then try to
+    fill the remaining slots with negatives.
+
+    Args:
+        labels (Tensor): (N, ) label vector with values:
+            * -1: ignore
+            * bg_label: background ("negative") class
+            * otherwise: one or more foreground ("positive") classes
+        num_samples (int): The total number of labels with value >= 0 to return.
+            Values that are not sampled will be filled with -1 (ignore).
+        positive_fraction (float): The number of subsampled labels with values > 0
+            is `min(num_positives, int(positive_fraction * num_samples))`. The number
+            of negatives sampled is `min(num_negatives, num_samples - num_positives_sampled)`.
+            In order words, if there are not enough positives, the sample is filled with
+            negatives. If there are also not enough negatives, then as many elements are
+            sampled as is possible.
+        bg_label (int): label index of background ("negative") class.
+
+    Returns:
+        pos_idx, neg_idx (Tensor):
+            1D vector of indices. The total length of both is `num_samples` or fewer.
+    """
+    positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0]
+    negative = nonzero_tuple(labels == bg_label)[0]
+
+    num_pos = int(num_samples * positive_fraction)
+    # protect against not enough positive examples
+    num_pos = min(positive.numel(), num_pos)
+    num_neg = num_samples - num_pos
+    # protect against not enough negative examples
+    num_neg = min(negative.numel(), num_neg)
+
+    # randomly select positive and negative examples
+    perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
+    perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
+
+    pos_idx = positive[perm1]
+    neg_idx = negative[perm2]
+    return pos_idx, neg_idx
diff --git a/detectron2/modeling/test_time_augmentation.py b/detectron2/modeling/test_time_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..373e6bf00a39c040ff1da49d6dcd39a54a0b69a7
--- /dev/null
+++ b/detectron2/modeling/test_time_augmentation.py
@@ -0,0 +1,307 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import numpy as np
+from contextlib import contextmanager
+from itertools import count
+from typing import List
+import torch
+from fvcore.transforms import HFlipTransform, NoOpTransform
+from torch import nn
+from torch.nn.parallel import DistributedDataParallel
+
+from detectron2.config import configurable
+from detectron2.data.detection_utils import read_image
+from detectron2.data.transforms import (
+    RandomFlip,
+    ResizeShortestEdge,
+    ResizeTransform,
+    apply_augmentations,
+)
+from detectron2.structures import Boxes, Instances
+
+from .meta_arch import GeneralizedRCNN
+from .postprocessing import detector_postprocess
+from .roi_heads.fast_rcnn import fast_rcnn_inference_single_image
+
+__all__ = ["DatasetMapperTTA", "GeneralizedRCNNWithTTA"]
+
+
+class DatasetMapperTTA:
+    """
+    Implement test-time augmentation for detection data.
+    It is a callable which takes a dataset dict from a detection dataset,
+    and returns a list of dataset dicts where the images
+    are augmented from the input image by the transformations defined in the config.
+    This is used for test-time augmentation.
+    """
+
+    @configurable
+    def __init__(self, min_sizes: List[int], max_size: int, flip: bool):
+        """
+        Args:
+            min_sizes: list of short-edge size to resize the image to
+            max_size: maximum height or width of resized images
+            flip: whether to apply flipping augmentation
+        """
+        self.min_sizes = min_sizes
+        self.max_size = max_size
+        self.flip = flip
+
+    @classmethod
+    def from_config(cls, cfg):
+        return {
+            "min_sizes": cfg.TEST.AUG.MIN_SIZES,
+            "max_size": cfg.TEST.AUG.MAX_SIZE,
+            "flip": cfg.TEST.AUG.FLIP,
+        }
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dict: a dict in standard model input format. See tutorials for details.
+
+        Returns:
+            list[dict]:
+                a list of dicts, which contain augmented version of the input image.
+                The total number of dicts is ``len(min_sizes) * (2 if flip else 1)``.
+                Each dict has field "transforms" which is a TransformList,
+                containing the transforms that are used to generate this image.
+        """
+        numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy()
+        shape = numpy_image.shape
+        orig_shape = (dataset_dict["height"], dataset_dict["width"])
+        if shape[:2] != orig_shape:
+            # It transforms the "original" image in the dataset to the input image
+            pre_tfm = ResizeTransform(orig_shape[0], orig_shape[1], shape[0], shape[1])
+        else:
+            pre_tfm = NoOpTransform()
+
+        # Create all combinations of augmentations to use
+        aug_candidates = []  # each element is a list[Augmentation]
+        for min_size in self.min_sizes:
+            resize = ResizeShortestEdge(min_size, self.max_size)
+            aug_candidates.append([resize])  # resize only
+            if self.flip:
+                flip = RandomFlip(prob=1.0)
+                aug_candidates.append([resize, flip])  # resize + flip
+
+        # Apply all the augmentations
+        ret = []
+        for aug in aug_candidates:
+            new_image, tfms = apply_augmentations(aug, np.copy(numpy_image))
+            torch_image = torch.from_numpy(np.ascontiguousarray(new_image.transpose(2, 0, 1)))
+
+            dic = copy.deepcopy(dataset_dict)
+            dic["transforms"] = pre_tfm + tfms
+            dic["image"] = torch_image
+            ret.append(dic)
+        return ret
+
+
+class GeneralizedRCNNWithTTA(nn.Module):
+    """
+    A GeneralizedRCNN with test-time augmentation enabled.
+    Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`.
+    """
+
+    def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
+        """
+        Args:
+            cfg (CfgNode):
+            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
+            tta_mapper (callable): takes a dataset dict and returns a list of
+                augmented versions of the dataset dict. Defaults to
+                `DatasetMapperTTA(cfg)`.
+            batch_size (int): batch the augmented images into this batch size for inference.
+        """
+        super().__init__()
+        if isinstance(model, DistributedDataParallel):
+            model = model.module
+        assert isinstance(
+            model, GeneralizedRCNN
+        ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
+        self.cfg = cfg.clone()
+        assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
+        assert (
+            not self.cfg.MODEL.LOAD_PROPOSALS
+        ), "TTA for pre-computed proposals is not supported yet"
+
+        self.model = model
+
+        if tta_mapper is None:
+            tta_mapper = DatasetMapperTTA(cfg)
+        self.tta_mapper = tta_mapper
+        self.batch_size = batch_size
+
+    @contextmanager
+    def _turn_off_roi_heads(self, attrs):
+        """
+        Open a context where some heads in `model.roi_heads` are temporarily turned off.
+        Args:
+            attr (list[str]): the attribute in `model.roi_heads` which can be used
+                to turn off a specific head, e.g., "mask_on", "keypoint_on".
+        """
+        roi_heads = self.model.roi_heads
+        old = {}
+        for attr in attrs:
+            try:
+                old[attr] = getattr(roi_heads, attr)
+            except AttributeError:
+                # The head may not be implemented in certain ROIHeads
+                pass
+
+        if len(old.keys()) == 0:
+            yield
+        else:
+            for attr in old.keys():
+                setattr(roi_heads, attr, False)
+            yield
+            for attr in old.keys():
+                setattr(roi_heads, attr, old[attr])
+
+    def _batch_inference(self, batched_inputs, detected_instances=None):
+        """
+        Execute inference on a list of inputs,
+        using batch size = self.batch_size, instead of the length of the list.
+
+        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
+        """
+        if detected_instances is None:
+            detected_instances = [None] * len(batched_inputs)
+
+        outputs = []
+        inputs, instances = [], []
+        for idx, input, instance in zip(count(), batched_inputs, detected_instances):
+            inputs.append(input)
+            instances.append(instance)
+            if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
+                outputs.extend(
+                    self.model.inference(
+                        inputs,
+                        instances if instances[0] is not None else None,
+                        do_postprocess=False,
+                    )
+                )
+                inputs, instances = [], []
+        return outputs
+
+    def __call__(self, batched_inputs):
+        """
+        Same input/output format as :meth:`GeneralizedRCNN.forward`
+        """
+
+        def _maybe_read_image(dataset_dict):
+            ret = copy.copy(dataset_dict)
+            if "image" not in ret:
+                image = read_image(ret.pop("file_name"), self.model.input_format)
+                image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))  # CHW
+                ret["image"] = image
+            if "height" not in ret and "width" not in ret:
+                ret["height"] = image.shape[1]
+                ret["width"] = image.shape[2]
+            return ret
+
+        return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs]
+
+    def _inference_one_image(self, input):
+        """
+        Args:
+            input (dict): one dataset dict with "image" field being a CHW tensor
+
+        Returns:
+            dict: one output dict
+        """
+        orig_shape = (input["height"], input["width"])
+        augmented_inputs, tfms = self._get_augmented_inputs(input)
+        # Detect boxes from all augmented versions
+        with self._turn_off_roi_heads(["mask_on", "keypoint_on"]):
+            # temporarily disable roi heads
+            all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
+        # merge all detected boxes to obtain final predictions for boxes
+        merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)
+
+        if self.cfg.MODEL.MASK_ON:
+            # Use the detected boxes to obtain masks
+            augmented_instances = self._rescale_detected_boxes(
+                augmented_inputs, merged_instances, tfms
+            )
+            # run forward on the detected boxes
+            outputs = self._batch_inference(augmented_inputs, augmented_instances)
+            # Delete now useless variables to avoid being out of memory
+            del augmented_inputs, augmented_instances
+            # average the predictions
+            merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
+            merged_instances = detector_postprocess(merged_instances, *orig_shape)
+            return {"instances": merged_instances}
+        else:
+            return {"instances": merged_instances}
+
+    def _get_augmented_inputs(self, input):
+        augmented_inputs = self.tta_mapper(input)
+        tfms = [x.pop("transforms") for x in augmented_inputs]
+        return augmented_inputs, tfms
+
+    def _get_augmented_boxes(self, augmented_inputs, tfms):
+        # 1: forward with all augmented images
+        outputs = self._batch_inference(augmented_inputs)
+        # 2: union the results
+        all_boxes = []
+        all_scores = []
+        all_classes = []
+        for output, tfm in zip(outputs, tfms):
+            # Need to inverse the transforms on boxes, to obtain results on original image
+            pred_boxes = output.pred_boxes.tensor
+            original_pred_boxes = tfm.inverse().apply_box(pred_boxes.cpu().numpy())
+            all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device))
+
+            all_scores.extend(output.scores)
+            all_classes.extend(output.pred_classes)
+        all_boxes = torch.cat(all_boxes, dim=0)
+        return all_boxes, all_scores, all_classes
+
+    def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
+        # select from the union of all results
+        num_boxes = len(all_boxes)
+        num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
+        # +1 because fast_rcnn_inference expects background scores as well
+        all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
+        for idx, cls, score in zip(count(), all_classes, all_scores):
+            all_scores_2d[idx, cls] = score
+
+        merged_instances, _ = fast_rcnn_inference_single_image(
+            all_boxes,
+            all_scores_2d,
+            shape_hw,
+            1e-8,
+            self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
+            self.cfg.TEST.DETECTIONS_PER_IMAGE,
+        )
+
+        return merged_instances
+
+    def _rescale_detected_boxes(self, augmented_inputs, merged_instances, tfms):
+        augmented_instances = []
+        for input, tfm in zip(augmented_inputs, tfms):
+            # Transform the target box to the augmented image's coordinate space
+            pred_boxes = merged_instances.pred_boxes.tensor.cpu().numpy()
+            pred_boxes = torch.from_numpy(tfm.apply_box(pred_boxes))
+
+            aug_instances = Instances(
+                image_size=input["image"].shape[1:3],
+                pred_boxes=Boxes(pred_boxes),
+                pred_classes=merged_instances.pred_classes,
+                scores=merged_instances.scores,
+            )
+            augmented_instances.append(aug_instances)
+        return augmented_instances
+
+    def _reduce_pred_masks(self, outputs, tfms):
+        # Should apply inverse transforms on masks.
+        # We assume only resize & flip are used. pred_masks is a scale-invariant
+        # representation, so we handle flip specially
+        for output, tfm in zip(outputs, tfms):
+            if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
+                output.pred_masks = output.pred_masks.flip(dims=[3])
+        all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
+        avg_pred_masks = torch.mean(all_pred_masks, dim=0)
+        return avg_pred_masks
diff --git a/detectron2/projects/README.md b/detectron2/projects/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..95afe7ff8c8a9bd2f56621fcc3c1bdac11c256a9
--- /dev/null
+++ b/detectron2/projects/README.md
@@ -0,0 +1,2 @@
+
+Projects live in the [`projects` directory](../../projects) under the root of this repository, but not here.
diff --git a/detectron2/projects/__init__.py b/detectron2/projects/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2d0540b93ebbad78d6ff2cc0adc0fe8375816c2
--- /dev/null
+++ b/detectron2/projects/__init__.py
@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import importlib.abc
+import importlib.util
+from pathlib import Path
+
+__all__ = []
+
+_PROJECTS = {
+    "point_rend": "PointRend",
+    "deeplab": "DeepLab",
+    "panoptic_deeplab": "Panoptic-DeepLab",
+}
+_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent / "projects"
+
+if _PROJECT_ROOT.is_dir():
+    # This is true only for in-place installation (pip install -e, setup.py develop),
+    # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230
+
+    class _D2ProjectsFinder(importlib.abc.MetaPathFinder):
+        def find_spec(self, name, path, target=None):
+            if not name.startswith("detectron2.projects."):
+                return
+            project_name = name.split(".")[-1]
+            project_dir = _PROJECTS.get(project_name)
+            if not project_dir:
+                return
+            target_file = _PROJECT_ROOT / f"{project_dir}/{project_name}/__init__.py"
+            if not target_file.is_file():
+                return
+            return importlib.util.spec_from_file_location(name, target_file)
+
+    import sys
+
+    sys.meta_path.append(_D2ProjectsFinder())
diff --git a/detectron2/solver/__init__.py b/detectron2/solver/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e36c64f60f38f41d01dd2c9fb30364489a03841
--- /dev/null
+++ b/detectron2/solver/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .build import build_lr_scheduler, build_optimizer, get_default_optimizer_params
+from .lr_scheduler import (
+    LRMultiplier,
+    LRScheduler,
+    WarmupCosineLR,
+    WarmupMultiStepLR,
+    WarmupParamScheduler,
+)
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/solver/__pycache__/__init__.cpython-310.pyc b/detectron2/solver/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f51fb2a8a54062b0c5f67080add6c1289d503980
Binary files /dev/null and b/detectron2/solver/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/solver/__pycache__/__init__.cpython-39.pyc b/detectron2/solver/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c37c2ae98b1a73262671b98d055b607eb5294674
Binary files /dev/null and b/detectron2/solver/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/solver/__pycache__/build.cpython-310.pyc b/detectron2/solver/__pycache__/build.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7dbb03222fc13775c5bbb392ba802f8abf65a99c
Binary files /dev/null and b/detectron2/solver/__pycache__/build.cpython-310.pyc differ
diff --git a/detectron2/solver/__pycache__/build.cpython-39.pyc b/detectron2/solver/__pycache__/build.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..599b492ffa8300906d9865f22268c68b28fe4542
Binary files /dev/null and b/detectron2/solver/__pycache__/build.cpython-39.pyc differ
diff --git a/detectron2/solver/__pycache__/lr_scheduler.cpython-310.pyc b/detectron2/solver/__pycache__/lr_scheduler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad1a013f1d9a9c7c6e5ad78cd4f159a594f3ae6f
Binary files /dev/null and b/detectron2/solver/__pycache__/lr_scheduler.cpython-310.pyc differ
diff --git a/detectron2/solver/__pycache__/lr_scheduler.cpython-39.pyc b/detectron2/solver/__pycache__/lr_scheduler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed1db9b0ea599c97b953df268b7fa6ed4d84207b
Binary files /dev/null and b/detectron2/solver/__pycache__/lr_scheduler.cpython-39.pyc differ
diff --git a/detectron2/solver/build.py b/detectron2/solver/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0984d39f7227e94d2577435e32cd56e82c545fa
--- /dev/null
+++ b/detectron2/solver/build.py
@@ -0,0 +1,323 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import itertools
+import logging
+from collections import defaultdict
+from enum import Enum
+from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Type, Union
+import torch
+from fvcore.common.param_scheduler import (
+    CosineParamScheduler,
+    MultiStepParamScheduler,
+    StepWithFixedGammaParamScheduler,
+)
+
+from detectron2.config import CfgNode
+from detectron2.utils.env import TORCH_VERSION
+
+from .lr_scheduler import LRMultiplier, LRScheduler, WarmupParamScheduler
+
+_GradientClipperInput = Union[torch.Tensor, Iterable[torch.Tensor]]
+_GradientClipper = Callable[[_GradientClipperInput], None]
+
+
+class GradientClipType(Enum):
+    VALUE = "value"
+    NORM = "norm"
+
+
+def _create_gradient_clipper(cfg: CfgNode) -> _GradientClipper:
+    """
+    Creates gradient clipping closure to clip by value or by norm,
+    according to the provided config.
+    """
+    cfg = copy.deepcopy(cfg)
+
+    def clip_grad_norm(p: _GradientClipperInput):
+        torch.nn.utils.clip_grad_norm_(p, cfg.CLIP_VALUE, cfg.NORM_TYPE)
+
+    def clip_grad_value(p: _GradientClipperInput):
+        torch.nn.utils.clip_grad_value_(p, cfg.CLIP_VALUE)
+
+    _GRADIENT_CLIP_TYPE_TO_CLIPPER = {
+        GradientClipType.VALUE: clip_grad_value,
+        GradientClipType.NORM: clip_grad_norm,
+    }
+    return _GRADIENT_CLIP_TYPE_TO_CLIPPER[GradientClipType(cfg.CLIP_TYPE)]
+
+
+def _generate_optimizer_class_with_gradient_clipping(
+    optimizer: Type[torch.optim.Optimizer],
+    *,
+    per_param_clipper: Optional[_GradientClipper] = None,
+    global_clipper: Optional[_GradientClipper] = None,
+) -> Type[torch.optim.Optimizer]:
+    """
+    Dynamically creates a new type that inherits the type of a given instance
+    and overrides the `step` method to add gradient clipping
+    """
+    assert (
+        per_param_clipper is None or global_clipper is None
+    ), "Not allowed to use both per-parameter clipping and global clipping"
+
+    def optimizer_wgc_step(self, closure=None):
+        if per_param_clipper is not None:
+            for group in self.param_groups:
+                for p in group["params"]:
+                    per_param_clipper(p)
+        else:
+            # global clipper for future use with detr
+            # (https://github.com/facebookresearch/detr/pull/287)
+            all_params = itertools.chain(*[g["params"] for g in self.param_groups])
+            global_clipper(all_params)
+        super(type(self), self).step(closure)
+
+    OptimizerWithGradientClip = type(
+        optimizer.__name__ + "WithGradientClip",
+        (optimizer,),
+        {"step": optimizer_wgc_step},
+    )
+    return OptimizerWithGradientClip
+
+
+def maybe_add_gradient_clipping(
+    cfg: CfgNode, optimizer: Type[torch.optim.Optimizer]
+) -> Type[torch.optim.Optimizer]:
+    """
+    If gradient clipping is enabled through config options, wraps the existing
+    optimizer type to become a new dynamically created class OptimizerWithGradientClip
+    that inherits the given optimizer and overrides the `step` method to
+    include gradient clipping.
+
+    Args:
+        cfg: CfgNode, configuration options
+        optimizer: type. A subclass of torch.optim.Optimizer
+
+    Return:
+        type: either the input `optimizer` (if gradient clipping is disabled), or
+            a subclass of it with gradient clipping included in the `step` method.
+    """
+    if not cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
+        return optimizer
+    if isinstance(optimizer, torch.optim.Optimizer):
+        optimizer_type = type(optimizer)
+    else:
+        assert issubclass(optimizer, torch.optim.Optimizer), optimizer
+        optimizer_type = optimizer
+
+    grad_clipper = _create_gradient_clipper(cfg.SOLVER.CLIP_GRADIENTS)
+    OptimizerWithGradientClip = _generate_optimizer_class_with_gradient_clipping(
+        optimizer_type, per_param_clipper=grad_clipper
+    )
+    if isinstance(optimizer, torch.optim.Optimizer):
+        optimizer.__class__ = OptimizerWithGradientClip  # a bit hacky, not recommended
+        return optimizer
+    else:
+        return OptimizerWithGradientClip
+
+
+def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
+    """
+    Build an optimizer from config.
+    """
+    params = get_default_optimizer_params(
+        model,
+        base_lr=cfg.SOLVER.BASE_LR,
+        weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
+        bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
+        weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
+    )
+    sgd_args = {
+        "params": params,
+        "lr": cfg.SOLVER.BASE_LR,
+        "momentum": cfg.SOLVER.MOMENTUM,
+        "nesterov": cfg.SOLVER.NESTEROV,
+        "weight_decay": cfg.SOLVER.WEIGHT_DECAY,
+    }
+    if TORCH_VERSION >= (1, 12):
+        sgd_args["foreach"] = True
+    return maybe_add_gradient_clipping(cfg, torch.optim.SGD(**sgd_args))
+
+
+def get_default_optimizer_params(
+    model: torch.nn.Module,
+    base_lr: Optional[float] = None,
+    weight_decay: Optional[float] = None,
+    weight_decay_norm: Optional[float] = None,
+    bias_lr_factor: Optional[float] = 1.0,
+    weight_decay_bias: Optional[float] = None,
+    lr_factor_func: Optional[Callable] = None,
+    overrides: Optional[Dict[str, Dict[str, float]]] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Get default param list for optimizer, with support for a few types of
+    overrides. If no overrides needed, this is equivalent to `model.parameters()`.
+
+    Args:
+        base_lr: lr for every group by default. Can be omitted to use the one in optimizer.
+        weight_decay: weight decay for every group by default. Can be omitted to use the one
+            in optimizer.
+        weight_decay_norm: override weight decay for params in normalization layers
+        bias_lr_factor: multiplier of lr for bias parameters.
+        weight_decay_bias: override weight decay for bias parameters.
+        lr_factor_func: function to calculate lr decay rate by mapping the parameter names to
+            corresponding lr decay rate. Note that setting this option requires
+            also setting ``base_lr``.
+        overrides: if not `None`, provides values for optimizer hyperparameters
+            (LR, weight decay) for module parameters with a given name; e.g.
+            ``{"embedding": {"lr": 0.01, "weight_decay": 0.1}}`` will set the LR and
+            weight decay values for all module parameters named `embedding`.
+
+    For common detection models, ``weight_decay_norm`` is the only option
+    needed to be set. ``bias_lr_factor,weight_decay_bias`` are legacy settings
+    from Detectron1 that are not found useful.
+
+    Example:
+    ::
+        torch.optim.SGD(get_default_optimizer_params(model, weight_decay_norm=0),
+                       lr=0.01, weight_decay=1e-4, momentum=0.9)
+    """
+    if overrides is None:
+        overrides = {}
+    defaults = {}
+    if base_lr is not None:
+        defaults["lr"] = base_lr
+    if weight_decay is not None:
+        defaults["weight_decay"] = weight_decay
+    bias_overrides = {}
+    if bias_lr_factor is not None and bias_lr_factor != 1.0:
+        # NOTE: unlike Detectron v1, we now by default make bias hyperparameters
+        # exactly the same as regular weights.
+        if base_lr is None:
+            raise ValueError("bias_lr_factor requires base_lr")
+        bias_overrides["lr"] = base_lr * bias_lr_factor
+    if weight_decay_bias is not None:
+        bias_overrides["weight_decay"] = weight_decay_bias
+    if len(bias_overrides):
+        if "bias" in overrides:
+            raise ValueError("Conflicting overrides for 'bias'")
+        overrides["bias"] = bias_overrides
+    if lr_factor_func is not None:
+        if base_lr is None:
+            raise ValueError("lr_factor_func requires base_lr")
+    norm_module_types = (
+        torch.nn.BatchNorm1d,
+        torch.nn.BatchNorm2d,
+        torch.nn.BatchNorm3d,
+        torch.nn.SyncBatchNorm,
+        # NaiveSyncBatchNorm inherits from BatchNorm2d
+        torch.nn.GroupNorm,
+        torch.nn.InstanceNorm1d,
+        torch.nn.InstanceNorm2d,
+        torch.nn.InstanceNorm3d,
+        torch.nn.LayerNorm,
+        torch.nn.LocalResponseNorm,
+    )
+    params: List[Dict[str, Any]] = []
+    memo: Set[torch.nn.parameter.Parameter] = set()
+    for module_name, module in model.named_modules():
+        for module_param_name, value in module.named_parameters(recurse=False):
+            if not value.requires_grad:
+                continue
+            # Avoid duplicating parameters
+            if value in memo:
+                continue
+            memo.add(value)
+
+            hyperparams = copy.copy(defaults)
+            if isinstance(module, norm_module_types) and weight_decay_norm is not None:
+                hyperparams["weight_decay"] = weight_decay_norm
+            if lr_factor_func is not None:
+                hyperparams["lr"] *= lr_factor_func(f"{module_name}.{module_param_name}")
+
+            hyperparams.update(overrides.get(module_param_name, {}))
+            params.append({"params": [value], **hyperparams})
+    return reduce_param_groups(params)
+
+
+def _expand_param_groups(params: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    # Transform parameter groups into per-parameter structure.
+    # Later items in `params` can overwrite parameters set in previous items.
+    ret = defaultdict(dict)
+    for item in params:
+        assert "params" in item
+        cur_params = {x: y for x, y in item.items() if x != "params" and x != "param_names"}
+        if "param_names" in item:
+            for param_name, param in zip(item["param_names"], item["params"]):
+                ret[param].update({"param_names": [param_name], "params": [param], **cur_params})
+        else:
+            for param in item["params"]:
+                ret[param].update({"params": [param], **cur_params})
+    return list(ret.values())
+
+
+def reduce_param_groups(params: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    # Reorganize the parameter groups and merge duplicated groups.
+    # The number of parameter groups needs to be as small as possible in order
+    # to efficiently use the PyTorch multi-tensor optimizer. Therefore instead
+    # of using a parameter_group per single parameter, we reorganize the
+    # parameter groups and merge duplicated groups. This approach speeds
+    # up multi-tensor optimizer significantly.
+    params = _expand_param_groups(params)
+    groups = defaultdict(list)  # re-group all parameter groups by their hyperparams
+    for item in params:
+        cur_params = tuple((x, y) for x, y in item.items() if x != "params" and x != "param_names")
+        groups[cur_params].append({"params": item["params"]})
+        if "param_names" in item:
+            groups[cur_params][-1]["param_names"] = item["param_names"]
+
+    ret = []
+    for param_keys, param_values in groups.items():
+        cur = {kv[0]: kv[1] for kv in param_keys}
+        cur["params"] = list(
+            itertools.chain.from_iterable([params["params"] for params in param_values])
+        )
+        if len(param_values) > 0 and "param_names" in param_values[0]:
+            cur["param_names"] = list(
+                itertools.chain.from_iterable([params["param_names"] for params in param_values])
+            )
+        ret.append(cur)
+    return ret
+
+
+def build_lr_scheduler(cfg: CfgNode, optimizer: torch.optim.Optimizer) -> LRScheduler:
+    """
+    Build a LR scheduler from config.
+    """
+    name = cfg.SOLVER.LR_SCHEDULER_NAME
+
+    if name == "WarmupMultiStepLR":
+        steps = [x for x in cfg.SOLVER.STEPS if x <= cfg.SOLVER.MAX_ITER]
+        if len(steps) != len(cfg.SOLVER.STEPS):
+            logger = logging.getLogger(__name__)
+            logger.warning(
+                "SOLVER.STEPS contains values larger than SOLVER.MAX_ITER. "
+                "These values will be ignored."
+            )
+        sched = MultiStepParamScheduler(
+            values=[cfg.SOLVER.GAMMA**k for k in range(len(steps) + 1)],
+            milestones=steps,
+            num_updates=cfg.SOLVER.MAX_ITER,
+        )
+    elif name == "WarmupCosineLR":
+        end_value = cfg.SOLVER.BASE_LR_END / cfg.SOLVER.BASE_LR
+        assert end_value >= 0.0 and end_value <= 1.0, end_value
+        sched = CosineParamScheduler(1, end_value)
+    elif name == "WarmupStepWithFixedGammaLR":
+        sched = StepWithFixedGammaParamScheduler(
+            base_value=1.0,
+            gamma=cfg.SOLVER.GAMMA,
+            num_decays=cfg.SOLVER.NUM_DECAYS,
+            num_updates=cfg.SOLVER.MAX_ITER,
+        )
+    else:
+        raise ValueError("Unknown LR scheduler: {}".format(name))
+
+    sched = WarmupParamScheduler(
+        sched,
+        cfg.SOLVER.WARMUP_FACTOR,
+        min(cfg.SOLVER.WARMUP_ITERS / cfg.SOLVER.MAX_ITER, 1.0),
+        cfg.SOLVER.WARMUP_METHOD,
+        cfg.SOLVER.RESCALE_INTERVAL,
+    )
+    return LRMultiplier(optimizer, multiplier=sched, max_iter=cfg.SOLVER.MAX_ITER)
diff --git a/detectron2/solver/lr_scheduler.py b/detectron2/solver/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..01e1eb7854a9662b9595a7ffa9b0e484faf34dff
--- /dev/null
+++ b/detectron2/solver/lr_scheduler.py
@@ -0,0 +1,247 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+import math
+from bisect import bisect_right
+from typing import List
+import torch
+from fvcore.common.param_scheduler import (
+    CompositeParamScheduler,
+    ConstantParamScheduler,
+    LinearParamScheduler,
+    ParamScheduler,
+)
+
+try:
+    from torch.optim.lr_scheduler import LRScheduler
+except ImportError:
+    from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+
+logger = logging.getLogger(__name__)
+
+
+class WarmupParamScheduler(CompositeParamScheduler):
+    """
+    Add an initial warmup stage to another scheduler.
+    """
+
+    def __init__(
+        self,
+        scheduler: ParamScheduler,
+        warmup_factor: float,
+        warmup_length: float,
+        warmup_method: str = "linear",
+        rescale_interval: bool = False,
+    ):
+        """
+        Args:
+            scheduler: warmup will be added at the beginning of this scheduler
+            warmup_factor: the factor w.r.t the initial value of ``scheduler``, e.g. 0.001
+            warmup_length: the relative length (in [0, 1]) of warmup steps w.r.t the entire
+                training, e.g. 0.01
+            warmup_method: one of "linear" or "constant"
+            rescale_interval: whether we will rescale the interval of the scheduler after
+                warmup
+        """
+        # the value to reach when warmup ends
+        end_value = scheduler(0.0) if rescale_interval else scheduler(warmup_length)
+        start_value = warmup_factor * scheduler(0.0)
+        if warmup_method == "constant":
+            warmup = ConstantParamScheduler(start_value)
+        elif warmup_method == "linear":
+            warmup = LinearParamScheduler(start_value, end_value)
+        else:
+            raise ValueError("Unknown warmup method: {}".format(warmup_method))
+        super().__init__(
+            [warmup, scheduler],
+            interval_scaling=["rescaled", "rescaled" if rescale_interval else "fixed"],
+            lengths=[warmup_length, 1 - warmup_length],
+        )
+
+
+class LRMultiplier(LRScheduler):
+    """
+    A LRScheduler which uses fvcore :class:`ParamScheduler` to multiply the
+    learning rate of each param in the optimizer.
+    Every step, the learning rate of each parameter becomes its initial value
+    multiplied by the output of the given :class:`ParamScheduler`.
+
+    The absolute learning rate value of each parameter can be different.
+    This scheduler can be used as long as the relative scale among them do
+    not change during training.
+
+    Examples:
+    ::
+        LRMultiplier(
+            opt,
+            WarmupParamScheduler(
+                MultiStepParamScheduler(
+                    [1, 0.1, 0.01],
+                    milestones=[60000, 80000],
+                    num_updates=90000,
+                ), 0.001, 100 / 90000
+            ),
+            max_iter=90000
+        )
+    """
+
+    # NOTES: in the most general case, every LR can use its own scheduler.
+    # Supporting this requires interaction with the optimizer when its parameter
+    # group is initialized. For example, classyvision implements its own optimizer
+    # that allows different schedulers for every parameter group.
+    # To avoid this complexity, we use this class to support the most common cases
+    # where the relative scale among all LRs stay unchanged during training.  In this
+    # case we only need a total of one scheduler that defines the relative LR multiplier.
+
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        multiplier: ParamScheduler,
+        max_iter: int,
+        last_iter: int = -1,
+    ):
+        """
+        Args:
+            optimizer, last_iter: See ``torch.optim.lr_scheduler.LRScheduler``.
+                ``last_iter`` is the same as ``last_epoch``.
+            multiplier: a fvcore ParamScheduler that defines the multiplier on
+                every LR of the optimizer
+            max_iter: the total number of training iterations
+        """
+        if not isinstance(multiplier, ParamScheduler):
+            raise ValueError(
+                "_LRMultiplier(multiplier=) must be an instance of fvcore "
+                f"ParamScheduler. Got {multiplier} instead."
+            )
+        self._multiplier = multiplier
+        self._max_iter = max_iter
+        super().__init__(optimizer, last_epoch=last_iter)
+
+    def state_dict(self):
+        # fvcore schedulers are stateless. Only keep pytorch scheduler states
+        return {"base_lrs": self.base_lrs, "last_epoch": self.last_epoch}
+
+    def get_lr(self) -> List[float]:
+        multiplier = self._multiplier(self.last_epoch / self._max_iter)
+        return [base_lr * multiplier for base_lr in self.base_lrs]
+
+
+"""
+Content below is no longer needed!
+"""
+
+# NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes
+# only on epoch boundaries. We typically use iteration based schedules instead.
+# As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean
+# "iteration" instead.
+
+# FIXME: ideally this would be achieved with a CombinedLRScheduler, separating
+# MultiStepLR with WarmupLR but the current LRScheduler design doesn't allow it.
+
+
+class WarmupMultiStepLR(LRScheduler):
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        milestones: List[int],
+        gamma: float = 0.1,
+        warmup_factor: float = 0.001,
+        warmup_iters: int = 1000,
+        warmup_method: str = "linear",
+        last_epoch: int = -1,
+    ):
+        logger.warning(
+            "WarmupMultiStepLR is deprecated! Use LRMultipilier with fvcore ParamScheduler instead!"
+        )
+        if not list(milestones) == sorted(milestones):
+            raise ValueError(
+                "Milestones should be a list of" " increasing integers. Got {}", milestones
+            )
+        self.milestones = milestones
+        self.gamma = gamma
+        self.warmup_factor = warmup_factor
+        self.warmup_iters = warmup_iters
+        self.warmup_method = warmup_method
+        super().__init__(optimizer, last_epoch)
+
+    def get_lr(self) -> List[float]:
+        warmup_factor = _get_warmup_factor_at_iter(
+            self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
+        )
+        return [
+            base_lr * warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch)
+            for base_lr in self.base_lrs
+        ]
+
+    def _compute_values(self) -> List[float]:
+        # The new interface
+        return self.get_lr()
+
+
+class WarmupCosineLR(LRScheduler):
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        max_iters: int,
+        warmup_factor: float = 0.001,
+        warmup_iters: int = 1000,
+        warmup_method: str = "linear",
+        last_epoch: int = -1,
+    ):
+        logger.warning(
+            "WarmupCosineLR is deprecated! Use LRMultipilier with fvcore ParamScheduler instead!"
+        )
+        self.max_iters = max_iters
+        self.warmup_factor = warmup_factor
+        self.warmup_iters = warmup_iters
+        self.warmup_method = warmup_method
+        super().__init__(optimizer, last_epoch)
+
+    def get_lr(self) -> List[float]:
+        warmup_factor = _get_warmup_factor_at_iter(
+            self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
+        )
+        # Different definitions of half-cosine with warmup are possible. For
+        # simplicity we multiply the standard half-cosine schedule by the warmup
+        # factor. An alternative is to start the period of the cosine at warmup_iters
+        # instead of at 0. In the case that warmup_iters << max_iters the two are
+        # very close to each other.
+        return [
+            base_lr
+            * warmup_factor
+            * 0.5
+            * (1.0 + math.cos(math.pi * self.last_epoch / self.max_iters))
+            for base_lr in self.base_lrs
+        ]
+
+    def _compute_values(self) -> List[float]:
+        # The new interface
+        return self.get_lr()
+
+
+def _get_warmup_factor_at_iter(
+    method: str, iter: int, warmup_iters: int, warmup_factor: float
+) -> float:
+    """
+    Return the learning rate warmup factor at a specific iteration.
+    See :paper:`ImageNet in 1h` for more details.
+
+    Args:
+        method (str): warmup method; either "constant" or "linear".
+        iter (int): iteration at which to calculate the warmup factor.
+        warmup_iters (int): the number of warmup iterations.
+        warmup_factor (float): the base warmup factor (the meaning changes according
+            to the method used).
+
+    Returns:
+        float: the effective warmup factor at the given iteration.
+    """
+    if iter >= warmup_iters:
+        return 1.0
+
+    if method == "constant":
+        return warmup_factor
+    elif method == "linear":
+        alpha = iter / warmup_iters
+        return warmup_factor * (1 - alpha) + alpha
+    else:
+        raise ValueError("Unknown warmup method: {}".format(method))
diff --git a/detectron2/structures/__init__.py b/detectron2/structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3ee6057e3ec2731984ce8203c6eaf5348d08260
--- /dev/null
+++ b/detectron2/structures/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .boxes import Boxes, BoxMode, pairwise_iou, pairwise_ioa, pairwise_point_box_distance
+from .image_list import ImageList
+
+from .instances import Instances
+from .keypoints import Keypoints, heatmaps_to_keypoints
+from .masks import BitMasks, PolygonMasks, polygons_to_bitmask, ROIMasks
+from .rotated_boxes import RotatedBoxes
+from .rotated_boxes import pairwise_iou as pairwise_iou_rotated
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
+
+
+from detectron2.utils.env import fixup_module_metadata
+
+fixup_module_metadata(__name__, globals(), __all__)
+del fixup_module_metadata
diff --git a/detectron2/structures/__pycache__/__init__.cpython-310.pyc b/detectron2/structures/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89de39165d0f974e1e08264b81f8330718052457
Binary files /dev/null and b/detectron2/structures/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/structures/__pycache__/__init__.cpython-39.pyc b/detectron2/structures/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f466a024ff2c3f4b7cccdb3487575fa2eb6bf43
Binary files /dev/null and b/detectron2/structures/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/structures/__pycache__/boxes.cpython-310.pyc b/detectron2/structures/__pycache__/boxes.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1418f7d16b4f49fdcdbef7474f79c00b7dfc5f8
Binary files /dev/null and b/detectron2/structures/__pycache__/boxes.cpython-310.pyc differ
diff --git a/detectron2/structures/__pycache__/boxes.cpython-39.pyc b/detectron2/structures/__pycache__/boxes.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83d50584cd1a255e1b21274a5670bab0543a5420
Binary files /dev/null and b/detectron2/structures/__pycache__/boxes.cpython-39.pyc differ
diff --git a/detectron2/structures/__pycache__/image_list.cpython-310.pyc b/detectron2/structures/__pycache__/image_list.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..becae68c0154c5492ef4a9ef59ca289e69978171
Binary files /dev/null and b/detectron2/structures/__pycache__/image_list.cpython-310.pyc differ
diff --git a/detectron2/structures/__pycache__/image_list.cpython-39.pyc b/detectron2/structures/__pycache__/image_list.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cec7ecadf584c8a121f61ef4acc947941cfe1514
Binary files /dev/null and b/detectron2/structures/__pycache__/image_list.cpython-39.pyc differ
diff --git a/detectron2/structures/__pycache__/instances.cpython-310.pyc b/detectron2/structures/__pycache__/instances.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..edba19582174bc497a7e3d3ce89216793f11f47f
Binary files /dev/null and b/detectron2/structures/__pycache__/instances.cpython-310.pyc differ
diff --git a/detectron2/structures/__pycache__/instances.cpython-39.pyc b/detectron2/structures/__pycache__/instances.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c0dcbaa2a12a12eb959d5ccb77c7711d1760660
Binary files /dev/null and b/detectron2/structures/__pycache__/instances.cpython-39.pyc differ
diff --git a/detectron2/structures/__pycache__/keypoints.cpython-310.pyc b/detectron2/structures/__pycache__/keypoints.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c310d36d6478ed739307af92abc3c2c886ea3952
Binary files /dev/null and b/detectron2/structures/__pycache__/keypoints.cpython-310.pyc differ
diff --git a/detectron2/structures/__pycache__/keypoints.cpython-39.pyc b/detectron2/structures/__pycache__/keypoints.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e60aa6ae4a3bf2f0861c4d14a18ff6034a89a483
Binary files /dev/null and b/detectron2/structures/__pycache__/keypoints.cpython-39.pyc differ
diff --git a/detectron2/structures/__pycache__/masks.cpython-310.pyc b/detectron2/structures/__pycache__/masks.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ffa47c8b792b4a626a4167c7915bc3c97e033769
Binary files /dev/null and b/detectron2/structures/__pycache__/masks.cpython-310.pyc differ
diff --git a/detectron2/structures/__pycache__/masks.cpython-39.pyc b/detectron2/structures/__pycache__/masks.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5766d9c1ba6daaa9b022cf607b7978b1719b2e49
Binary files /dev/null and b/detectron2/structures/__pycache__/masks.cpython-39.pyc differ
diff --git a/detectron2/structures/__pycache__/rotated_boxes.cpython-310.pyc b/detectron2/structures/__pycache__/rotated_boxes.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e09040fa6a38b37018e2140c1aaa2847d49c5d08
Binary files /dev/null and b/detectron2/structures/__pycache__/rotated_boxes.cpython-310.pyc differ
diff --git a/detectron2/structures/__pycache__/rotated_boxes.cpython-39.pyc b/detectron2/structures/__pycache__/rotated_boxes.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba67d2184f96c5c040132eab18b4d5cfc355855c
Binary files /dev/null and b/detectron2/structures/__pycache__/rotated_boxes.cpython-39.pyc differ
diff --git a/detectron2/structures/boxes.py b/detectron2/structures/boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd396f68645db1d6946056eed868ffcc02cd7a22
--- /dev/null
+++ b/detectron2/structures/boxes.py
@@ -0,0 +1,425 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import math
+import numpy as np
+from enum import IntEnum, unique
+from typing import List, Tuple, Union
+import torch
+from torch import device
+
+_RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray]
+
+
+@unique
+class BoxMode(IntEnum):
+    """
+    Enum of different ways to represent a box.
+    """
+
+    XYXY_ABS = 0
+    """
+    (x0, y0, x1, y1) in absolute floating points coordinates.
+    The coordinates in range [0, width or height].
+    """
+    XYWH_ABS = 1
+    """
+    (x0, y0, w, h) in absolute floating points coordinates.
+    """
+    XYXY_REL = 2
+    """
+    Not yet supported!
+    (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image.
+    """
+    XYWH_REL = 3
+    """
+    Not yet supported!
+    (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image.
+    """
+    XYWHA_ABS = 4
+    """
+    (xc, yc, w, h, a) in absolute floating points coordinates.
+    (xc, yc) is the center of the rotated box, and the angle a is in degrees ccw.
+    """
+
+    @staticmethod
+    def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType:
+        """
+        Args:
+            box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5
+            from_mode, to_mode (BoxMode)
+
+        Returns:
+            The converted box of the same type.
+        """
+        if from_mode == to_mode:
+            return box
+
+        original_type = type(box)
+        is_numpy = isinstance(box, np.ndarray)
+        single_box = isinstance(box, (list, tuple))
+        if single_box:
+            assert len(box) == 4 or len(box) == 5, (
+                "BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor,"
+                " where k == 4 or 5"
+            )
+            arr = torch.tensor(box)[None, :]
+        else:
+            # avoid modifying the input box
+            if is_numpy:
+                arr = torch.from_numpy(np.asarray(box)).clone()
+            else:
+                arr = box.clone()
+
+        assert to_mode not in [BoxMode.XYXY_REL, BoxMode.XYWH_REL] and from_mode not in [
+            BoxMode.XYXY_REL,
+            BoxMode.XYWH_REL,
+        ], "Relative mode not yet supported!"
+
+        if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS:
+            assert (
+                arr.shape[-1] == 5
+            ), "The last dimension of input shape must be 5 for XYWHA format"
+            original_dtype = arr.dtype
+            arr = arr.double()
+
+            w = arr[:, 2]
+            h = arr[:, 3]
+            a = arr[:, 4]
+            c = torch.abs(torch.cos(a * math.pi / 180.0))
+            s = torch.abs(torch.sin(a * math.pi / 180.0))
+            # This basically computes the horizontal bounding rectangle of the rotated box
+            new_w = c * w + s * h
+            new_h = c * h + s * w
+
+            # convert center to top-left corner
+            arr[:, 0] -= new_w / 2.0
+            arr[:, 1] -= new_h / 2.0
+            # bottom-right corner
+            arr[:, 2] = arr[:, 0] + new_w
+            arr[:, 3] = arr[:, 1] + new_h
+
+            arr = arr[:, :4].to(dtype=original_dtype)
+        elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS:
+            original_dtype = arr.dtype
+            arr = arr.double()
+            arr[:, 0] += arr[:, 2] / 2.0
+            arr[:, 1] += arr[:, 3] / 2.0
+            angles = torch.zeros((arr.shape[0], 1), dtype=arr.dtype)
+            arr = torch.cat((arr, angles), axis=1).to(dtype=original_dtype)
+        else:
+            if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS:
+                arr[:, 2] += arr[:, 0]
+                arr[:, 3] += arr[:, 1]
+            elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS:
+                arr[:, 2] -= arr[:, 0]
+                arr[:, 3] -= arr[:, 1]
+            else:
+                raise NotImplementedError(
+                    "Conversion from BoxMode {} to {} is not supported yet".format(
+                        from_mode, to_mode
+                    )
+                )
+
+        if single_box:
+            return original_type(arr.flatten().tolist())
+        if is_numpy:
+            return arr.numpy()
+        else:
+            return arr
+
+
+class Boxes:
+    """
+    This structure stores a list of boxes as a Nx4 torch.Tensor.
+    It supports some common methods about boxes
+    (`area`, `clip`, `nonempty`, etc),
+    and also behaves like a Tensor
+    (support indexing, `to(device)`, `.device`, and iteration over all boxes)
+
+    Attributes:
+        tensor (torch.Tensor): float matrix of Nx4. Each row is (x1, y1, x2, y2).
+    """
+
+    def __init__(self, tensor: torch.Tensor):
+        """
+        Args:
+            tensor (Tensor[float]): a Nx4 matrix.  Each row is (x1, y1, x2, y2).
+        """
+        if not isinstance(tensor, torch.Tensor):
+            tensor = torch.as_tensor(tensor, dtype=torch.float32, device=torch.device("cpu"))
+        else:
+            tensor = tensor.to(torch.float32)
+        if tensor.numel() == 0:
+            # Use reshape, so we don't end up creating a new tensor that does not depend on
+            # the inputs (and consequently confuses jit)
+            tensor = tensor.reshape((-1, 4)).to(dtype=torch.float32)
+        assert tensor.dim() == 2 and tensor.size(-1) == 4, tensor.size()
+
+        self.tensor = tensor
+
+    def clone(self) -> "Boxes":
+        """
+        Clone the Boxes.
+
+        Returns:
+            Boxes
+        """
+        return Boxes(self.tensor.clone())
+
+    def to(self, device: torch.device):
+        # Boxes are assumed float32 and does not support to(dtype)
+        return Boxes(self.tensor.to(device=device))
+
+    def area(self) -> torch.Tensor:
+        """
+        Computes the area of all the boxes.
+
+        Returns:
+            torch.Tensor: a vector with areas of each box.
+        """
+        box = self.tensor
+        area = (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1])
+        return area
+
+    def clip(self, box_size: Tuple[int, int]) -> None:
+        """
+        Clip (in place) the boxes by limiting x coordinates to the range [0, width]
+        and y coordinates to the range [0, height].
+
+        Args:
+            box_size (height, width): The clipping box's size.
+        """
+        assert torch.isfinite(self.tensor).all(), "Box tensor contains infinite or NaN!"
+        h, w = box_size
+        x1 = self.tensor[:, 0].clamp(min=0, max=w)
+        y1 = self.tensor[:, 1].clamp(min=0, max=h)
+        x2 = self.tensor[:, 2].clamp(min=0, max=w)
+        y2 = self.tensor[:, 3].clamp(min=0, max=h)
+        self.tensor = torch.stack((x1, y1, x2, y2), dim=-1)
+
+    def nonempty(self, threshold: float = 0.0) -> torch.Tensor:
+        """
+        Find boxes that are non-empty.
+        A box is considered empty, if either of its side is no larger than threshold.
+
+        Returns:
+            Tensor:
+                a binary vector which represents whether each box is empty
+                (False) or non-empty (True).
+        """
+        box = self.tensor
+        widths = box[:, 2] - box[:, 0]
+        heights = box[:, 3] - box[:, 1]
+        keep = (widths > threshold) & (heights > threshold)
+        return keep
+
+    def __getitem__(self, item) -> "Boxes":
+        """
+        Args:
+            item: int, slice, or a BoolTensor
+
+        Returns:
+            Boxes: Create a new :class:`Boxes` by indexing.
+
+        The following usage are allowed:
+
+        1. `new_boxes = boxes[3]`: return a `Boxes` which contains only one box.
+        2. `new_boxes = boxes[2:10]`: return a slice of boxes.
+        3. `new_boxes = boxes[vector]`, where vector is a torch.BoolTensor
+           with `length = len(boxes)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned Boxes might share storage with this Boxes,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return Boxes(self.tensor[item].view(1, -1))
+        b = self.tensor[item]
+        assert b.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item)
+        return Boxes(b)
+
+    def __len__(self) -> int:
+        return self.tensor.shape[0]
+
+    def __repr__(self) -> str:
+        return "Boxes(" + str(self.tensor) + ")"
+
+    def inside_box(self, box_size: Tuple[int, int], boundary_threshold: int = 0) -> torch.Tensor:
+        """
+        Args:
+            box_size (height, width): Size of the reference box.
+            boundary_threshold (int): Boxes that extend beyond the reference box
+                boundary by more than boundary_threshold are considered "outside".
+
+        Returns:
+            a binary vector, indicating whether each box is inside the reference box.
+        """
+        height, width = box_size
+        inds_inside = (
+            (self.tensor[..., 0] >= -boundary_threshold)
+            & (self.tensor[..., 1] >= -boundary_threshold)
+            & (self.tensor[..., 2] < width + boundary_threshold)
+            & (self.tensor[..., 3] < height + boundary_threshold)
+        )
+        return inds_inside
+
+    def get_centers(self) -> torch.Tensor:
+        """
+        Returns:
+            The box centers in a Nx2 array of (x, y).
+        """
+        return (self.tensor[:, :2] + self.tensor[:, 2:]) / 2
+
+    def scale(self, scale_x: float, scale_y: float) -> None:
+        """
+        Scale the box with horizontal and vertical scaling factors
+        """
+        self.tensor[:, 0::2] *= scale_x
+        self.tensor[:, 1::2] *= scale_y
+
+    @classmethod
+    def cat(cls, boxes_list: List["Boxes"]) -> "Boxes":
+        """
+        Concatenates a list of Boxes into a single Boxes
+
+        Arguments:
+            boxes_list (list[Boxes])
+
+        Returns:
+            Boxes: the concatenated Boxes
+        """
+        assert isinstance(boxes_list, (list, tuple))
+        if len(boxes_list) == 0:
+            return cls(torch.empty(0))
+        assert all([isinstance(box, Boxes) for box in boxes_list])
+
+        # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input
+        cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0))
+        return cat_boxes
+
+    @property
+    def device(self) -> device:
+        return self.tensor.device
+
+    # type "Iterator[torch.Tensor]", yield, and iter() not supported by torchscript
+    # https://github.com/pytorch/pytorch/issues/18627
+    @torch.jit.unused
+    def __iter__(self):
+        """
+        Yield a box as a Tensor of shape (4,) at a time.
+        """
+        yield from self.tensor
+
+
+def pairwise_intersection(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
+    """
+    Given two lists of boxes of size N and M,
+    compute the intersection area between __all__ N x M pairs of boxes.
+    The box order must be (xmin, ymin, xmax, ymax)
+
+    Args:
+        boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.
+
+    Returns:
+        Tensor: intersection, sized [N,M].
+    """
+    boxes1, boxes2 = boxes1.tensor, boxes2.tensor
+    width_height = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) - torch.max(
+        boxes1[:, None, :2], boxes2[:, :2]
+    )  # [N,M,2]
+
+    width_height.clamp_(min=0)  # [N,M,2]
+    intersection = width_height.prod(dim=2)  # [N,M]
+    return intersection
+
+
+# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
+# with slight modifications
+def pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
+    """
+    Given two lists of boxes of size N and M, compute the IoU
+    (intersection over union) between **all** N x M pairs of boxes.
+    The box order must be (xmin, ymin, xmax, ymax).
+
+    Args:
+        boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.
+
+    Returns:
+        Tensor: IoU, sized [N,M].
+    """
+    area1 = boxes1.area()  # [N]
+    area2 = boxes2.area()  # [M]
+    inter = pairwise_intersection(boxes1, boxes2)
+
+    # handle empty boxes
+    iou = torch.where(
+        inter > 0,
+        inter / (area1[:, None] + area2 - inter),
+        torch.zeros(1, dtype=inter.dtype, device=inter.device),
+    )
+    return iou
+
+
+def pairwise_ioa(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
+    """
+    Similar to :func:`pariwise_iou` but compute the IoA (intersection over boxes2 area).
+
+    Args:
+        boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.
+
+    Returns:
+        Tensor: IoA, sized [N,M].
+    """
+    area2 = boxes2.area()  # [M]
+    inter = pairwise_intersection(boxes1, boxes2)
+
+    # handle empty boxes
+    ioa = torch.where(
+        inter > 0, inter / area2, torch.zeros(1, dtype=inter.dtype, device=inter.device)
+    )
+    return ioa
+
+
+def pairwise_point_box_distance(points: torch.Tensor, boxes: Boxes):
+    """
+    Pairwise distance between N points and M boxes. The distance between a
+    point and a box is represented by the distance from the point to 4 edges
+    of the box. Distances are all positive when the point is inside the box.
+
+    Args:
+        points: Nx2 coordinates. Each row is (x, y)
+        boxes: M boxes
+
+    Returns:
+        Tensor: distances of size (N, M, 4). The 4 values are distances from
+            the point to the left, top, right, bottom of the box.
+    """
+    x, y = points.unsqueeze(dim=2).unbind(dim=1)  # (N, 1)
+    x0, y0, x1, y1 = boxes.tensor.unsqueeze(dim=0).unbind(dim=2)  # (1, M)
+    return torch.stack([x - x0, y - y0, x1 - x, y1 - y], dim=2)
+
+
+def matched_pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
+    """
+    Compute pairwise intersection over union (IOU) of two sets of matched
+    boxes that have the same number of boxes.
+    Similar to :func:`pairwise_iou`, but computes only diagonal elements of the matrix.
+
+    Args:
+        boxes1 (Boxes): bounding boxes, sized [N,4].
+        boxes2 (Boxes): same length as boxes1
+    Returns:
+        Tensor: iou, sized [N].
+    """
+    assert len(boxes1) == len(
+        boxes2
+    ), "boxlists should have the same" "number of entries, got {}, {}".format(
+        len(boxes1), len(boxes2)
+    )
+    area1 = boxes1.area()  # [N]
+    area2 = boxes2.area()  # [N]
+    box1, box2 = boxes1.tensor, boxes2.tensor
+    lt = torch.max(box1[:, :2], box2[:, :2])  # [N,2]
+    rb = torch.min(box1[:, 2:], box2[:, 2:])  # [N,2]
+    wh = (rb - lt).clamp(min=0)  # [N,2]
+    inter = wh[:, 0] * wh[:, 1]  # [N]
+    iou = inter / (area1 + area2 - inter)  # [N]
+    return iou
diff --git a/detectron2/structures/image_list.py b/detectron2/structures/image_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4243bb11e8fd95e8732f966f1d840d0560ae4c4
--- /dev/null
+++ b/detectron2/structures/image_list.py
@@ -0,0 +1,129 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from __future__ import division
+from typing import Any, Dict, List, Optional, Tuple
+import torch
+from torch import device
+from torch.nn import functional as F
+
+from detectron2.layers.wrappers import move_device_like, shapes_to_tensor
+
+
+class ImageList:
+    """
+    Structure that holds a list of images (of possibly
+    varying sizes) as a single tensor.
+    This works by padding the images to the same size.
+    The original sizes of each image is stored in `image_sizes`.
+
+    Attributes:
+        image_sizes (list[tuple[int, int]]): each tuple is (h, w).
+            During tracing, it becomes list[Tensor] instead.
+    """
+
+    def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]]):
+        """
+        Arguments:
+            tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1
+            image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can
+                be smaller than (H, W) due to padding.
+        """
+        self.tensor = tensor
+        self.image_sizes = image_sizes
+
+    def __len__(self) -> int:
+        return len(self.image_sizes)
+
+    def __getitem__(self, idx) -> torch.Tensor:
+        """
+        Access the individual image in its original size.
+
+        Args:
+            idx: int or slice
+
+        Returns:
+            Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1
+        """
+        size = self.image_sizes[idx]
+        return self.tensor[idx, ..., : size[0], : size[1]]
+
+    @torch.jit.unused
+    def to(self, *args: Any, **kwargs: Any) -> "ImageList":
+        cast_tensor = self.tensor.to(*args, **kwargs)
+        return ImageList(cast_tensor, self.image_sizes)
+
+    @property
+    def device(self) -> device:
+        return self.tensor.device
+
+    @staticmethod
+    def from_tensors(
+        tensors: List[torch.Tensor],
+        size_divisibility: int = 0,
+        pad_value: float = 0.0,
+        padding_constraints: Optional[Dict[str, int]] = None,
+    ) -> "ImageList":
+        """
+        Args:
+            tensors: a tuple or list of `torch.Tensor`, each of shape (Hi, Wi) or
+                (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded
+                to the same shape with `pad_value`.
+            size_divisibility (int): If `size_divisibility > 0`, add padding to ensure
+                the common height and width is divisible by `size_divisibility`.
+                This depends on the model and many models need a divisibility of 32.
+            pad_value (float): value to pad.
+            padding_constraints (optional[Dict]): If given, it would follow the format as
+                {"size_divisibility": int, "square_size": int}, where `size_divisibility` will
+                overwrite the above one if presented and `square_size` indicates the
+                square padding size if `square_size` > 0.
+        Returns:
+            an `ImageList`.
+        """
+        assert len(tensors) > 0
+        assert isinstance(tensors, (tuple, list))
+        for t in tensors:
+            assert isinstance(t, torch.Tensor), type(t)
+            assert t.shape[:-2] == tensors[0].shape[:-2], t.shape
+
+        image_sizes = [(im.shape[-2], im.shape[-1]) for im in tensors]
+        image_sizes_tensor = [shapes_to_tensor(x) for x in image_sizes]
+        max_size = torch.stack(image_sizes_tensor).max(0).values
+
+        if padding_constraints is not None:
+            square_size = padding_constraints.get("square_size", 0)
+            if square_size > 0:
+                # pad to square.
+                max_size[0] = max_size[1] = square_size
+            if "size_divisibility" in padding_constraints:
+                size_divisibility = padding_constraints["size_divisibility"]
+        if size_divisibility > 1:
+            stride = size_divisibility
+            # the last two dims are H,W, both subject to divisibility requirement
+            max_size = (max_size + (stride - 1)).div(stride, rounding_mode="floor") * stride
+
+        # handle weirdness of scripting and tracing ...
+        if torch.jit.is_scripting():
+            max_size: List[int] = max_size.to(dtype=torch.long).tolist()
+        else:
+            if torch.jit.is_tracing():
+                image_sizes = image_sizes_tensor
+
+        if len(tensors) == 1:
+            # This seems slightly (2%) faster.
+            # TODO: check whether it's faster for multiple images as well
+            image_size = image_sizes[0]
+            padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]]
+            batched_imgs = F.pad(tensors[0], padding_size, value=pad_value).unsqueeze_(0)
+        else:
+            # max_size can be a tensor in tracing mode, therefore convert to list
+            batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size)
+            device = (
+                None if torch.jit.is_scripting() else ("cpu" if torch.jit.is_tracing() else None)
+            )
+            batched_imgs = tensors[0].new_full(batch_shape, pad_value, device=device)
+            batched_imgs = move_device_like(batched_imgs, tensors[0])
+            for i, img in enumerate(tensors):
+                # Use `batched_imgs` directly instead of `img, pad_img = zip(tensors, batched_imgs)`
+                # Tracing mode cannot capture `copy_()` of temporary locals
+                batched_imgs[i, ..., : img.shape[-2], : img.shape[-1]].copy_(img)
+
+        return ImageList(batched_imgs.contiguous(), image_sizes)
diff --git a/detectron2/structures/instances.py b/detectron2/structures/instances.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9579bce2730f42e256c6eed99d9014d09304c99
--- /dev/null
+++ b/detectron2/structures/instances.py
@@ -0,0 +1,194 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import warnings
+from typing import Any, Dict, List, Tuple, Union
+import torch
+
+
+class Instances:
+    """
+    This class represents a list of instances in an image.
+    It stores the attributes of instances (e.g., boxes, masks, labels, scores) as "fields".
+    All fields must have the same ``__len__`` which is the number of instances.
+
+    All other (non-field) attributes of this class are considered private:
+    they must start with '_' and are not modifiable by a user.
+
+    Some basic usage:
+
+    1. Set/get/check a field:
+
+       .. code-block:: python
+
+          instances.gt_boxes = Boxes(...)
+          print(instances.pred_masks)  # a tensor of shape (N, H, W)
+          print('gt_masks' in instances)
+
+    2. ``len(instances)`` returns the number of instances
+    3. Indexing: ``instances[indices]`` will apply the indexing on all the fields
+       and returns a new :class:`Instances`.
+       Typically, ``indices`` is a integer vector of indices,
+       or a binary mask of length ``num_instances``
+
+       .. code-block:: python
+
+          category_3_detections = instances[instances.pred_classes == 3]
+          confident_detections = instances[instances.scores > 0.9]
+    """
+
+    def __init__(self, image_size: Tuple[int, int], **kwargs: Any):
+        """
+        Args:
+            image_size (height, width): the spatial size of the image.
+            kwargs: fields to add to this `Instances`.
+        """
+        self._image_size = image_size
+        self._fields: Dict[str, Any] = {}
+        for k, v in kwargs.items():
+            self.set(k, v)
+
+    @property
+    def image_size(self) -> Tuple[int, int]:
+        """
+        Returns:
+            tuple: height, width
+        """
+        return self._image_size
+
+    def __setattr__(self, name: str, val: Any) -> None:
+        if name.startswith("_"):
+            super().__setattr__(name, val)
+        else:
+            self.set(name, val)
+
+    def __getattr__(self, name: str) -> Any:
+        if name == "_fields" or name not in self._fields:
+            raise AttributeError("Cannot find field '{}' in the given Instances!".format(name))
+        return self._fields[name]
+
+    def set(self, name: str, value: Any) -> None:
+        """
+        Set the field named `name` to `value`.
+        The length of `value` must be the number of instances,
+        and must agree with other existing fields in this object.
+        """
+        with warnings.catch_warnings(record=True):
+            data_len = len(value)
+        if len(self._fields):
+            assert (
+                len(self) == data_len
+            ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self))
+        self._fields[name] = value
+
+    def has(self, name: str) -> bool:
+        """
+        Returns:
+            bool: whether the field called `name` exists.
+        """
+        return name in self._fields
+
+    def remove(self, name: str) -> None:
+        """
+        Remove the field called `name`.
+        """
+        del self._fields[name]
+
+    def get(self, name: str) -> Any:
+        """
+        Returns the field called `name`.
+        """
+        return self._fields[name]
+
+    def get_fields(self) -> Dict[str, Any]:
+        """
+        Returns:
+            dict: a dict which maps names (str) to data of the fields
+
+        Modifying the returned dict will modify this instance.
+        """
+        return self._fields
+
+    # Tensor-like methods
+    def to(self, *args: Any, **kwargs: Any) -> "Instances":
+        """
+        Returns:
+            Instances: all fields are called with a `to(device)`, if the field has this method.
+        """
+        ret = Instances(self._image_size)
+        for k, v in self._fields.items():
+            if hasattr(v, "to"):
+                v = v.to(*args, **kwargs)
+            ret.set(k, v)
+        return ret
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Instances":
+        """
+        Args:
+            item: an index-like object and will be used to index all the fields.
+
+        Returns:
+            If `item` is a string, return the data in the corresponding field.
+            Otherwise, returns an `Instances` where all fields are indexed by `item`.
+        """
+        if type(item) == int:
+            if item >= len(self) or item < -len(self):
+                raise IndexError("Instances index out of range!")
+            else:
+                item = slice(item, None, len(self))
+
+        ret = Instances(self._image_size)
+        for k, v in self._fields.items():
+            ret.set(k, v[item])
+        return ret
+
+    def __len__(self) -> int:
+        for v in self._fields.values():
+            # use __len__ because len() has to be int and is not friendly to tracing
+            return v.__len__()
+        raise NotImplementedError("Empty Instances does not support __len__!")
+
+    def __iter__(self):
+        raise NotImplementedError("`Instances` object is not iterable!")
+
+    @staticmethod
+    def cat(instance_lists: List["Instances"]) -> "Instances":
+        """
+        Args:
+            instance_lists (list[Instances])
+
+        Returns:
+            Instances
+        """
+        assert all(isinstance(i, Instances) for i in instance_lists)
+        assert len(instance_lists) > 0
+        if len(instance_lists) == 1:
+            return instance_lists[0]
+
+        image_size = instance_lists[0].image_size
+        if not isinstance(image_size, torch.Tensor):  # could be a tensor in tracing
+            for i in instance_lists[1:]:
+                assert i.image_size == image_size
+        ret = Instances(image_size)
+        for k in instance_lists[0]._fields.keys():
+            values = [i.get(k) for i in instance_lists]
+            v0 = values[0]
+            if isinstance(v0, torch.Tensor):
+                values = torch.cat(values, dim=0)
+            elif isinstance(v0, list):
+                values = list(itertools.chain(*values))
+            elif hasattr(type(v0), "cat"):
+                values = type(v0).cat(values)
+            else:
+                raise ValueError("Unsupported type {} for concatenation".format(type(v0)))
+            ret.set(k, values)
+        return ret
+
+    def __str__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={}, ".format(len(self))
+        s += "image_height={}, ".format(self._image_size[0])
+        s += "image_width={}, ".format(self._image_size[1])
+        s += "fields=[{}])".format(", ".join((f"{k}: {v}" for k, v in self._fields.items())))
+        return s
+
+    __repr__ = __str__
diff --git a/detectron2/structures/keypoints.py b/detectron2/structures/keypoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..b93ebed4f6554e67ba9bde8d3af90e8dbb3246b6
--- /dev/null
+++ b/detectron2/structures/keypoints.py
@@ -0,0 +1,235 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from typing import Any, List, Tuple, Union
+import torch
+from torch.nn import functional as F
+
+
+class Keypoints:
+    """
+    Stores keypoint **annotation** data. GT Instances have a `gt_keypoints` property
+    containing the x,y location and visibility flag of each keypoint. This tensor has shape
+    (N, K, 3) where N is the number of instances and K is the number of keypoints per instance.
+
+    The visibility flag follows the COCO format and must be one of three integers:
+
+    * v=0: not labeled (in which case x=y=0)
+    * v=1: labeled but not visible
+    * v=2: labeled and visible
+    """
+
+    def __init__(self, keypoints: Union[torch.Tensor, np.ndarray, List[List[float]]]):
+        """
+        Arguments:
+            keypoints: A Tensor, numpy array, or list of the x, y, and visibility of each keypoint.
+                The shape should be (N, K, 3) where N is the number of
+                instances, and K is the number of keypoints per instance.
+        """
+        device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device("cpu")
+        keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device)
+        assert keypoints.dim() == 3 and keypoints.shape[2] == 3, keypoints.shape
+        self.tensor = keypoints
+
+    def __len__(self) -> int:
+        return self.tensor.size(0)
+
+    def to(self, *args: Any, **kwargs: Any) -> "Keypoints":
+        return type(self)(self.tensor.to(*args, **kwargs))
+
+    @property
+    def device(self) -> torch.device:
+        return self.tensor.device
+
+    def to_heatmap(self, boxes: torch.Tensor, heatmap_size: int) -> torch.Tensor:
+        """
+        Convert keypoint annotations to a heatmap of one-hot labels for training,
+        as described in :paper:`Mask R-CNN`.
+
+        Arguments:
+            boxes: Nx4 tensor, the boxes to draw the keypoints to
+
+        Returns:
+            heatmaps:
+                A tensor of shape (N, K), each element is integer spatial label
+                in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
+            valid:
+                A tensor of shape (N, K) containing whether each keypoint is in the roi or not.
+        """
+        return _keypoints_to_heatmap(self.tensor, boxes, heatmap_size)
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Keypoints":
+        """
+        Create a new `Keypoints` by indexing on this `Keypoints`.
+
+        The following usage are allowed:
+
+        1. `new_kpts = kpts[3]`: return a `Keypoints` which contains only one instance.
+        2. `new_kpts = kpts[2:10]`: return a slice of key points.
+        3. `new_kpts = kpts[vector]`, where vector is a torch.ByteTensor
+           with `length = len(kpts)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned Keypoints might share storage with this Keypoints,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return Keypoints([self.tensor[item]])
+        return Keypoints(self.tensor[item])
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={})".format(len(self.tensor))
+        return s
+
+    @staticmethod
+    def cat(keypoints_list: List["Keypoints"]) -> "Keypoints":
+        """
+        Concatenates a list of Keypoints into a single Keypoints
+
+        Arguments:
+            keypoints_list (list[Keypoints])
+
+        Returns:
+            Keypoints: the concatenated Keypoints
+        """
+        assert isinstance(keypoints_list, (list, tuple))
+        assert len(keypoints_list) > 0
+        assert all(isinstance(keypoints, Keypoints) for keypoints in keypoints_list)
+
+        cat_kpts = type(keypoints_list[0])(
+            torch.cat([kpts.tensor for kpts in keypoints_list], dim=0)
+        )
+        return cat_kpts
+
+
+# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop)
+def _keypoints_to_heatmap(
+    keypoints: torch.Tensor, rois: torch.Tensor, heatmap_size: int
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Encode keypoint locations into a target heatmap for use in SoftmaxWithLoss across space.
+
+    Maps keypoints from the half-open interval [x1, x2) on continuous image coordinates to the
+    closed interval [0, heatmap_size - 1] on discrete image coordinates. We use the
+    continuous-discrete conversion from Heckbert 1990 ("What is the coordinate of a pixel?"):
+    d = floor(c) and c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
+
+    Arguments:
+        keypoints: tensor of keypoint locations in of shape (N, K, 3).
+        rois: Nx4 tensor of rois in xyxy format
+        heatmap_size: integer side length of square heatmap.
+
+    Returns:
+        heatmaps: A tensor of shape (N, K) containing an integer spatial label
+            in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
+        valid: A tensor of shape (N, K) containing whether each keypoint is in
+            the roi or not.
+    """
+
+    if rois.numel() == 0:
+        return rois.new().long(), rois.new().long()
+    offset_x = rois[:, 0]
+    offset_y = rois[:, 1]
+    scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
+    scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])
+
+    offset_x = offset_x[:, None]
+    offset_y = offset_y[:, None]
+    scale_x = scale_x[:, None]
+    scale_y = scale_y[:, None]
+
+    x = keypoints[..., 0]
+    y = keypoints[..., 1]
+
+    x_boundary_inds = x == rois[:, 2][:, None]
+    y_boundary_inds = y == rois[:, 3][:, None]
+
+    x = (x - offset_x) * scale_x
+    x = x.floor().long()
+    y = (y - offset_y) * scale_y
+    y = y.floor().long()
+
+    x[x_boundary_inds] = heatmap_size - 1
+    y[y_boundary_inds] = heatmap_size - 1
+
+    valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
+    vis = keypoints[..., 2] > 0
+    valid = (valid_loc & vis).long()
+
+    lin_ind = y * heatmap_size + x
+    heatmaps = lin_ind * valid
+
+    return heatmaps, valid
+
+
+@torch.jit.script_if_tracing
+def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
+    """
+    Extract predicted keypoint locations from heatmaps.
+
+    Args:
+        maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of logits for
+            each ROI and each keypoint.
+        rois (Tensor): (#ROIs, 4). The box of each ROI.
+
+    Returns:
+        Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to
+        (x, y, logit, score) for each keypoint.
+
+    When converting discrete pixel indices in an NxN image to a continuous keypoint coordinate,
+    we maintain consistency with :meth:`Keypoints.to_heatmap` by using the conversion from
+    Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
+    """
+
+    offset_x = rois[:, 0]
+    offset_y = rois[:, 1]
+
+    widths = (rois[:, 2] - rois[:, 0]).clamp(min=1)
+    heights = (rois[:, 3] - rois[:, 1]).clamp(min=1)
+    widths_ceil = widths.ceil()
+    heights_ceil = heights.ceil()
+
+    num_rois, num_keypoints = maps.shape[:2]
+    xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4)
+
+    width_corrections = widths / widths_ceil
+    height_corrections = heights / heights_ceil
+
+    keypoints_idx = torch.arange(num_keypoints, device=maps.device)
+
+    for i in range(num_rois):
+        outsize = (int(heights_ceil[i]), int(widths_ceil[i]))
+        roi_map = F.interpolate(maps[[i]], size=outsize, mode="bicubic", align_corners=False)
+
+        # Although semantically equivalent, `reshape` is used instead of `squeeze` due
+        # to limitation during ONNX export of `squeeze` in scripting mode
+        roi_map = roi_map.reshape(roi_map.shape[1:])  # keypoints x H x W
+
+        # softmax over the spatial region
+        max_score, _ = roi_map.view(num_keypoints, -1).max(1)
+        max_score = max_score.view(num_keypoints, 1, 1)
+        tmp_full_resolution = (roi_map - max_score).exp_()
+        tmp_pool_resolution = (maps[i] - max_score).exp_()
+        # Produce scores over the region H x W, but normalize with POOL_H x POOL_W,
+        # so that the scores of objects of different absolute sizes will be more comparable
+        roi_map_scores = tmp_full_resolution / tmp_pool_resolution.sum((1, 2), keepdim=True)
+
+        w = roi_map.shape[2]
+        pos = roi_map.view(num_keypoints, -1).argmax(1)
+
+        x_int = pos % w
+        y_int = (pos - x_int) // w
+
+        assert (
+            roi_map_scores[keypoints_idx, y_int, x_int]
+            == roi_map_scores.view(num_keypoints, -1).max(1)[0]
+        ).all()
+
+        x = (x_int.float() + 0.5) * width_corrections[i]
+        y = (y_int.float() + 0.5) * height_corrections[i]
+
+        xy_preds[i, :, 0] = x + offset_x[i]
+        xy_preds[i, :, 1] = y + offset_y[i]
+        xy_preds[i, :, 2] = roi_map[keypoints_idx, y_int, x_int]
+        xy_preds[i, :, 3] = roi_map_scores[keypoints_idx, y_int, x_int]
+
+    return xy_preds
diff --git a/detectron2/structures/masks.py b/detectron2/structures/masks.py
new file mode 100644
index 0000000000000000000000000000000000000000..899ad8b6ce1557ccc38da58d31814c3ddb9cb737
--- /dev/null
+++ b/detectron2/structures/masks.py
@@ -0,0 +1,534 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import copy
+import itertools
+import numpy as np
+from typing import Any, Iterator, List, Union
+import pycocotools.mask as mask_util
+import torch
+from torch import device
+
+from detectron2.layers.roi_align import ROIAlign
+from detectron2.utils.memory import retry_if_cuda_oom
+
+from .boxes import Boxes
+
+
+def polygon_area(x, y):
+    # Using the shoelace formula
+    # https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
+    return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
+
+
+def polygons_to_bitmask(polygons: List[np.ndarray], height: int, width: int) -> np.ndarray:
+    """
+    Args:
+        polygons (list[ndarray]): each array has shape (Nx2,)
+        height, width (int)
+
+    Returns:
+        ndarray: a bool mask of shape (height, width)
+    """
+    if len(polygons) == 0:
+        # COCOAPI does not support empty polygons
+        return np.zeros((height, width)).astype(bool)
+    rles = mask_util.frPyObjects(polygons, height, width)
+    rle = mask_util.merge(rles)
+    return mask_util.decode(rle).astype(bool)
+
+
+def rasterize_polygons_within_box(
+    polygons: List[np.ndarray], box: np.ndarray, mask_size: int
+) -> torch.Tensor:
+    """
+    Rasterize the polygons into a mask image and
+    crop the mask content in the given box.
+    The cropped mask is resized to (mask_size, mask_size).
+
+    This function is used when generating training targets for mask head in Mask R-CNN.
+    Given original ground-truth masks for an image, new ground-truth mask
+    training targets in the size of `mask_size x mask_size`
+    must be provided for each predicted box. This function will be called to
+    produce such targets.
+
+    Args:
+        polygons (list[ndarray[float]]): a list of polygons, which represents an instance.
+        box: 4-element numpy array
+        mask_size (int):
+
+    Returns:
+        Tensor: BoolTensor of shape (mask_size, mask_size)
+    """
+    # 1. Shift the polygons w.r.t the boxes
+    w, h = box[2] - box[0], box[3] - box[1]
+
+    polygons = copy.deepcopy(polygons)
+    for p in polygons:
+        p[0::2] = p[0::2] - box[0]
+        p[1::2] = p[1::2] - box[1]
+
+    # 2. Rescale the polygons to the new box size
+    # max() to avoid division by small number
+    ratio_h = mask_size / max(h, 0.1)
+    ratio_w = mask_size / max(w, 0.1)
+
+    if ratio_h == ratio_w:
+        for p in polygons:
+            p *= ratio_h
+    else:
+        for p in polygons:
+            p[0::2] *= ratio_w
+            p[1::2] *= ratio_h
+
+    # 3. Rasterize the polygons with coco api
+    mask = polygons_to_bitmask(polygons, mask_size, mask_size)
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+class BitMasks:
+    """
+    This class stores the segmentation masks for all objects in one image, in
+    the form of bitmaps.
+
+    Attributes:
+        tensor: bool Tensor of N,H,W, representing N instances in the image.
+    """
+
+    def __init__(self, tensor: Union[torch.Tensor, np.ndarray]):
+        """
+        Args:
+            tensor: bool Tensor of N,H,W, representing N instances in the image.
+        """
+        if isinstance(tensor, torch.Tensor):
+            tensor = tensor.to(torch.bool)
+        else:
+            tensor = torch.as_tensor(tensor, dtype=torch.bool, device=torch.device("cpu"))
+        assert tensor.dim() == 3, tensor.size()
+        self.image_size = tensor.shape[1:]
+        self.tensor = tensor
+
+    @torch.jit.unused
+    def to(self, *args: Any, **kwargs: Any) -> "BitMasks":
+        return BitMasks(self.tensor.to(*args, **kwargs))
+
+    @property
+    def device(self) -> torch.device:
+        return self.tensor.device
+
+    @torch.jit.unused
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "BitMasks":
+        """
+        Returns:
+            BitMasks: Create a new :class:`BitMasks` by indexing.
+
+        The following usage are allowed:
+
+        1. `new_masks = masks[3]`: return a `BitMasks` which contains only one mask.
+        2. `new_masks = masks[2:10]`: return a slice of masks.
+        3. `new_masks = masks[vector]`, where vector is a torch.BoolTensor
+           with `length = len(masks)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned object might share storage with this object,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return BitMasks(self.tensor[item].unsqueeze(0))
+        m = self.tensor[item]
+        assert m.dim() == 3, "Indexing on BitMasks with {} returns a tensor with shape {}!".format(
+            item, m.shape
+        )
+        return BitMasks(m)
+
+    @torch.jit.unused
+    def __iter__(self) -> torch.Tensor:
+        yield from self.tensor
+
+    @torch.jit.unused
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={})".format(len(self.tensor))
+        return s
+
+    def __len__(self) -> int:
+        return self.tensor.shape[0]
+
+    def nonempty(self) -> torch.Tensor:
+        """
+        Find masks that are non-empty.
+
+        Returns:
+            Tensor: a BoolTensor which represents
+                whether each mask is empty (False) or non-empty (True).
+        """
+        return self.tensor.flatten(1).any(dim=1)
+
+    @staticmethod
+    def from_polygon_masks(
+        polygon_masks: Union["PolygonMasks", List[List[np.ndarray]]], height: int, width: int
+    ) -> "BitMasks":
+        """
+        Args:
+            polygon_masks (list[list[ndarray]] or PolygonMasks)
+            height, width (int)
+        """
+        if isinstance(polygon_masks, PolygonMasks):
+            polygon_masks = polygon_masks.polygons
+        masks = [polygons_to_bitmask(p, height, width) for p in polygon_masks]
+        if len(masks):
+            return BitMasks(torch.stack([torch.from_numpy(x) for x in masks]))
+        else:
+            return BitMasks(torch.empty(0, height, width, dtype=torch.bool))
+
+    @staticmethod
+    def from_roi_masks(roi_masks: "ROIMasks", height: int, width: int) -> "BitMasks":
+        """
+        Args:
+            roi_masks:
+            height, width (int):
+        """
+        return roi_masks.to_bitmasks(height, width)
+
+    def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
+        """
+        Crop each bitmask by the given box, and resize results to (mask_size, mask_size).
+        This can be used to prepare training targets for Mask R-CNN.
+        It has less reconstruction error compared to rasterization with polygons.
+        However we observe no difference in accuracy,
+        but BitMasks requires more memory to store all the masks.
+
+        Args:
+            boxes (Tensor): Nx4 tensor storing the boxes for each mask
+            mask_size (int): the size of the rasterized mask.
+
+        Returns:
+            Tensor:
+                A bool tensor of shape (N, mask_size, mask_size), where
+                N is the number of predicted boxes for this image.
+        """
+        assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
+        device = self.tensor.device
+
+        batch_inds = torch.arange(len(boxes), device=device).to(dtype=boxes.dtype)[:, None]
+        rois = torch.cat([batch_inds, boxes], dim=1)  # Nx5
+
+        bit_masks = self.tensor.to(dtype=torch.float32)
+        rois = rois.to(device=device)
+        output = (
+            ROIAlign((mask_size, mask_size), 1.0, 0, aligned=True)
+            .forward(bit_masks[:, None, :, :], rois)
+            .squeeze(1)
+        )
+        output = output >= 0.5
+        return output
+
+    def get_bounding_boxes(self) -> Boxes:
+        """
+        Returns:
+            Boxes: tight bounding boxes around bitmasks.
+            If a mask is empty, it's bounding box will be all zero.
+        """
+        boxes = torch.zeros(self.tensor.shape[0], 4, dtype=torch.float32)
+        x_any = torch.any(self.tensor, dim=1)
+        y_any = torch.any(self.tensor, dim=2)
+        for idx in range(self.tensor.shape[0]):
+            x = torch.where(x_any[idx, :])[0]
+            y = torch.where(y_any[idx, :])[0]
+            if len(x) > 0 and len(y) > 0:
+                boxes[idx, :] = torch.as_tensor(
+                    [x[0], y[0], x[-1] + 1, y[-1] + 1], dtype=torch.float32
+                )
+        return Boxes(boxes)
+
+    @staticmethod
+    def cat(bitmasks_list: List["BitMasks"]) -> "BitMasks":
+        """
+        Concatenates a list of BitMasks into a single BitMasks
+
+        Arguments:
+            bitmasks_list (list[BitMasks])
+
+        Returns:
+            BitMasks: the concatenated BitMasks
+        """
+        assert isinstance(bitmasks_list, (list, tuple))
+        assert len(bitmasks_list) > 0
+        assert all(isinstance(bitmask, BitMasks) for bitmask in bitmasks_list)
+
+        cat_bitmasks = type(bitmasks_list[0])(torch.cat([bm.tensor for bm in bitmasks_list], dim=0))
+        return cat_bitmasks
+
+
+class PolygonMasks:
+    """
+    This class stores the segmentation masks for all objects in one image, in the form of polygons.
+
+    Attributes:
+        polygons: list[list[ndarray]]. Each ndarray is a float64 vector representing a polygon.
+    """
+
+    def __init__(self, polygons: List[List[Union[torch.Tensor, np.ndarray]]]):
+        """
+        Arguments:
+            polygons (list[list[np.ndarray]]): The first
+                level of the list correspond to individual instances,
+                the second level to all the polygons that compose the
+                instance, and the third level to the polygon coordinates.
+                The third level array should have the format of
+                [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
+        """
+        if not isinstance(polygons, list):
+            raise ValueError(
+                "Cannot create PolygonMasks: Expect a list of list of polygons per image. "
+                "Got '{}' instead.".format(type(polygons))
+            )
+
+        def _make_array(t: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
+            # Use float64 for higher precision, because why not?
+            # Always put polygons on CPU (self.to is a no-op) since they
+            # are supposed to be small tensors.
+            # May need to change this assumption if GPU placement becomes useful
+            if isinstance(t, torch.Tensor):
+                t = t.cpu().numpy()
+            return np.asarray(t).astype("float64")
+
+        def process_polygons(
+            polygons_per_instance: List[Union[torch.Tensor, np.ndarray]]
+        ) -> List[np.ndarray]:
+            if not isinstance(polygons_per_instance, list):
+                raise ValueError(
+                    "Cannot create polygons: Expect a list of polygons per instance. "
+                    "Got '{}' instead.".format(type(polygons_per_instance))
+                )
+            # transform each polygon to a numpy array
+            polygons_per_instance = [_make_array(p) for p in polygons_per_instance]
+            for polygon in polygons_per_instance:
+                if len(polygon) % 2 != 0 or len(polygon) < 6:
+                    raise ValueError(f"Cannot create a polygon from {len(polygon)} coordinates.")
+            return polygons_per_instance
+
+        self.polygons: List[List[np.ndarray]] = [
+            process_polygons(polygons_per_instance) for polygons_per_instance in polygons
+        ]
+
+    def to(self, *args: Any, **kwargs: Any) -> "PolygonMasks":
+        return self
+
+    @property
+    def device(self) -> torch.device:
+        return torch.device("cpu")
+
+    def get_bounding_boxes(self) -> Boxes:
+        """
+        Returns:
+            Boxes: tight bounding boxes around polygon masks.
+        """
+        boxes = torch.zeros(len(self.polygons), 4, dtype=torch.float32)
+        for idx, polygons_per_instance in enumerate(self.polygons):
+            minxy = torch.as_tensor([float("inf"), float("inf")], dtype=torch.float32)
+            maxxy = torch.zeros(2, dtype=torch.float32)
+            for polygon in polygons_per_instance:
+                coords = torch.from_numpy(polygon).view(-1, 2).to(dtype=torch.float32)
+                minxy = torch.min(minxy, torch.min(coords, dim=0).values)
+                maxxy = torch.max(maxxy, torch.max(coords, dim=0).values)
+            boxes[idx, :2] = minxy
+            boxes[idx, 2:] = maxxy
+        return Boxes(boxes)
+
+    def nonempty(self) -> torch.Tensor:
+        """
+        Find masks that are non-empty.
+
+        Returns:
+            Tensor:
+                a BoolTensor which represents whether each mask is empty (False) or not (True).
+        """
+        keep = [1 if len(polygon) > 0 else 0 for polygon in self.polygons]
+        return torch.from_numpy(np.asarray(keep, dtype=bool))
+
+    def __getitem__(self, item: Union[int, slice, List[int], torch.BoolTensor]) -> "PolygonMasks":
+        """
+        Support indexing over the instances and return a `PolygonMasks` object.
+        `item` can be:
+
+        1. An integer. It will return an object with only one instance.
+        2. A slice. It will return an object with the selected instances.
+        3. A list[int]. It will return an object with the selected instances,
+           correpsonding to the indices in the list.
+        4. A vector mask of type BoolTensor, whose length is num_instances.
+           It will return an object with the instances whose mask is nonzero.
+        """
+        if isinstance(item, int):
+            selected_polygons = [self.polygons[item]]
+        elif isinstance(item, slice):
+            selected_polygons = self.polygons[item]
+        elif isinstance(item, list):
+            selected_polygons = [self.polygons[i] for i in item]
+        elif isinstance(item, torch.Tensor):
+            # Polygons is a list, so we have to move the indices back to CPU.
+            if item.dtype == torch.bool:
+                assert item.dim() == 1, item.shape
+                item = item.nonzero().squeeze(1).cpu().numpy().tolist()
+            elif item.dtype in [torch.int32, torch.int64]:
+                item = item.cpu().numpy().tolist()
+            else:
+                raise ValueError("Unsupported tensor dtype={} for indexing!".format(item.dtype))
+            selected_polygons = [self.polygons[i] for i in item]
+        return PolygonMasks(selected_polygons)
+
+    def __iter__(self) -> Iterator[List[np.ndarray]]:
+        """
+        Yields:
+            list[ndarray]: the polygons for one instance.
+            Each Tensor is a float64 vector representing a polygon.
+        """
+        return iter(self.polygons)
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={})".format(len(self.polygons))
+        return s
+
+    def __len__(self) -> int:
+        return len(self.polygons)
+
+    def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
+        """
+        Crop each mask by the given box, and resize results to (mask_size, mask_size).
+        This can be used to prepare training targets for Mask R-CNN.
+
+        Args:
+            boxes (Tensor): Nx4 tensor storing the boxes for each mask
+            mask_size (int): the size of the rasterized mask.
+
+        Returns:
+            Tensor: A bool tensor of shape (N, mask_size, mask_size), where
+            N is the number of predicted boxes for this image.
+        """
+        assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
+
+        device = boxes.device
+        # Put boxes on the CPU, as the polygon representation is not efficient GPU-wise
+        # (several small tensors for representing a single instance mask)
+        boxes = boxes.to(torch.device("cpu"))
+
+        results = [
+            rasterize_polygons_within_box(poly, box.numpy(), mask_size)
+            for poly, box in zip(self.polygons, boxes)
+        ]
+        """
+        poly: list[list[float]], the polygons for one instance
+        box: a tensor of shape (4,)
+        """
+        if len(results) == 0:
+            return torch.empty(0, mask_size, mask_size, dtype=torch.bool, device=device)
+        return torch.stack(results, dim=0).to(device=device)
+
+    def area(self):
+        """
+        Computes area of the mask.
+        Only works with Polygons, using the shoelace formula:
+        https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
+
+        Returns:
+            Tensor: a vector, area for each instance
+        """
+
+        area = []
+        for polygons_per_instance in self.polygons:
+            area_per_instance = 0
+            for p in polygons_per_instance:
+                area_per_instance += polygon_area(p[0::2], p[1::2])
+            area.append(area_per_instance)
+
+        return torch.tensor(area)
+
+    @staticmethod
+    def cat(polymasks_list: List["PolygonMasks"]) -> "PolygonMasks":
+        """
+        Concatenates a list of PolygonMasks into a single PolygonMasks
+
+        Arguments:
+            polymasks_list (list[PolygonMasks])
+
+        Returns:
+            PolygonMasks: the concatenated PolygonMasks
+        """
+        assert isinstance(polymasks_list, (list, tuple))
+        assert len(polymasks_list) > 0
+        assert all(isinstance(polymask, PolygonMasks) for polymask in polymasks_list)
+
+        cat_polymasks = type(polymasks_list[0])(
+            list(itertools.chain.from_iterable(pm.polygons for pm in polymasks_list))
+        )
+        return cat_polymasks
+
+
+class ROIMasks:
+    """
+    Represent masks by N smaller masks defined in some ROIs. Once ROI boxes are given,
+    full-image bitmask can be obtained by "pasting" the mask on the region defined
+    by the corresponding ROI box.
+    """
+
+    def __init__(self, tensor: torch.Tensor):
+        """
+        Args:
+            tensor: (N, M, M) mask tensor that defines the mask within each ROI.
+        """
+        if tensor.dim() != 3:
+            raise ValueError("ROIMasks must take a masks of 3 dimension.")
+        self.tensor = tensor
+
+    def to(self, device: torch.device) -> "ROIMasks":
+        return ROIMasks(self.tensor.to(device))
+
+    @property
+    def device(self) -> device:
+        return self.tensor.device
+
+    def __len__(self):
+        return self.tensor.shape[0]
+
+    def __getitem__(self, item) -> "ROIMasks":
+        """
+        Returns:
+            ROIMasks: Create a new :class:`ROIMasks` by indexing.
+
+        The following usage are allowed:
+
+        1. `new_masks = masks[2:10]`: return a slice of masks.
+        2. `new_masks = masks[vector]`, where vector is a torch.BoolTensor
+           with `length = len(masks)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned object might share storage with this object,
+        subject to Pytorch's indexing semantics.
+        """
+        t = self.tensor[item]
+        if t.dim() != 3:
+            raise ValueError(
+                f"Indexing on ROIMasks with {item} returns a tensor with shape {t.shape}!"
+            )
+        return ROIMasks(t)
+
+    @torch.jit.unused
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={})".format(len(self.tensor))
+        return s
+
+    @torch.jit.unused
+    def to_bitmasks(self, boxes: torch.Tensor, height, width, threshold=0.5):
+        """
+        Args: see documentation of :func:`paste_masks_in_image`.
+        """
+        from detectron2.layers.mask_ops import paste_masks_in_image, _paste_masks_tensor_shape
+
+        if torch.jit.is_tracing():
+            if isinstance(height, torch.Tensor):
+                paste_func = _paste_masks_tensor_shape
+            else:
+                paste_func = paste_masks_in_image
+        else:
+            paste_func = retry_if_cuda_oom(paste_masks_in_image)
+        bitmasks = paste_func(self.tensor, boxes.tensor, (height, width), threshold=threshold)
+        return BitMasks(bitmasks)
diff --git a/detectron2/structures/rotated_boxes.py b/detectron2/structures/rotated_boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..c842b999db62e5c8898aca32dc85778609a4da1d
--- /dev/null
+++ b/detectron2/structures/rotated_boxes.py
@@ -0,0 +1,505 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import math
+from typing import List, Tuple
+import torch
+
+from detectron2.layers.rotated_boxes import pairwise_iou_rotated
+
+from .boxes import Boxes
+
+
+class RotatedBoxes(Boxes):
+    """
+    This structure stores a list of rotated boxes as a Nx5 torch.Tensor.
+    It supports some common methods about boxes
+    (`area`, `clip`, `nonempty`, etc),
+    and also behaves like a Tensor
+    (support indexing, `to(device)`, `.device`, and iteration over all boxes)
+    """
+
+    def __init__(self, tensor: torch.Tensor):
+        """
+        Args:
+            tensor (Tensor[float]): a Nx5 matrix.  Each row is
+                (x_center, y_center, width, height, angle),
+                in which angle is represented in degrees.
+                While there's no strict range restriction for it,
+                the recommended principal range is between [-180, 180) degrees.
+
+        Assume we have a horizontal box B = (x_center, y_center, width, height),
+        where width is along the x-axis and height is along the y-axis.
+        The rotated box B_rot (x_center, y_center, width, height, angle)
+        can be seen as:
+
+        1. When angle == 0:
+           B_rot == B
+        2. When angle > 0:
+           B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CCW;
+        3. When angle < 0:
+           B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CW.
+
+        Mathematically, since the right-handed coordinate system for image space
+        is (y, x), where y is top->down and x is left->right, the 4 vertices of the
+        rotated rectangle :math:`(yr_i, xr_i)` (i = 1, 2, 3, 4) can be obtained from
+        the vertices of the horizontal rectangle :math:`(y_i, x_i)` (i = 1, 2, 3, 4)
+        in the following way (:math:`\\theta = angle*\\pi/180` is the angle in radians,
+        :math:`(y_c, x_c)` is the center of the rectangle):
+
+        .. math::
+
+            yr_i = \\cos(\\theta) (y_i - y_c) - \\sin(\\theta) (x_i - x_c) + y_c,
+
+            xr_i = \\sin(\\theta) (y_i - y_c) + \\cos(\\theta) (x_i - x_c) + x_c,
+
+        which is the standard rigid-body rotation transformation.
+
+        Intuitively, the angle is
+        (1) the rotation angle from y-axis in image space
+        to the height vector (top->down in the box's local coordinate system)
+        of the box in CCW, and
+        (2) the rotation angle from x-axis in image space
+        to the width vector (left->right in the box's local coordinate system)
+        of the box in CCW.
+
+        More intuitively, consider the following horizontal box ABCD represented
+        in (x1, y1, x2, y2): (3, 2, 7, 4),
+        covering the [3, 7] x [2, 4] region of the continuous coordinate system
+        which looks like this:
+
+        .. code:: none
+
+            O--------> x
+            |
+            |  A---B
+            |  |   |
+            |  D---C
+            |
+            v y
+
+        Note that each capital letter represents one 0-dimensional geometric point
+        instead of a 'square pixel' here.
+
+        In the example above, using (x, y) to represent a point we have:
+
+        .. math::
+
+            O = (0, 0), A = (3, 2), B = (7, 2), C = (7, 4), D = (3, 4)
+
+        We name vector AB = vector DC as the width vector in box's local coordinate system, and
+        vector AD = vector BC as the height vector in box's local coordinate system. Initially,
+        when angle = 0 degree, they're aligned with the positive directions of x-axis and y-axis
+        in the image space, respectively.
+
+        For better illustration, we denote the center of the box as E,
+
+        .. code:: none
+
+            O--------> x
+            |
+            |  A---B
+            |  | E |
+            |  D---C
+            |
+            v y
+
+        where the center E = ((3+7)/2, (2+4)/2) = (5, 3).
+
+        Also,
+
+        .. math::
+
+            width = |AB| = |CD| = 7 - 3 = 4,
+            height = |AD| = |BC| = 4 - 2 = 2.
+
+        Therefore, the corresponding representation for the same shape in rotated box in
+        (x_center, y_center, width, height, angle) format is:
+
+        (5, 3, 4, 2, 0),
+
+        Now, let's consider (5, 3, 4, 2, 90), which is rotated by 90 degrees
+        CCW (counter-clockwise) by definition. It looks like this:
+
+        .. code:: none
+
+            O--------> x
+            |   B-C
+            |   | |
+            |   |E|
+            |   | |
+            |   A-D
+            v y
+
+        The center E is still located at the same point (5, 3), while the vertices
+        ABCD are rotated by 90 degrees CCW with regard to E:
+        A = (4, 5), B = (4, 1), C = (6, 1), D = (6, 5)
+
+        Here, 90 degrees can be seen as the CCW angle to rotate from y-axis to
+        vector AD or vector BC (the top->down height vector in box's local coordinate system),
+        or the CCW angle to rotate from x-axis to vector AB or vector DC (the left->right
+        width vector in box's local coordinate system).
+
+        .. math::
+
+            width = |AB| = |CD| = 5 - 1 = 4,
+            height = |AD| = |BC| = 6 - 4 = 2.
+
+        Next, how about (5, 3, 4, 2, -90), which is rotated by 90 degrees CW (clockwise)
+        by definition? It looks like this:
+
+        .. code:: none
+
+            O--------> x
+            |   D-A
+            |   | |
+            |   |E|
+            |   | |
+            |   C-B
+            v y
+
+        The center E is still located at the same point (5, 3), while the vertices
+        ABCD are rotated by 90 degrees CW with regard to E:
+        A = (6, 1), B = (6, 5), C = (4, 5), D = (4, 1)
+
+        .. math::
+
+            width = |AB| = |CD| = 5 - 1 = 4,
+            height = |AD| = |BC| = 6 - 4 = 2.
+
+        This covers exactly the same region as (5, 3, 4, 2, 90) does, and their IoU
+        will be 1. However, these two will generate different RoI Pooling results and
+        should not be treated as an identical box.
+
+        On the other hand, it's easy to see that (X, Y, W, H, A) is identical to
+        (X, Y, W, H, A+360N), for any integer N. For example (5, 3, 4, 2, 270) would be
+        identical to (5, 3, 4, 2, -90), because rotating the shape 270 degrees CCW is
+        equivalent to rotating the same shape 90 degrees CW.
+
+        We could rotate further to get (5, 3, 4, 2, 180), or (5, 3, 4, 2, -180):
+
+        .. code:: none
+
+            O--------> x
+            |
+            |  C---D
+            |  | E |
+            |  B---A
+            |
+            v y
+
+        .. math::
+
+            A = (7, 4), B = (3, 4), C = (3, 2), D = (7, 2),
+
+            width = |AB| = |CD| = 7 - 3 = 4,
+            height = |AD| = |BC| = 4 - 2 = 2.
+
+        Finally, this is a very inaccurate (heavily quantized) illustration of
+        how (5, 3, 4, 2, 60) looks like in case anyone wonders:
+
+        .. code:: none
+
+            O--------> x
+            |     B\
+            |    /  C
+            |   /E /
+            |  A  /
+            |   `D
+            v y
+
+        It's still a rectangle with center of (5, 3), width of 4 and height of 2,
+        but its angle (and thus orientation) is somewhere between
+        (5, 3, 4, 2, 0) and (5, 3, 4, 2, 90).
+        """
+        device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
+        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+        if tensor.numel() == 0:
+            # Use reshape, so we don't end up creating a new tensor that does not depend on
+            # the inputs (and consequently confuses jit)
+            tensor = tensor.reshape((0, 5)).to(dtype=torch.float32, device=device)
+        assert tensor.dim() == 2 and tensor.size(-1) == 5, tensor.size()
+
+        self.tensor = tensor
+
+    def clone(self) -> "RotatedBoxes":
+        """
+        Clone the RotatedBoxes.
+
+        Returns:
+            RotatedBoxes
+        """
+        return RotatedBoxes(self.tensor.clone())
+
+    def to(self, device: torch.device):
+        # Boxes are assumed float32 and does not support to(dtype)
+        return RotatedBoxes(self.tensor.to(device=device))
+
+    def area(self) -> torch.Tensor:
+        """
+        Computes the area of all the boxes.
+
+        Returns:
+            torch.Tensor: a vector with areas of each box.
+        """
+        box = self.tensor
+        area = box[:, 2] * box[:, 3]
+        return area
+
+    # Avoid in-place operations so that we can torchscript; NOTE: this creates a new tensor
+    def normalize_angles(self) -> None:
+        """
+        Restrict angles to the range of [-180, 180) degrees
+        """
+        angle_tensor = (self.tensor[:, 4] + 180.0) % 360.0 - 180.0
+        self.tensor = torch.cat((self.tensor[:, :4], angle_tensor[:, None]), dim=1)
+
+    def clip(self, box_size: Tuple[int, int], clip_angle_threshold: float = 1.0) -> None:
+        """
+        Clip (in place) the boxes by limiting x coordinates to the range [0, width]
+        and y coordinates to the range [0, height].
+
+        For RRPN:
+        Only clip boxes that are almost horizontal with a tolerance of
+        clip_angle_threshold to maintain backward compatibility.
+
+        Rotated boxes beyond this threshold are not clipped for two reasons:
+
+        1. There are potentially multiple ways to clip a rotated box to make it
+           fit within the image.
+        2. It's tricky to make the entire rectangular box fit within the image
+           and still be able to not leave out pixels of interest.
+
+        Therefore we rely on ops like RoIAlignRotated to safely handle this.
+
+        Args:
+            box_size (height, width): The clipping box's size.
+            clip_angle_threshold:
+                Iff. abs(normalized(angle)) <= clip_angle_threshold (in degrees),
+                we do the clipping as horizontal boxes.
+        """
+        h, w = box_size
+
+        # normalize angles to be within (-180, 180] degrees
+        self.normalize_angles()
+
+        idx = torch.where(torch.abs(self.tensor[:, 4]) <= clip_angle_threshold)[0]
+
+        # convert to (x1, y1, x2, y2)
+        x1 = self.tensor[idx, 0] - self.tensor[idx, 2] / 2.0
+        y1 = self.tensor[idx, 1] - self.tensor[idx, 3] / 2.0
+        x2 = self.tensor[idx, 0] + self.tensor[idx, 2] / 2.0
+        y2 = self.tensor[idx, 1] + self.tensor[idx, 3] / 2.0
+
+        # clip
+        x1.clamp_(min=0, max=w)
+        y1.clamp_(min=0, max=h)
+        x2.clamp_(min=0, max=w)
+        y2.clamp_(min=0, max=h)
+
+        # convert back to (xc, yc, w, h)
+        self.tensor[idx, 0] = (x1 + x2) / 2.0
+        self.tensor[idx, 1] = (y1 + y2) / 2.0
+        # make sure widths and heights do not increase due to numerical errors
+        self.tensor[idx, 2] = torch.min(self.tensor[idx, 2], x2 - x1)
+        self.tensor[idx, 3] = torch.min(self.tensor[idx, 3], y2 - y1)
+
+    def nonempty(self, threshold: float = 0.0) -> torch.Tensor:
+        """
+        Find boxes that are non-empty.
+        A box is considered empty, if either of its side is no larger than threshold.
+
+        Returns:
+            Tensor: a binary vector which represents
+            whether each box is empty (False) or non-empty (True).
+        """
+        box = self.tensor
+        widths = box[:, 2]
+        heights = box[:, 3]
+        keep = (widths > threshold) & (heights > threshold)
+        return keep
+
+    def __getitem__(self, item) -> "RotatedBoxes":
+        """
+        Returns:
+            RotatedBoxes: Create a new :class:`RotatedBoxes` by indexing.
+
+        The following usage are allowed:
+
+        1. `new_boxes = boxes[3]`: return a `RotatedBoxes` which contains only one box.
+        2. `new_boxes = boxes[2:10]`: return a slice of boxes.
+        3. `new_boxes = boxes[vector]`, where vector is a torch.ByteTensor
+           with `length = len(boxes)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned RotatedBoxes might share storage with this RotatedBoxes,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return RotatedBoxes(self.tensor[item].view(1, -1))
+        b = self.tensor[item]
+        assert b.dim() == 2, "Indexing on RotatedBoxes with {} failed to return a matrix!".format(
+            item
+        )
+        return RotatedBoxes(b)
+
+    def __len__(self) -> int:
+        return self.tensor.shape[0]
+
+    def __repr__(self) -> str:
+        return "RotatedBoxes(" + str(self.tensor) + ")"
+
+    def inside_box(self, box_size: Tuple[int, int], boundary_threshold: int = 0) -> torch.Tensor:
+        """
+        Args:
+            box_size (height, width): Size of the reference box covering
+                [0, width] x [0, height]
+            boundary_threshold (int): Boxes that extend beyond the reference box
+                boundary by more than boundary_threshold are considered "outside".
+
+        For RRPN, it might not be necessary to call this function since it's common
+        for rotated box to extend to outside of the image boundaries
+        (the clip function only clips the near-horizontal boxes)
+
+        Returns:
+            a binary vector, indicating whether each box is inside the reference box.
+        """
+        height, width = box_size
+
+        cnt_x = self.tensor[..., 0]
+        cnt_y = self.tensor[..., 1]
+        half_w = self.tensor[..., 2] / 2.0
+        half_h = self.tensor[..., 3] / 2.0
+        a = self.tensor[..., 4]
+        c = torch.abs(torch.cos(a * math.pi / 180.0))
+        s = torch.abs(torch.sin(a * math.pi / 180.0))
+        # This basically computes the horizontal bounding rectangle of the rotated box
+        max_rect_dx = c * half_w + s * half_h
+        max_rect_dy = c * half_h + s * half_w
+
+        inds_inside = (
+            (cnt_x - max_rect_dx >= -boundary_threshold)
+            & (cnt_y - max_rect_dy >= -boundary_threshold)
+            & (cnt_x + max_rect_dx < width + boundary_threshold)
+            & (cnt_y + max_rect_dy < height + boundary_threshold)
+        )
+
+        return inds_inside
+
+    def get_centers(self) -> torch.Tensor:
+        """
+        Returns:
+            The box centers in a Nx2 array of (x, y).
+        """
+        return self.tensor[:, :2]
+
+    def scale(self, scale_x: float, scale_y: float) -> None:
+        """
+        Scale the rotated box with horizontal and vertical scaling factors
+        Note: when scale_factor_x != scale_factor_y,
+        the rotated box does not preserve the rectangular shape when the angle
+        is not a multiple of 90 degrees under resize transformation.
+        Instead, the shape is a parallelogram (that has skew)
+        Here we make an approximation by fitting a rotated rectangle to the parallelogram.
+        """
+        self.tensor[:, 0] *= scale_x
+        self.tensor[:, 1] *= scale_y
+        theta = self.tensor[:, 4] * math.pi / 180.0
+        c = torch.cos(theta)
+        s = torch.sin(theta)
+
+        # In image space, y is top->down and x is left->right
+        # Consider the local coordintate system for the rotated box,
+        # where the box center is located at (0, 0), and the four vertices ABCD are
+        # A(-w / 2, -h / 2), B(w / 2, -h / 2), C(w / 2, h / 2), D(-w / 2, h / 2)
+        # the midpoint of the left edge AD of the rotated box E is:
+        # E = (A+D)/2 = (-w / 2, 0)
+        # the midpoint of the top edge AB of the rotated box F is:
+        # F(0, -h / 2)
+        # To get the old coordinates in the global system, apply the rotation transformation
+        # (Note: the right-handed coordinate system for image space is yOx):
+        # (old_x, old_y) = (s * y + c * x, c * y - s * x)
+        # E(old) = (s * 0 + c * (-w/2), c * 0 - s * (-w/2)) = (-c * w / 2, s * w / 2)
+        # F(old) = (s * (-h / 2) + c * 0, c * (-h / 2) - s * 0) = (-s * h / 2, -c * h / 2)
+        # After applying the scaling factor (sfx, sfy):
+        # E(new) = (-sfx * c * w / 2, sfy * s * w / 2)
+        # F(new) = (-sfx * s * h / 2, -sfy * c * h / 2)
+        # The new width after scaling tranformation becomes:
+
+        # w(new) = |E(new) - O| * 2
+        #        = sqrt[(sfx * c * w / 2)^2 + (sfy * s * w / 2)^2] * 2
+        #        = sqrt[(sfx * c)^2 + (sfy * s)^2] * w
+        # i.e., scale_factor_w = sqrt[(sfx * c)^2 + (sfy * s)^2]
+        #
+        # For example,
+        # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_w == scale_factor_x;
+        # when |angle| = 90, c = 0, |s| = 1, scale_factor_w == scale_factor_y
+        self.tensor[:, 2] *= torch.sqrt((scale_x * c) ** 2 + (scale_y * s) ** 2)
+
+        # h(new) = |F(new) - O| * 2
+        #        = sqrt[(sfx * s * h / 2)^2 + (sfy * c * h / 2)^2] * 2
+        #        = sqrt[(sfx * s)^2 + (sfy * c)^2] * h
+        # i.e., scale_factor_h = sqrt[(sfx * s)^2 + (sfy * c)^2]
+        #
+        # For example,
+        # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_h == scale_factor_y;
+        # when |angle| = 90, c = 0, |s| = 1, scale_factor_h == scale_factor_x
+        self.tensor[:, 3] *= torch.sqrt((scale_x * s) ** 2 + (scale_y * c) ** 2)
+
+        # The angle is the rotation angle from y-axis in image space to the height
+        # vector (top->down in the box's local coordinate system) of the box in CCW.
+        #
+        # angle(new) = angle_yOx(O - F(new))
+        #            = angle_yOx( (sfx * s * h / 2, sfy * c * h / 2) )
+        #            = atan2(sfx * s * h / 2, sfy * c * h / 2)
+        #            = atan2(sfx * s, sfy * c)
+        #
+        # For example,
+        # when sfx == sfy, angle(new) == atan2(s, c) == angle(old)
+        self.tensor[:, 4] = torch.atan2(scale_x * s, scale_y * c) * 180 / math.pi
+
+    @classmethod
+    def cat(cls, boxes_list: List["RotatedBoxes"]) -> "RotatedBoxes":
+        """
+        Concatenates a list of RotatedBoxes into a single RotatedBoxes
+
+        Arguments:
+            boxes_list (list[RotatedBoxes])
+
+        Returns:
+            RotatedBoxes: the concatenated RotatedBoxes
+        """
+        assert isinstance(boxes_list, (list, tuple))
+        if len(boxes_list) == 0:
+            return cls(torch.empty(0))
+        assert all([isinstance(box, RotatedBoxes) for box in boxes_list])
+
+        # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input
+        cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0))
+        return cat_boxes
+
+    @property
+    def device(self) -> torch.device:
+        return self.tensor.device
+
+    @torch.jit.unused
+    def __iter__(self):
+        """
+        Yield a box as a Tensor of shape (5,) at a time.
+        """
+        yield from self.tensor
+
+
+def pairwise_iou(boxes1: RotatedBoxes, boxes2: RotatedBoxes) -> None:
+    """
+    Given two lists of rotated boxes of size N and M,
+    compute the IoU (intersection over union)
+    between **all** N x M pairs of boxes.
+    The box order must be (x_center, y_center, width, height, angle).
+
+    Args:
+        boxes1, boxes2 (RotatedBoxes):
+            two `RotatedBoxes`. Contains N & M rotated boxes, respectively.
+
+    Returns:
+        Tensor: IoU, sized [N,M].
+    """
+
+    return pairwise_iou_rotated(boxes1.tensor, boxes2.tensor)
diff --git a/detectron2/tracking/__init__.py b/detectron2/tracking/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..21078ae822b04b71dbd8b056b5993d173eaf6bff
--- /dev/null
+++ b/detectron2/tracking/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .base_tracker import (  # noqa
+    BaseTracker,
+    build_tracker_head,
+    TRACKER_HEADS_REGISTRY,
+)
+from .bbox_iou_tracker import BBoxIOUTracker  # noqa
+from .hungarian_tracker import BaseHungarianTracker  # noqa
+from .iou_weighted_hungarian_bbox_iou_tracker import (  # noqa
+    IOUWeightedHungarianBBoxIOUTracker,
+)
+from .utils import create_prediction_pairs  # noqa
+from .vanilla_hungarian_bbox_iou_tracker import VanillaHungarianBBoxIOUTracker  # noqa
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/tracking/base_tracker.py b/detectron2/tracking/base_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2f20455c1841324292e9b9d8f42669c8ba61825
--- /dev/null
+++ b/detectron2/tracking/base_tracker.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# Copyright 2004-present Facebook. All Rights Reserved.
+from detectron2.config import configurable
+from detectron2.utils.registry import Registry
+
+from ..config.config import CfgNode as CfgNode_
+from ..structures import Instances
+
+TRACKER_HEADS_REGISTRY = Registry("TRACKER_HEADS")
+TRACKER_HEADS_REGISTRY.__doc__ = """
+Registry for tracking classes.
+"""
+
+
+class BaseTracker:
+    """
+    A parent class for all trackers
+    """
+
+    @configurable
+    def __init__(self, **kwargs):
+        self._prev_instances = None  # (D2)instances for previous frame
+        self._matched_idx = set()  # indices in prev_instances found matching
+        self._matched_ID = set()  # idendities in prev_instances found matching
+        self._untracked_prev_idx = set()  # indices in prev_instances not found matching
+        self._id_count = 0  # used to assign new id
+
+    @classmethod
+    def from_config(cls, cfg: CfgNode_):
+        raise NotImplementedError("Calling BaseTracker::from_config")
+
+    def update(self, predictions: Instances) -> Instances:
+        """
+        Args:
+            predictions: D2 Instances for predictions of the current frame
+        Return:
+            D2 Instances for predictions of the current frame with ID assigned
+
+        _prev_instances and instances will have the following fields:
+          .pred_boxes               (shape=[N, 4])
+          .scores                   (shape=[N,])
+          .pred_classes             (shape=[N,])
+          .pred_keypoints           (shape=[N, M, 3], Optional)
+          .pred_masks               (shape=List[2D_MASK], Optional)   2D_MASK: shape=[H, W]
+          .ID                       (shape=[N,])
+
+        N: # of detected bboxes
+        H and W: height and width of 2D mask
+        """
+        raise NotImplementedError("Calling BaseTracker::update")
+
+
+def build_tracker_head(cfg: CfgNode_) -> BaseTracker:
+    """
+    Build a tracker head from `cfg.TRACKER_HEADS.TRACKER_NAME`.
+
+    Args:
+        cfg: D2 CfgNode, config file with tracker information
+    Return:
+        tracker object
+    """
+    name = cfg.TRACKER_HEADS.TRACKER_NAME
+    tracker_class = TRACKER_HEADS_REGISTRY.get(name)
+    return tracker_class(cfg)
diff --git a/detectron2/tracking/bbox_iou_tracker.py b/detectron2/tracking/bbox_iou_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..598081cb542ce64dd1d100c0d3e12a59f57b8e0e
--- /dev/null
+++ b/detectron2/tracking/bbox_iou_tracker.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+# Copyright 2004-present Facebook. All Rights Reserved.
+import copy
+import numpy as np
+from typing import List
+import torch
+
+from detectron2.config import configurable
+from detectron2.structures import Boxes, Instances
+from detectron2.structures.boxes import pairwise_iou
+
+from ..config.config import CfgNode as CfgNode_
+from .base_tracker import TRACKER_HEADS_REGISTRY, BaseTracker
+
+
+@TRACKER_HEADS_REGISTRY.register()
+class BBoxIOUTracker(BaseTracker):
+    """
+    A bounding box tracker to assign ID based on IoU between current and previous instances
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        video_height: int,
+        video_width: int,
+        max_num_instances: int = 200,
+        max_lost_frame_count: int = 0,
+        min_box_rel_dim: float = 0.02,
+        min_instance_period: int = 1,
+        track_iou_threshold: float = 0.5,
+        **kwargs,
+    ):
+        """
+        Args:
+        video_height: height the video frame
+        video_width: width of the video frame
+        max_num_instances: maximum number of id allowed to be tracked
+        max_lost_frame_count: maximum number of frame an id can lost tracking
+                              exceed this number, an id is considered as lost
+                              forever
+        min_box_rel_dim: a percentage, smaller than this dimension, a bbox is
+                         removed from tracking
+        min_instance_period: an instance will be shown after this number of period
+                             since its first showing up in the video
+        track_iou_threshold: iou threshold, below this number a bbox pair is removed
+                             from tracking
+        """
+        super().__init__(**kwargs)
+        self._video_height = video_height
+        self._video_width = video_width
+        self._max_num_instances = max_num_instances
+        self._max_lost_frame_count = max_lost_frame_count
+        self._min_box_rel_dim = min_box_rel_dim
+        self._min_instance_period = min_instance_period
+        self._track_iou_threshold = track_iou_threshold
+
+    @classmethod
+    def from_config(cls, cfg: CfgNode_):
+        """
+        Old style initialization using CfgNode
+
+        Args:
+            cfg: D2 CfgNode, config file
+        Return:
+            dictionary storing arguments for __init__ method
+        """
+        assert "VIDEO_HEIGHT" in cfg.TRACKER_HEADS
+        assert "VIDEO_WIDTH" in cfg.TRACKER_HEADS
+        video_height = cfg.TRACKER_HEADS.get("VIDEO_HEIGHT")
+        video_width = cfg.TRACKER_HEADS.get("VIDEO_WIDTH")
+        max_num_instances = cfg.TRACKER_HEADS.get("MAX_NUM_INSTANCES", 200)
+        max_lost_frame_count = cfg.TRACKER_HEADS.get("MAX_LOST_FRAME_COUNT", 0)
+        min_box_rel_dim = cfg.TRACKER_HEADS.get("MIN_BOX_REL_DIM", 0.02)
+        min_instance_period = cfg.TRACKER_HEADS.get("MIN_INSTANCE_PERIOD", 1)
+        track_iou_threshold = cfg.TRACKER_HEADS.get("TRACK_IOU_THRESHOLD", 0.5)
+        return {
+            "_target_": "detectron2.tracking.bbox_iou_tracker.BBoxIOUTracker",
+            "video_height": video_height,
+            "video_width": video_width,
+            "max_num_instances": max_num_instances,
+            "max_lost_frame_count": max_lost_frame_count,
+            "min_box_rel_dim": min_box_rel_dim,
+            "min_instance_period": min_instance_period,
+            "track_iou_threshold": track_iou_threshold,
+        }
+
+    def update(self, instances: Instances) -> Instances:
+        """
+        See BaseTracker description
+        """
+        instances = self._initialize_extra_fields(instances)
+        if self._prev_instances is not None:
+            # calculate IoU of all bbox pairs
+            iou_all = pairwise_iou(
+                boxes1=instances.pred_boxes,
+                boxes2=self._prev_instances.pred_boxes,
+            )
+            # sort IoU in descending order
+            bbox_pairs = self._create_prediction_pairs(instances, iou_all)
+            # assign previous ID to current bbox if IoU > track_iou_threshold
+            self._reset_fields()
+            for bbox_pair in bbox_pairs:
+                idx = bbox_pair["idx"]
+                prev_id = bbox_pair["prev_id"]
+                if (
+                    idx in self._matched_idx
+                    or prev_id in self._matched_ID
+                    or bbox_pair["IoU"] < self._track_iou_threshold
+                ):
+                    continue
+                instances.ID[idx] = prev_id
+                instances.ID_period[idx] = bbox_pair["prev_period"] + 1
+                instances.lost_frame_count[idx] = 0
+                self._matched_idx.add(idx)
+                self._matched_ID.add(prev_id)
+                self._untracked_prev_idx.remove(bbox_pair["prev_idx"])
+            instances = self._assign_new_id(instances)
+            instances = self._merge_untracked_instances(instances)
+        self._prev_instances = copy.deepcopy(instances)
+        return instances
+
+    def _create_prediction_pairs(self, instances: Instances, iou_all: np.ndarray) -> List:
+        """
+        For all instances in previous and current frames, create pairs. For each
+        pair, store index of the instance in current frame predcitions, index in
+        previous predictions, ID in previous predictions, IoU of the bboxes in this
+        pair, period in previous predictions.
+
+        Args:
+            instances: D2 Instances, for predictions of the current frame
+            iou_all: IoU for all bboxes pairs
+        Return:
+            A list of IoU for all pairs
+        """
+        bbox_pairs = []
+        for i in range(len(instances)):
+            for j in range(len(self._prev_instances)):
+                bbox_pairs.append(
+                    {
+                        "idx": i,
+                        "prev_idx": j,
+                        "prev_id": self._prev_instances.ID[j],
+                        "IoU": iou_all[i, j],
+                        "prev_period": self._prev_instances.ID_period[j],
+                    }
+                )
+        return bbox_pairs
+
+    def _initialize_extra_fields(self, instances: Instances) -> Instances:
+        """
+        If input instances don't have ID, ID_period, lost_frame_count fields,
+        this method is used to initialize these fields.
+
+        Args:
+            instances: D2 Instances, for predictions of the current frame
+        Return:
+            D2 Instances with extra fields added
+        """
+        if not instances.has("ID"):
+            instances.set("ID", [None] * len(instances))
+        if not instances.has("ID_period"):
+            instances.set("ID_period", [None] * len(instances))
+        if not instances.has("lost_frame_count"):
+            instances.set("lost_frame_count", [None] * len(instances))
+        if self._prev_instances is None:
+            instances.ID = list(range(len(instances)))
+            self._id_count += len(instances)
+            instances.ID_period = [1] * len(instances)
+            instances.lost_frame_count = [0] * len(instances)
+        return instances
+
+    def _reset_fields(self):
+        """
+        Before each uodate call, reset fields first
+        """
+        self._matched_idx = set()
+        self._matched_ID = set()
+        self._untracked_prev_idx = set(range(len(self._prev_instances)))
+
+    def _assign_new_id(self, instances: Instances) -> Instances:
+        """
+        For each untracked instance, assign a new id
+
+        Args:
+            instances: D2 Instances, for predictions of the current frame
+        Return:
+            D2 Instances with new ID assigned
+        """
+        untracked_idx = set(range(len(instances))).difference(self._matched_idx)
+        for idx in untracked_idx:
+            instances.ID[idx] = self._id_count
+            self._id_count += 1
+            instances.ID_period[idx] = 1
+            instances.lost_frame_count[idx] = 0
+        return instances
+
+    def _merge_untracked_instances(self, instances: Instances) -> Instances:
+        """
+        For untracked previous instances, under certain condition, still keep them
+        in tracking and merge with the current instances.
+
+        Args:
+            instances: D2 Instances, for predictions of the current frame
+        Return:
+            D2 Instances merging current instances and instances from previous
+            frame decided to keep tracking
+        """
+        untracked_instances = Instances(
+            image_size=instances.image_size,
+            pred_boxes=[],
+            pred_classes=[],
+            scores=[],
+            ID=[],
+            ID_period=[],
+            lost_frame_count=[],
+        )
+        prev_bboxes = list(self._prev_instances.pred_boxes)
+        prev_classes = list(self._prev_instances.pred_classes)
+        prev_scores = list(self._prev_instances.scores)
+        prev_ID_period = self._prev_instances.ID_period
+        if instances.has("pred_masks"):
+            untracked_instances.set("pred_masks", [])
+            prev_masks = list(self._prev_instances.pred_masks)
+        if instances.has("pred_keypoints"):
+            untracked_instances.set("pred_keypoints", [])
+            prev_keypoints = list(self._prev_instances.pred_keypoints)
+        if instances.has("pred_keypoint_heatmaps"):
+            untracked_instances.set("pred_keypoint_heatmaps", [])
+            prev_keypoint_heatmaps = list(self._prev_instances.pred_keypoint_heatmaps)
+        for idx in self._untracked_prev_idx:
+            x_left, y_top, x_right, y_bot = prev_bboxes[idx]
+            if (
+                (1.0 * (x_right - x_left) / self._video_width < self._min_box_rel_dim)
+                or (1.0 * (y_bot - y_top) / self._video_height < self._min_box_rel_dim)
+                or self._prev_instances.lost_frame_count[idx] >= self._max_lost_frame_count
+                or prev_ID_period[idx] <= self._min_instance_period
+            ):
+                continue
+            untracked_instances.pred_boxes.append(list(prev_bboxes[idx].numpy()))
+            untracked_instances.pred_classes.append(int(prev_classes[idx]))
+            untracked_instances.scores.append(float(prev_scores[idx]))
+            untracked_instances.ID.append(self._prev_instances.ID[idx])
+            untracked_instances.ID_period.append(self._prev_instances.ID_period[idx])
+            untracked_instances.lost_frame_count.append(
+                self._prev_instances.lost_frame_count[idx] + 1
+            )
+            if instances.has("pred_masks"):
+                untracked_instances.pred_masks.append(prev_masks[idx].numpy().astype(np.uint8))
+            if instances.has("pred_keypoints"):
+                untracked_instances.pred_keypoints.append(
+                    prev_keypoints[idx].numpy().astype(np.uint8)
+                )
+            if instances.has("pred_keypoint_heatmaps"):
+                untracked_instances.pred_keypoint_heatmaps.append(
+                    prev_keypoint_heatmaps[idx].numpy().astype(np.float32)
+                )
+        untracked_instances.pred_boxes = Boxes(torch.FloatTensor(untracked_instances.pred_boxes))
+        untracked_instances.pred_classes = torch.IntTensor(untracked_instances.pred_classes)
+        untracked_instances.scores = torch.FloatTensor(untracked_instances.scores)
+        if instances.has("pred_masks"):
+            untracked_instances.pred_masks = torch.IntTensor(untracked_instances.pred_masks)
+        if instances.has("pred_keypoints"):
+            untracked_instances.pred_keypoints = torch.IntTensor(untracked_instances.pred_keypoints)
+        if instances.has("pred_keypoint_heatmaps"):
+            untracked_instances.pred_keypoint_heatmaps = torch.FloatTensor(
+                untracked_instances.pred_keypoint_heatmaps
+            )
+
+        return Instances.cat(
+            [
+                instances,
+                untracked_instances,
+            ]
+        )
diff --git a/detectron2/tracking/hungarian_tracker.py b/detectron2/tracking/hungarian_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b3ce884d80d9cdc2e0da07194693dd1bf16dd61
--- /dev/null
+++ b/detectron2/tracking/hungarian_tracker.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+# Copyright 2004-present Facebook. All Rights Reserved.
+import copy
+import numpy as np
+from typing import Dict
+import torch
+from scipy.optimize import linear_sum_assignment
+
+from detectron2.config import configurable
+from detectron2.structures import Boxes, Instances
+
+from ..config.config import CfgNode as CfgNode_
+from .base_tracker import BaseTracker
+
+
+class BaseHungarianTracker(BaseTracker):
+    """
+    A base class for all Hungarian trackers
+    """
+
+    @configurable
+    def __init__(
+        self,
+        video_height: int,
+        video_width: int,
+        max_num_instances: int = 200,
+        max_lost_frame_count: int = 0,
+        min_box_rel_dim: float = 0.02,
+        min_instance_period: int = 1,
+        **kwargs
+    ):
+        """
+        Args:
+        video_height: height the video frame
+        video_width: width of the video frame
+        max_num_instances: maximum number of id allowed to be tracked
+        max_lost_frame_count: maximum number of frame an id can lost tracking
+                              exceed this number, an id is considered as lost
+                              forever
+        min_box_rel_dim: a percentage, smaller than this dimension, a bbox is
+                         removed from tracking
+        min_instance_period: an instance will be shown after this number of period
+                             since its first showing up in the video
+        """
+        super().__init__(**kwargs)
+        self._video_height = video_height
+        self._video_width = video_width
+        self._max_num_instances = max_num_instances
+        self._max_lost_frame_count = max_lost_frame_count
+        self._min_box_rel_dim = min_box_rel_dim
+        self._min_instance_period = min_instance_period
+
+    @classmethod
+    def from_config(cls, cfg: CfgNode_) -> Dict:
+        raise NotImplementedError("Calling HungarianTracker::from_config")
+
+    def build_cost_matrix(self, instances: Instances, prev_instances: Instances) -> np.ndarray:
+        raise NotImplementedError("Calling HungarianTracker::build_matrix")
+
+    def update(self, instances: Instances) -> Instances:
+        if instances.has("pred_keypoints"):
+            raise NotImplementedError("Need to add support for keypoints")
+        instances = self._initialize_extra_fields(instances)
+        if self._prev_instances is not None:
+            self._untracked_prev_idx = set(range(len(self._prev_instances)))
+            cost_matrix = self.build_cost_matrix(instances, self._prev_instances)
+            matched_idx, matched_prev_idx = linear_sum_assignment(cost_matrix)
+            instances = self._process_matched_idx(instances, matched_idx, matched_prev_idx)
+            instances = self._process_unmatched_idx(instances, matched_idx)
+            instances = self._process_unmatched_prev_idx(instances, matched_prev_idx)
+        self._prev_instances = copy.deepcopy(instances)
+        return instances
+
+    def _initialize_extra_fields(self, instances: Instances) -> Instances:
+        """
+        If input instances don't have ID, ID_period, lost_frame_count fields,
+        this method is used to initialize these fields.
+
+        Args:
+            instances: D2 Instances, for predictions of the current frame
+        Return:
+            D2 Instances with extra fields added
+        """
+        if not instances.has("ID"):
+            instances.set("ID", [None] * len(instances))
+        if not instances.has("ID_period"):
+            instances.set("ID_period", [None] * len(instances))
+        if not instances.has("lost_frame_count"):
+            instances.set("lost_frame_count", [None] * len(instances))
+        if self._prev_instances is None:
+            instances.ID = list(range(len(instances)))
+            self._id_count += len(instances)
+            instances.ID_period = [1] * len(instances)
+            instances.lost_frame_count = [0] * len(instances)
+        return instances
+
+    def _process_matched_idx(
+        self, instances: Instances, matched_idx: np.ndarray, matched_prev_idx: np.ndarray
+    ) -> Instances:
+        assert matched_idx.size == matched_prev_idx.size
+        for i in range(matched_idx.size):
+            instances.ID[matched_idx[i]] = self._prev_instances.ID[matched_prev_idx[i]]
+            instances.ID_period[matched_idx[i]] = (
+                self._prev_instances.ID_period[matched_prev_idx[i]] + 1
+            )
+            instances.lost_frame_count[matched_idx[i]] = 0
+        return instances
+
+    def _process_unmatched_idx(self, instances: Instances, matched_idx: np.ndarray) -> Instances:
+        untracked_idx = set(range(len(instances))).difference(set(matched_idx))
+        for idx in untracked_idx:
+            instances.ID[idx] = self._id_count
+            self._id_count += 1
+            instances.ID_period[idx] = 1
+            instances.lost_frame_count[idx] = 0
+        return instances
+
+    def _process_unmatched_prev_idx(
+        self, instances: Instances, matched_prev_idx: np.ndarray
+    ) -> Instances:
+        untracked_instances = Instances(
+            image_size=instances.image_size,
+            pred_boxes=[],
+            pred_masks=[],
+            pred_classes=[],
+            scores=[],
+            ID=[],
+            ID_period=[],
+            lost_frame_count=[],
+        )
+        prev_bboxes = list(self._prev_instances.pred_boxes)
+        prev_classes = list(self._prev_instances.pred_classes)
+        prev_scores = list(self._prev_instances.scores)
+        prev_ID_period = self._prev_instances.ID_period
+        if instances.has("pred_masks"):
+            prev_masks = list(self._prev_instances.pred_masks)
+        untracked_prev_idx = set(range(len(self._prev_instances))).difference(set(matched_prev_idx))
+        for idx in untracked_prev_idx:
+            x_left, y_top, x_right, y_bot = prev_bboxes[idx]
+            if (
+                (1.0 * (x_right - x_left) / self._video_width < self._min_box_rel_dim)
+                or (1.0 * (y_bot - y_top) / self._video_height < self._min_box_rel_dim)
+                or self._prev_instances.lost_frame_count[idx] >= self._max_lost_frame_count
+                or prev_ID_period[idx] <= self._min_instance_period
+            ):
+                continue
+            untracked_instances.pred_boxes.append(list(prev_bboxes[idx].numpy()))
+            untracked_instances.pred_classes.append(int(prev_classes[idx]))
+            untracked_instances.scores.append(float(prev_scores[idx]))
+            untracked_instances.ID.append(self._prev_instances.ID[idx])
+            untracked_instances.ID_period.append(self._prev_instances.ID_period[idx])
+            untracked_instances.lost_frame_count.append(
+                self._prev_instances.lost_frame_count[idx] + 1
+            )
+            if instances.has("pred_masks"):
+                untracked_instances.pred_masks.append(prev_masks[idx].numpy().astype(np.uint8))
+
+        untracked_instances.pred_boxes = Boxes(torch.FloatTensor(untracked_instances.pred_boxes))
+        untracked_instances.pred_classes = torch.IntTensor(untracked_instances.pred_classes)
+        untracked_instances.scores = torch.FloatTensor(untracked_instances.scores)
+        if instances.has("pred_masks"):
+            untracked_instances.pred_masks = torch.IntTensor(untracked_instances.pred_masks)
+        else:
+            untracked_instances.remove("pred_masks")
+
+        return Instances.cat(
+            [
+                instances,
+                untracked_instances,
+            ]
+        )
diff --git a/detectron2/tracking/iou_weighted_hungarian_bbox_iou_tracker.py b/detectron2/tracking/iou_weighted_hungarian_bbox_iou_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3b4d1c5663fb49b2fc40752d6b7a42eddd58e75
--- /dev/null
+++ b/detectron2/tracking/iou_weighted_hungarian_bbox_iou_tracker.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# Copyright 2004-present Facebook. All Rights Reserved.
+
+import numpy as np
+from typing import List
+
+from detectron2.config import CfgNode as CfgNode_
+from detectron2.config import configurable
+
+from .base_tracker import TRACKER_HEADS_REGISTRY
+from .vanilla_hungarian_bbox_iou_tracker import VanillaHungarianBBoxIOUTracker
+
+
+@TRACKER_HEADS_REGISTRY.register()
+class IOUWeightedHungarianBBoxIOUTracker(VanillaHungarianBBoxIOUTracker):
+    """
+    A tracker using IoU as weight in Hungarian algorithm, also known
+    as Munkres or Kuhn-Munkres algorithm
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        video_height: int,
+        video_width: int,
+        max_num_instances: int = 200,
+        max_lost_frame_count: int = 0,
+        min_box_rel_dim: float = 0.02,
+        min_instance_period: int = 1,
+        track_iou_threshold: float = 0.5,
+        **kwargs,
+    ):
+        """
+        Args:
+        video_height: height the video frame
+        video_width: width of the video frame
+        max_num_instances: maximum number of id allowed to be tracked
+        max_lost_frame_count: maximum number of frame an id can lost tracking
+                              exceed this number, an id is considered as lost
+                              forever
+        min_box_rel_dim: a percentage, smaller than this dimension, a bbox is
+                         removed from tracking
+        min_instance_period: an instance will be shown after this number of period
+                             since its first showing up in the video
+        track_iou_threshold: iou threshold, below this number a bbox pair is removed
+                             from tracking
+        """
+        super().__init__(
+            video_height=video_height,
+            video_width=video_width,
+            max_num_instances=max_num_instances,
+            max_lost_frame_count=max_lost_frame_count,
+            min_box_rel_dim=min_box_rel_dim,
+            min_instance_period=min_instance_period,
+            track_iou_threshold=track_iou_threshold,
+        )
+
+    @classmethod
+    def from_config(cls, cfg: CfgNode_):
+        """
+        Old style initialization using CfgNode
+
+        Args:
+            cfg: D2 CfgNode, config file
+        Return:
+            dictionary storing arguments for __init__ method
+        """
+        assert "VIDEO_HEIGHT" in cfg.TRACKER_HEADS
+        assert "VIDEO_WIDTH" in cfg.TRACKER_HEADS
+        video_height = cfg.TRACKER_HEADS.get("VIDEO_HEIGHT")
+        video_width = cfg.TRACKER_HEADS.get("VIDEO_WIDTH")
+        max_num_instances = cfg.TRACKER_HEADS.get("MAX_NUM_INSTANCES", 200)
+        max_lost_frame_count = cfg.TRACKER_HEADS.get("MAX_LOST_FRAME_COUNT", 0)
+        min_box_rel_dim = cfg.TRACKER_HEADS.get("MIN_BOX_REL_DIM", 0.02)
+        min_instance_period = cfg.TRACKER_HEADS.get("MIN_INSTANCE_PERIOD", 1)
+        track_iou_threshold = cfg.TRACKER_HEADS.get("TRACK_IOU_THRESHOLD", 0.5)
+        return {
+            "_target_": "detectron2.tracking.iou_weighted_hungarian_bbox_iou_tracker.IOUWeightedHungarianBBoxIOUTracker",  # noqa
+            "video_height": video_height,
+            "video_width": video_width,
+            "max_num_instances": max_num_instances,
+            "max_lost_frame_count": max_lost_frame_count,
+            "min_box_rel_dim": min_box_rel_dim,
+            "min_instance_period": min_instance_period,
+            "track_iou_threshold": track_iou_threshold,
+        }
+
+    def assign_cost_matrix_values(self, cost_matrix: np.ndarray, bbox_pairs: List) -> np.ndarray:
+        """
+        Based on IoU for each pair of bbox, assign the associated value in cost matrix
+
+        Args:
+            cost_matrix: np.ndarray, initialized 2D array with target dimensions
+            bbox_pairs: list of bbox pair, in each pair, iou value is stored
+        Return:
+            np.ndarray, cost_matrix with assigned values
+        """
+        for pair in bbox_pairs:
+            # assign (-1 * IoU) for above threshold pairs, algorithms will minimize cost
+            cost_matrix[pair["idx"]][pair["prev_idx"]] = -1 * pair["IoU"]
+        return cost_matrix
diff --git a/detectron2/tracking/utils.py b/detectron2/tracking/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..92634c5cfe0c18eda00ce6c8bfe767ed20470a80
--- /dev/null
+++ b/detectron2/tracking/utils.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+import numpy as np
+from typing import List
+
+from detectron2.structures import Instances
+
+
+def create_prediction_pairs(
+    instances: Instances,
+    prev_instances: Instances,
+    iou_all: np.ndarray,
+    threshold: float = 0.5,
+) -> List:
+    """
+    Args:
+        instances: predictions from current frame
+        prev_instances: predictions from previous frame
+        iou_all: 2D numpy array containing iou for each bbox pair
+        threshold: below the threshold, doesn't consider the pair of bbox is valid
+    Return:
+        List of bbox pairs
+    """
+    bbox_pairs = []
+    for i in range(len(instances)):
+        for j in range(len(prev_instances)):
+            if iou_all[i, j] < threshold:
+                continue
+            bbox_pairs.append(
+                {
+                    "idx": i,
+                    "prev_idx": j,
+                    "prev_id": prev_instances.ID[j],
+                    "IoU": iou_all[i, j],
+                    "prev_period": prev_instances.ID_period[j],
+                }
+            )
+    return bbox_pairs
+
+
+LARGE_COST_VALUE = 100000
diff --git a/detectron2/tracking/vanilla_hungarian_bbox_iou_tracker.py b/detectron2/tracking/vanilla_hungarian_bbox_iou_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..5629f7383adcafeaa1ebdae1f38f968437149652
--- /dev/null
+++ b/detectron2/tracking/vanilla_hungarian_bbox_iou_tracker.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# Copyright 2004-present Facebook. All Rights Reserved.
+
+import numpy as np
+from typing import List
+
+from detectron2.config import CfgNode as CfgNode_
+from detectron2.config import configurable
+from detectron2.structures import Instances
+from detectron2.structures.boxes import pairwise_iou
+from detectron2.tracking.utils import LARGE_COST_VALUE, create_prediction_pairs
+
+from .base_tracker import TRACKER_HEADS_REGISTRY
+from .hungarian_tracker import BaseHungarianTracker
+
+
+@TRACKER_HEADS_REGISTRY.register()
+class VanillaHungarianBBoxIOUTracker(BaseHungarianTracker):
+    """
+    Hungarian algo based tracker using bbox iou as metric
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        video_height: int,
+        video_width: int,
+        max_num_instances: int = 200,
+        max_lost_frame_count: int = 0,
+        min_box_rel_dim: float = 0.02,
+        min_instance_period: int = 1,
+        track_iou_threshold: float = 0.5,
+        **kwargs,
+    ):
+        """
+        Args:
+        video_height: height the video frame
+        video_width: width of the video frame
+        max_num_instances: maximum number of id allowed to be tracked
+        max_lost_frame_count: maximum number of frame an id can lost tracking
+                              exceed this number, an id is considered as lost
+                              forever
+        min_box_rel_dim: a percentage, smaller than this dimension, a bbox is
+                         removed from tracking
+        min_instance_period: an instance will be shown after this number of period
+                             since its first showing up in the video
+        track_iou_threshold: iou threshold, below this number a bbox pair is removed
+                             from tracking
+        """
+        super().__init__(
+            video_height=video_height,
+            video_width=video_width,
+            max_num_instances=max_num_instances,
+            max_lost_frame_count=max_lost_frame_count,
+            min_box_rel_dim=min_box_rel_dim,
+            min_instance_period=min_instance_period,
+        )
+        self._track_iou_threshold = track_iou_threshold
+
+    @classmethod
+    def from_config(cls, cfg: CfgNode_):
+        """
+        Old style initialization using CfgNode
+
+        Args:
+            cfg: D2 CfgNode, config file
+        Return:
+            dictionary storing arguments for __init__ method
+        """
+        assert "VIDEO_HEIGHT" in cfg.TRACKER_HEADS
+        assert "VIDEO_WIDTH" in cfg.TRACKER_HEADS
+        video_height = cfg.TRACKER_HEADS.get("VIDEO_HEIGHT")
+        video_width = cfg.TRACKER_HEADS.get("VIDEO_WIDTH")
+        max_num_instances = cfg.TRACKER_HEADS.get("MAX_NUM_INSTANCES", 200)
+        max_lost_frame_count = cfg.TRACKER_HEADS.get("MAX_LOST_FRAME_COUNT", 0)
+        min_box_rel_dim = cfg.TRACKER_HEADS.get("MIN_BOX_REL_DIM", 0.02)
+        min_instance_period = cfg.TRACKER_HEADS.get("MIN_INSTANCE_PERIOD", 1)
+        track_iou_threshold = cfg.TRACKER_HEADS.get("TRACK_IOU_THRESHOLD", 0.5)
+        return {
+            "_target_": "detectron2.tracking.vanilla_hungarian_bbox_iou_tracker.VanillaHungarianBBoxIOUTracker",  # noqa
+            "video_height": video_height,
+            "video_width": video_width,
+            "max_num_instances": max_num_instances,
+            "max_lost_frame_count": max_lost_frame_count,
+            "min_box_rel_dim": min_box_rel_dim,
+            "min_instance_period": min_instance_period,
+            "track_iou_threshold": track_iou_threshold,
+        }
+
+    def build_cost_matrix(self, instances: Instances, prev_instances: Instances) -> np.ndarray:
+        """
+        Build the cost matrix for assignment problem
+        (https://en.wikipedia.org/wiki/Assignment_problem)
+
+        Args:
+            instances: D2 Instances, for current frame predictions
+            prev_instances: D2 Instances, for previous frame predictions
+
+        Return:
+            the cost matrix in numpy array
+        """
+        assert instances is not None and prev_instances is not None
+        # calculate IoU of all bbox pairs
+        iou_all = pairwise_iou(
+            boxes1=instances.pred_boxes,
+            boxes2=self._prev_instances.pred_boxes,
+        )
+        bbox_pairs = create_prediction_pairs(
+            instances, self._prev_instances, iou_all, threshold=self._track_iou_threshold
+        )
+        # assign large cost value to make sure pair below IoU threshold won't be matched
+        cost_matrix = np.full((len(instances), len(prev_instances)), LARGE_COST_VALUE)
+        return self.assign_cost_matrix_values(cost_matrix, bbox_pairs)
+
+    def assign_cost_matrix_values(self, cost_matrix: np.ndarray, bbox_pairs: List) -> np.ndarray:
+        """
+        Based on IoU for each pair of bbox, assign the associated value in cost matrix
+
+        Args:
+            cost_matrix: np.ndarray, initialized 2D array with target dimensions
+            bbox_pairs: list of bbox pair, in each pair, iou value is stored
+        Return:
+            np.ndarray, cost_matrix with assigned values
+        """
+        for pair in bbox_pairs:
+            # assign -1 for IoU above threshold pairs, algorithms will minimize cost
+            cost_matrix[pair["idx"]][pair["prev_idx"]] = -1
+        return cost_matrix
diff --git a/detectron2/utils/README.md b/detectron2/utils/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9765b24a730b77556104187ac3ef5439ab0859fd
--- /dev/null
+++ b/detectron2/utils/README.md
@@ -0,0 +1,5 @@
+# Utility functions
+
+This folder contain utility functions that are not used in the
+core library, but are useful for building models or training
+code using the config system.
diff --git a/detectron2/utils/__init__.py b/detectron2/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9020c2df23e2af280b7bb168b996ae9eaf312eb8
--- /dev/null
+++ b/detectron2/utils/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
diff --git a/detectron2/utils/__pycache__/__init__.cpython-310.pyc b/detectron2/utils/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea5000741b7b0f1a16206f5915d72e1b3edd5c72
Binary files /dev/null and b/detectron2/utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/__init__.cpython-39.pyc b/detectron2/utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..225431a9631114d1231f757752c2d8ddd2a99e46
Binary files /dev/null and b/detectron2/utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/collect_env.cpython-310.pyc b/detectron2/utils/__pycache__/collect_env.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b0998f7980a05c03f23b090b24a7dadd65d6145
Binary files /dev/null and b/detectron2/utils/__pycache__/collect_env.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/collect_env.cpython-39.pyc b/detectron2/utils/__pycache__/collect_env.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..955e41cfba47b0c453296567bd3dc348c176ee8e
Binary files /dev/null and b/detectron2/utils/__pycache__/collect_env.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/comm.cpython-310.pyc b/detectron2/utils/__pycache__/comm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..183722aee1a822aa4771af7ca69ce7d27ef523e6
Binary files /dev/null and b/detectron2/utils/__pycache__/comm.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/comm.cpython-39.pyc b/detectron2/utils/__pycache__/comm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7868802189580a6ccd490dec66f61ea85af8edfd
Binary files /dev/null and b/detectron2/utils/__pycache__/comm.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/develop.cpython-310.pyc b/detectron2/utils/__pycache__/develop.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3430876eecc9e06fb6d32c5b305cdc0b40bdd9a
Binary files /dev/null and b/detectron2/utils/__pycache__/develop.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/develop.cpython-39.pyc b/detectron2/utils/__pycache__/develop.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..635ac7cef101ef4943a9dd499894b076ae8c24f7
Binary files /dev/null and b/detectron2/utils/__pycache__/develop.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/env.cpython-310.pyc b/detectron2/utils/__pycache__/env.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a769e250d4053726b7678c03e23eab2b82bc56a
Binary files /dev/null and b/detectron2/utils/__pycache__/env.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/env.cpython-39.pyc b/detectron2/utils/__pycache__/env.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33bb882025037758b81ffb4f9f676812c8218b3a
Binary files /dev/null and b/detectron2/utils/__pycache__/env.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/events.cpython-310.pyc b/detectron2/utils/__pycache__/events.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e140c89658825a0369c53f411d2ea62ca342694c
Binary files /dev/null and b/detectron2/utils/__pycache__/events.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/events.cpython-39.pyc b/detectron2/utils/__pycache__/events.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86bcae79dcccbc4465d5e5bef8918e6304d97321
Binary files /dev/null and b/detectron2/utils/__pycache__/events.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/file_io.cpython-310.pyc b/detectron2/utils/__pycache__/file_io.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c6d3141d9f9874bc3613773bd1b517b83ec23a0
Binary files /dev/null and b/detectron2/utils/__pycache__/file_io.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/file_io.cpython-39.pyc b/detectron2/utils/__pycache__/file_io.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c51a005a8111895ad9120b6c10e1b04bdbe27072
Binary files /dev/null and b/detectron2/utils/__pycache__/file_io.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/logger.cpython-310.pyc b/detectron2/utils/__pycache__/logger.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..868d6df71a40f38920c9528e2aff7c240ce3d549
Binary files /dev/null and b/detectron2/utils/__pycache__/logger.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/logger.cpython-39.pyc b/detectron2/utils/__pycache__/logger.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15fee19ccb83809cbcdfb2469598c0620af3ae74
Binary files /dev/null and b/detectron2/utils/__pycache__/logger.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/memory.cpython-310.pyc b/detectron2/utils/__pycache__/memory.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b8d2495f749071a6cc9bbcb6196ff1496a65c3f
Binary files /dev/null and b/detectron2/utils/__pycache__/memory.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/memory.cpython-39.pyc b/detectron2/utils/__pycache__/memory.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4bce1dd1c19e8e49a9098828bc6d577f0910814
Binary files /dev/null and b/detectron2/utils/__pycache__/memory.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/registry.cpython-310.pyc b/detectron2/utils/__pycache__/registry.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e6802d84f9d0652df3a24af14e36f6f25bb1a68d
Binary files /dev/null and b/detectron2/utils/__pycache__/registry.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/registry.cpython-39.pyc b/detectron2/utils/__pycache__/registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0cf8bbd6826e3b3da863c8a933132ce3de309a55
Binary files /dev/null and b/detectron2/utils/__pycache__/registry.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/serialize.cpython-310.pyc b/detectron2/utils/__pycache__/serialize.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e7d5a6827667895809021424df8d72163742bbb
Binary files /dev/null and b/detectron2/utils/__pycache__/serialize.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/serialize.cpython-39.pyc b/detectron2/utils/__pycache__/serialize.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4aff0f39a38040935d2c6b8f1e2f8fb42ed4845
Binary files /dev/null and b/detectron2/utils/__pycache__/serialize.cpython-39.pyc differ
diff --git a/detectron2/utils/__pycache__/tracing.cpython-310.pyc b/detectron2/utils/__pycache__/tracing.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8e3b0f6d01cc9c3b28e05d82a5691df9c8c243c
Binary files /dev/null and b/detectron2/utils/__pycache__/tracing.cpython-310.pyc differ
diff --git a/detectron2/utils/__pycache__/tracing.cpython-39.pyc b/detectron2/utils/__pycache__/tracing.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6a2bd7de0c43d41cc2f0b11001bb116edf57cb0
Binary files /dev/null and b/detectron2/utils/__pycache__/tracing.cpython-39.pyc differ
diff --git a/detectron2/utils/analysis.py b/detectron2/utils/analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..178da7968cc08c29ec61b823bba8b74e8d97e1d6
--- /dev/null
+++ b/detectron2/utils/analysis.py
@@ -0,0 +1,188 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# -*- coding: utf-8 -*-
+
+import typing
+from typing import Any, List
+import fvcore
+from fvcore.nn import activation_count, flop_count, parameter_count, parameter_count_table
+from torch import nn
+
+from detectron2.export import TracingAdapter
+
+__all__ = [
+    "activation_count_operators",
+    "flop_count_operators",
+    "parameter_count_table",
+    "parameter_count",
+    "FlopCountAnalysis",
+]
+
+FLOPS_MODE = "flops"
+ACTIVATIONS_MODE = "activations"
+
+
+# Some extra ops to ignore from counting, including elementwise and reduction ops
+_IGNORED_OPS = {
+    "aten::add",
+    "aten::add_",
+    "aten::argmax",
+    "aten::argsort",
+    "aten::batch_norm",
+    "aten::constant_pad_nd",
+    "aten::div",
+    "aten::div_",
+    "aten::exp",
+    "aten::log2",
+    "aten::max_pool2d",
+    "aten::meshgrid",
+    "aten::mul",
+    "aten::mul_",
+    "aten::neg",
+    "aten::nonzero_numpy",
+    "aten::reciprocal",
+    "aten::repeat_interleave",
+    "aten::rsub",
+    "aten::sigmoid",
+    "aten::sigmoid_",
+    "aten::softmax",
+    "aten::sort",
+    "aten::sqrt",
+    "aten::sub",
+    "torchvision::nms",  # TODO estimate flop for nms
+}
+
+
+class FlopCountAnalysis(fvcore.nn.FlopCountAnalysis):
+    """
+    Same as :class:`fvcore.nn.FlopCountAnalysis`, but supports detectron2 models.
+    """
+
+    def __init__(self, model, inputs):
+        """
+        Args:
+            model (nn.Module):
+            inputs (Any): inputs of the given model. Does not have to be tuple of tensors.
+        """
+        wrapper = TracingAdapter(model, inputs, allow_non_tensor=True)
+        super().__init__(wrapper, wrapper.flattened_inputs)
+        self.set_op_handle(**{k: None for k in _IGNORED_OPS})
+
+
+def flop_count_operators(model: nn.Module, inputs: list) -> typing.DefaultDict[str, float]:
+    """
+    Implement operator-level flops counting using jit.
+    This is a wrapper of :func:`fvcore.nn.flop_count` and adds supports for standard
+    detection models in detectron2.
+    Please use :class:`FlopCountAnalysis` for more advanced functionalities.
+
+    Note:
+        The function runs the input through the model to compute flops.
+        The flops of a detection model is often input-dependent, for example,
+        the flops of box & mask head depends on the number of proposals &
+        the number of detected objects.
+        Therefore, the flops counting using a single input may not accurately
+        reflect the computation cost of a model. It's recommended to average
+        across a number of inputs.
+
+    Args:
+        model: a detectron2 model that takes `list[dict]` as input.
+        inputs (list[dict]): inputs to model, in detectron2's standard format.
+            Only "image" key will be used.
+        supported_ops (dict[str, Handle]): see documentation of :func:`fvcore.nn.flop_count`
+
+    Returns:
+        Counter: Gflop count per operator
+    """
+    old_train = model.training
+    model.eval()
+    ret = FlopCountAnalysis(model, inputs).by_operator()
+    model.train(old_train)
+    return {k: v / 1e9 for k, v in ret.items()}
+
+
+def activation_count_operators(
+    model: nn.Module, inputs: list, **kwargs
+) -> typing.DefaultDict[str, float]:
+    """
+    Implement operator-level activations counting using jit.
+    This is a wrapper of fvcore.nn.activation_count, that supports standard detection models
+    in detectron2.
+
+    Note:
+        The function runs the input through the model to compute activations.
+        The activations of a detection model is often input-dependent, for example,
+        the activations of box & mask head depends on the number of proposals &
+        the number of detected objects.
+
+    Args:
+        model: a detectron2 model that takes `list[dict]` as input.
+        inputs (list[dict]): inputs to model, in detectron2's standard format.
+            Only "image" key will be used.
+
+    Returns:
+        Counter: activation count per operator
+    """
+    return _wrapper_count_operators(model=model, inputs=inputs, mode=ACTIVATIONS_MODE, **kwargs)
+
+
+def _wrapper_count_operators(
+    model: nn.Module, inputs: list, mode: str, **kwargs
+) -> typing.DefaultDict[str, float]:
+    # ignore some ops
+    supported_ops = {k: lambda *args, **kwargs: {} for k in _IGNORED_OPS}
+    supported_ops.update(kwargs.pop("supported_ops", {}))
+    kwargs["supported_ops"] = supported_ops
+
+    assert len(inputs) == 1, "Please use batch size=1"
+    tensor_input = inputs[0]["image"]
+    inputs = [{"image": tensor_input}]  # remove other keys, in case there are any
+
+    old_train = model.training
+    if isinstance(model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)):
+        model = model.module
+    wrapper = TracingAdapter(model, inputs)
+    wrapper.eval()
+    if mode == FLOPS_MODE:
+        ret = flop_count(wrapper, (tensor_input,), **kwargs)
+    elif mode == ACTIVATIONS_MODE:
+        ret = activation_count(wrapper, (tensor_input,), **kwargs)
+    else:
+        raise NotImplementedError("Count for mode {} is not supported yet.".format(mode))
+    # compatible with change in fvcore
+    if isinstance(ret, tuple):
+        ret = ret[0]
+    model.train(old_train)
+    return ret
+
+
+def find_unused_parameters(model: nn.Module, inputs: Any) -> List[str]:
+    """
+    Given a model, find parameters that do not contribute
+    to the loss.
+
+    Args:
+        model: a model in training mode that returns losses
+        inputs: argument or a tuple of arguments. Inputs of the model
+
+    Returns:
+        list[str]: the name of unused parameters
+    """
+    assert model.training
+    for _, prm in model.named_parameters():
+        prm.grad = None
+
+    if isinstance(inputs, tuple):
+        losses = model(*inputs)
+    else:
+        losses = model(inputs)
+
+    if isinstance(losses, dict):
+        losses = sum(losses.values())
+    losses.backward()
+
+    unused: List[str] = []
+    for name, prm in model.named_parameters():
+        if prm.grad is None:
+            unused.append(name)
+        prm.grad = None
+    return unused
diff --git a/detectron2/utils/collect_env.py b/detectron2/utils/collect_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..27b38bc3bc6c6f1e49097c3406967d824eef3848
--- /dev/null
+++ b/detectron2/utils/collect_env.py
@@ -0,0 +1,272 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import importlib
+import os
+import re
+import subprocess
+import sys
+from collections import defaultdict
+
+import numpy as np
+import PIL
+import torch
+import torchvision
+from tabulate import tabulate
+
+__all__ = ["collect_env_info"]
+
+
+def collect_torch_env():
+    try:
+        import torch.__config__
+
+        return torch.__config__.show()
+    except ImportError:
+        # compatible with older versions of pytorch
+        from torch.utils.collect_env import get_pretty_env_info
+
+        return get_pretty_env_info()
+
+
+def get_env_module():
+    var_name = "DETECTRON2_ENV_MODULE"
+    return var_name, os.environ.get(var_name, "<not set>")
+
+
+def detect_compute_compatibility(CUDA_HOME, so_file):
+    try:
+        cuobjdump = os.path.join(CUDA_HOME, "bin", "cuobjdump")
+        if os.path.isfile(cuobjdump):
+            output = subprocess.check_output(
+                "'{}' --list-elf '{}'".format(cuobjdump, so_file), shell=True
+            )
+            output = output.decode("utf-8").strip().split("\n")
+            arch = []
+            for line in output:
+                line = re.findall(r"\.sm_([0-9]*)\.", line)[0]
+                arch.append(".".join(line))
+            arch = sorted(set(arch))
+            return ", ".join(arch)
+        else:
+            return so_file + "; cannot find cuobjdump"
+    except Exception:
+        # unhandled failure
+        return so_file
+
+
+def collect_env_info():
+    has_gpu = torch.cuda.is_available()  # true for both CUDA & ROCM
+    torch_version = torch.__version__
+
+    # NOTE that CUDA_HOME/ROCM_HOME could be None even when CUDA runtime libs are functional
+    from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
+
+    has_rocm = False
+    if (getattr(torch.version, "hip", None) is not None) and (ROCM_HOME is not None):
+        has_rocm = True
+    has_cuda = has_gpu and (not has_rocm)
+
+    data = []
+    data.append(("sys.platform", sys.platform))  # check-template.yml depends on it
+    data.append(("Python", sys.version.replace("\n", "")))
+    data.append(("numpy", np.__version__))
+
+    try:
+        import detectron2  # noqa
+
+        data.append(
+            (
+                "detectron2",
+                detectron2.__version__ + " @" + os.path.dirname(detectron2.__file__),
+            )
+        )
+    except ImportError:
+        data.append(("detectron2", "failed to import"))
+    except AttributeError:
+        data.append(("detectron2", "imported a wrong installation"))
+
+    try:
+        import detectron2._C as _C
+    except ImportError as e:
+        data.append(("detectron2._C", f"not built correctly: {e}"))
+
+        # print system compilers when extension fails to build
+        if sys.platform != "win32":  # don't know what to do for windows
+            try:
+                # this is how torch/utils/cpp_extensions.py choose compiler
+                cxx = os.environ.get("CXX", "c++")
+                cxx = subprocess.check_output("'{}' --version".format(cxx), shell=True)
+                cxx = cxx.decode("utf-8").strip().split("\n")[0]
+            except subprocess.SubprocessError:
+                cxx = "Not found"
+            data.append(("Compiler ($CXX)", cxx))
+
+            if has_cuda and CUDA_HOME is not None:
+                try:
+                    nvcc = os.path.join(CUDA_HOME, "bin", "nvcc")
+                    nvcc = subprocess.check_output("'{}' -V".format(nvcc), shell=True)
+                    nvcc = nvcc.decode("utf-8").strip().split("\n")[-1]
+                except subprocess.SubprocessError:
+                    nvcc = "Not found"
+                data.append(("CUDA compiler", nvcc))
+        if has_cuda and sys.platform != "win32":
+            try:
+                so_file = importlib.util.find_spec("detectron2._C").origin
+            except (ImportError, AttributeError):
+                pass
+            else:
+                data.append(
+                    (
+                        "detectron2 arch flags",
+                        detect_compute_compatibility(CUDA_HOME, so_file),
+                    )
+                )
+    else:
+        # print compilers that are used to build extension
+        data.append(("Compiler", _C.get_compiler_version()))
+        data.append(("CUDA compiler", _C.get_cuda_version()))  # cuda or hip
+        if has_cuda and getattr(_C, "has_cuda", lambda: True)():
+            data.append(
+                (
+                    "detectron2 arch flags",
+                    detect_compute_compatibility(CUDA_HOME, _C.__file__),
+                )
+            )
+
+    data.append(get_env_module())
+    data.append(("PyTorch", torch_version + " @" + os.path.dirname(torch.__file__)))
+    data.append(("PyTorch debug build", torch.version.debug))
+    try:
+        data.append(
+            ("torch._C._GLIBCXX_USE_CXX11_ABI", torch._C._GLIBCXX_USE_CXX11_ABI)
+        )
+    except Exception:
+        pass
+
+    if not has_gpu:
+        has_gpu_text = "No: torch.cuda.is_available() == False"
+    else:
+        has_gpu_text = "Yes"
+    data.append(("GPU available", has_gpu_text))
+    if has_gpu:
+        devices = defaultdict(list)
+        for k in range(torch.cuda.device_count()):
+            cap = ".".join((str(x) for x in torch.cuda.get_device_capability(k)))
+            name = torch.cuda.get_device_name(k) + f" (arch={cap})"
+            devices[name].append(str(k))
+        for name, devids in devices.items():
+            data.append(("GPU " + ",".join(devids), name))
+
+        if has_rocm:
+            msg = " - invalid!" if not (ROCM_HOME and os.path.isdir(ROCM_HOME)) else ""
+            data.append(("ROCM_HOME", str(ROCM_HOME) + msg))
+        else:
+            try:
+                from torch.utils.collect_env import (
+                    get_nvidia_driver_version,
+                    run as _run,
+                )
+
+                data.append(("Driver version", get_nvidia_driver_version(_run)))
+            except Exception:
+                pass
+            msg = " - invalid!" if not (CUDA_HOME and os.path.isdir(CUDA_HOME)) else ""
+            data.append(("CUDA_HOME", str(CUDA_HOME) + msg))
+
+            cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
+            if cuda_arch_list:
+                data.append(("TORCH_CUDA_ARCH_LIST", cuda_arch_list))
+    data.append(("Pillow", PIL.__version__))
+
+    try:
+        data.append(
+            (
+                "torchvision",
+                str(torchvision.__version__)
+                + " @"
+                + os.path.dirname(torchvision.__file__),
+            )
+        )
+        if has_cuda:
+            try:
+                torchvision_C = importlib.util.find_spec("torchvision._C").origin
+                msg = detect_compute_compatibility(CUDA_HOME, torchvision_C)
+                data.append(("torchvision arch flags", msg))
+            except (ImportError, AttributeError):
+                data.append(("torchvision._C", "Not found"))
+    except AttributeError:
+        data.append(("torchvision", "unknown"))
+
+    try:
+        import fvcore
+
+        data.append(("fvcore", fvcore.__version__))
+    except (ImportError, AttributeError):
+        pass
+
+    try:
+        import iopath
+
+        data.append(("iopath", iopath.__version__))
+    except (ImportError, AttributeError):
+        pass
+
+    try:
+        import cv2
+
+        data.append(("cv2", cv2.__version__))
+    except (ImportError, AttributeError):
+        data.append(("cv2", "Not found"))
+    env_str = tabulate(data) + "\n"
+    env_str += collect_torch_env()
+    return env_str
+
+
+def test_nccl_ops():
+    num_gpu = torch.cuda.device_count()
+    if os.access("/tmp", os.W_OK):
+        import torch.multiprocessing as mp
+
+        dist_url = "file:///tmp/nccl_tmp_file"
+        print("Testing NCCL connectivity ... this should not hang.")
+        mp.spawn(
+            _test_nccl_worker, nprocs=num_gpu, args=(num_gpu, dist_url), daemon=False
+        )
+        print("NCCL succeeded.")
+
+
+def _test_nccl_worker(rank, num_gpu, dist_url):
+    import torch.distributed as dist
+
+    dist.init_process_group(
+        backend="NCCL", init_method=dist_url, rank=rank, world_size=num_gpu
+    )
+    dist.barrier(device_ids=[rank])
+
+
+def main() -> None:
+    global x
+    try:
+        from detectron2.utils.collect_env import collect_env_info as f
+
+        print(f())
+    except ImportError:
+        print(collect_env_info())
+
+    if torch.cuda.is_available():
+        num_gpu = torch.cuda.device_count()
+        for k in range(num_gpu):
+            device = f"cuda:{k}"
+            try:
+                x = torch.tensor([1, 2.0], dtype=torch.float32)
+                x = x.to(device)
+            except Exception as e:
+                print(
+                    f"Unable to copy tensor to device={device}: {e}. "
+                    "Your CUDA environment is broken."
+                )
+        if num_gpu > 1:
+            test_nccl_ops()
+
+
+if __name__ == "__main__":
+    main()  # pragma: no cover
diff --git a/detectron2/utils/colormap.py b/detectron2/utils/colormap.py
new file mode 100644
index 0000000000000000000000000000000000000000..14ded1659b40b161358c4aaf9cc84ffe0ffafe64
--- /dev/null
+++ b/detectron2/utils/colormap.py
@@ -0,0 +1,158 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+An awesome colormap for really neat visualizations.
+Copied from Detectron, and removed gray colors.
+"""
+
+import numpy as np
+import random
+
+__all__ = ["colormap", "random_color", "random_colors"]
+
+# fmt: off
+# RGB:
+_COLORS = np.array(
+    [
+        0.000, 0.447, 0.741,
+        0.850, 0.325, 0.098,
+        0.929, 0.694, 0.125,
+        0.494, 0.184, 0.556,
+        0.466, 0.674, 0.188,
+        0.301, 0.745, 0.933,
+        0.635, 0.078, 0.184,
+        0.300, 0.300, 0.300,
+        0.600, 0.600, 0.600,
+        1.000, 0.000, 0.000,
+        1.000, 0.500, 0.000,
+        0.749, 0.749, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.333, 0.333, 0.000,
+        0.333, 0.667, 0.000,
+        0.333, 1.000, 0.000,
+        0.667, 0.333, 0.000,
+        0.667, 0.667, 0.000,
+        0.667, 1.000, 0.000,
+        1.000, 0.333, 0.000,
+        1.000, 0.667, 0.000,
+        1.000, 1.000, 0.000,
+        0.000, 0.333, 0.500,
+        0.000, 0.667, 0.500,
+        0.000, 1.000, 0.500,
+        0.333, 0.000, 0.500,
+        0.333, 0.333, 0.500,
+        0.333, 0.667, 0.500,
+        0.333, 1.000, 0.500,
+        0.667, 0.000, 0.500,
+        0.667, 0.333, 0.500,
+        0.667, 0.667, 0.500,
+        0.667, 1.000, 0.500,
+        1.000, 0.000, 0.500,
+        1.000, 0.333, 0.500,
+        1.000, 0.667, 0.500,
+        1.000, 1.000, 0.500,
+        0.000, 0.333, 1.000,
+        0.000, 0.667, 1.000,
+        0.000, 1.000, 1.000,
+        0.333, 0.000, 1.000,
+        0.333, 0.333, 1.000,
+        0.333, 0.667, 1.000,
+        0.333, 1.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.667, 0.333, 1.000,
+        0.667, 0.667, 1.000,
+        0.667, 1.000, 1.000,
+        1.000, 0.000, 1.000,
+        1.000, 0.333, 1.000,
+        1.000, 0.667, 1.000,
+        0.333, 0.000, 0.000,
+        0.500, 0.000, 0.000,
+        0.667, 0.000, 0.000,
+        0.833, 0.000, 0.000,
+        1.000, 0.000, 0.000,
+        0.000, 0.167, 0.000,
+        0.000, 0.333, 0.000,
+        0.000, 0.500, 0.000,
+        0.000, 0.667, 0.000,
+        0.000, 0.833, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 0.167,
+        0.000, 0.000, 0.333,
+        0.000, 0.000, 0.500,
+        0.000, 0.000, 0.667,
+        0.000, 0.000, 0.833,
+        0.000, 0.000, 1.000,
+        0.000, 0.000, 0.000,
+        0.143, 0.143, 0.143,
+        0.857, 0.857, 0.857,
+        1.000, 1.000, 1.000
+    ]
+).astype(np.float32).reshape(-1, 3)
+# fmt: on
+
+
+def colormap(rgb=False, maximum=255):
+    """
+    Args:
+        rgb (bool): whether to return RGB colors or BGR colors.
+        maximum (int): either 255 or 1
+
+    Returns:
+        ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1]
+    """
+    assert maximum in [255, 1], maximum
+    c = _COLORS * maximum
+    if not rgb:
+        c = c[:, ::-1]
+    return c
+
+
+def random_color(rgb=False, maximum=255):
+    """
+    Args:
+        rgb (bool): whether to return RGB colors or BGR colors.
+        maximum (int): either 255 or 1
+
+    Returns:
+        ndarray: a vector of 3 numbers
+    """
+    idx = np.random.randint(0, len(_COLORS))
+    ret = _COLORS[idx] * maximum
+    if not rgb:
+        ret = ret[::-1]
+    return ret
+
+
+def random_colors(N, rgb=False, maximum=255):
+    """
+    Args:
+        N (int): number of unique colors needed
+        rgb (bool): whether to return RGB colors or BGR colors.
+        maximum (int): either 255 or 1
+
+    Returns:
+        ndarray: a list of random_color
+    """
+    indices = random.sample(range(len(_COLORS)), N)
+    ret = [_COLORS[i] * maximum for i in indices]
+    if not rgb:
+        ret = [x[::-1] for x in ret]
+    return ret
+
+
+if __name__ == "__main__":
+    import cv2
+
+    size = 100
+    H, W = 10, 10
+    canvas = np.random.rand(H * size, W * size, 3).astype("float32")
+    for h in range(H):
+        for w in range(W):
+            idx = h * W + w
+            if idx >= len(_COLORS):
+                break
+            canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx]
+    cv2.imshow("a", canvas)
+    cv2.waitKey(0)
diff --git a/detectron2/utils/comm.py b/detectron2/utils/comm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9ea9a9f578c5704d1e7ff563ef156e9133ab465
--- /dev/null
+++ b/detectron2/utils/comm.py
@@ -0,0 +1,238 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+This file contains primitives for multi-gpu communication.
+This is useful when doing distributed training.
+"""
+
+import functools
+import numpy as np
+import torch
+import torch.distributed as dist
+
+_LOCAL_PROCESS_GROUP = None
+_MISSING_LOCAL_PG_ERROR = (
+    "Local process group is not yet created! Please use detectron2's `launch()` "
+    "to start processes and initialize pytorch process group. If you need to start "
+    "processes in other ways, please call comm.create_local_process_group("
+    "num_workers_per_machine) after calling torch.distributed.init_process_group()."
+)
+
+
+def get_world_size() -> int:
+    if not dist.is_available():
+        return 1
+    if not dist.is_initialized():
+        return 1
+    return dist.get_world_size()
+
+
+def get_rank() -> int:
+    if not dist.is_available():
+        return 0
+    if not dist.is_initialized():
+        return 0
+    return dist.get_rank()
+
+
+@functools.lru_cache()
+def create_local_process_group(num_workers_per_machine: int) -> None:
+    """
+    Create a process group that contains ranks within the same machine.
+
+    Detectron2's launch() in engine/launch.py will call this function. If you start
+    workers without launch(), you'll have to also call this. Otherwise utilities
+    like `get_local_rank()` will not work.
+
+    This function contains a barrier. All processes must call it together.
+
+    Args:
+        num_workers_per_machine: the number of worker processes per machine. Typically
+          the number of GPUs.
+    """
+    global _LOCAL_PROCESS_GROUP
+    assert _LOCAL_PROCESS_GROUP is None
+    assert get_world_size() % num_workers_per_machine == 0
+    num_machines = get_world_size() // num_workers_per_machine
+    machine_rank = get_rank() // num_workers_per_machine
+    for i in range(num_machines):
+        ranks_on_i = list(range(i * num_workers_per_machine, (i + 1) * num_workers_per_machine))
+        pg = dist.new_group(ranks_on_i)
+        if i == machine_rank:
+            _LOCAL_PROCESS_GROUP = pg
+
+
+def get_local_process_group():
+    """
+    Returns:
+        A torch process group which only includes processes that are on the same
+        machine as the current process. This group can be useful for communication
+        within a machine, e.g. a per-machine SyncBN.
+    """
+    assert _LOCAL_PROCESS_GROUP is not None, _MISSING_LOCAL_PG_ERROR
+    return _LOCAL_PROCESS_GROUP
+
+
+def get_local_rank() -> int:
+    """
+    Returns:
+        The rank of the current process within the local (per-machine) process group.
+    """
+    if not dist.is_available():
+        return 0
+    if not dist.is_initialized():
+        return 0
+    assert _LOCAL_PROCESS_GROUP is not None, _MISSING_LOCAL_PG_ERROR
+    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
+
+
+def get_local_size() -> int:
+    """
+    Returns:
+        The size of the per-machine process group,
+        i.e. the number of processes per machine.
+    """
+    if not dist.is_available():
+        return 1
+    if not dist.is_initialized():
+        return 1
+    assert _LOCAL_PROCESS_GROUP is not None, _MISSING_LOCAL_PG_ERROR
+    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
+
+
+def is_main_process() -> bool:
+    return get_rank() == 0
+
+
+def synchronize():
+    """
+    Helper function to synchronize (barrier) among all processes when
+    using distributed training
+    """
+    if not dist.is_available():
+        return
+    if not dist.is_initialized():
+        return
+    world_size = dist.get_world_size()
+    if world_size == 1:
+        return
+    if dist.get_backend() == dist.Backend.NCCL:
+        # This argument is needed to avoid warnings.
+        # It's valid only for NCCL backend.
+        dist.barrier(device_ids=[torch.cuda.current_device()])
+    else:
+        dist.barrier()
+
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+    """
+    Return a process group based on gloo backend, containing all the ranks
+    The result is cached.
+    """
+    if dist.get_backend() == "nccl":
+        return dist.new_group(backend="gloo")
+    else:
+        return dist.group.WORLD
+
+
+def all_gather(data, group=None):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors).
+
+    Args:
+        data: any picklable object
+        group: a torch process group. By default, will use a group which
+            contains all ranks on gloo backend.
+
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+    if get_world_size() == 1:
+        return [data]
+    if group is None:
+        group = _get_global_gloo_group()  # use CPU group by default, to reduce GPU RAM usage.
+    world_size = dist.get_world_size(group)
+    if world_size == 1:
+        return [data]
+
+    output = [None for _ in range(world_size)]
+    dist.all_gather_object(output, data, group=group)
+    return output
+
+
+def gather(data, dst=0, group=None):
+    """
+    Run gather on arbitrary picklable data (not necessarily tensors).
+
+    Args:
+        data: any picklable object
+        dst (int): destination rank
+        group: a torch process group. By default, will use a group which
+            contains all ranks on gloo backend.
+
+    Returns:
+        list[data]: on dst, a list of data gathered from each rank. Otherwise,
+            an empty list.
+    """
+    if get_world_size() == 1:
+        return [data]
+    if group is None:
+        group = _get_global_gloo_group()
+    world_size = dist.get_world_size(group=group)
+    if world_size == 1:
+        return [data]
+    rank = dist.get_rank(group=group)
+
+    if rank == dst:
+        output = [None for _ in range(world_size)]
+        dist.gather_object(data, output, dst=dst, group=group)
+        return output
+    else:
+        dist.gather_object(data, None, dst=dst, group=group)
+        return []
+
+
+def shared_random_seed():
+    """
+    Returns:
+        int: a random number that is the same across all workers.
+        If workers need a shared RNG, they can use this shared seed to
+        create one.
+
+    All workers must call this function, otherwise it will deadlock.
+    """
+    ints = np.random.randint(2**31)
+    all_ints = all_gather(ints)
+    return all_ints[0]
+
+
+def reduce_dict(input_dict, average=True):
+    """
+    Reduce the values in the dictionary from all processes so that process with rank
+    0 has the reduced results.
+
+    Args:
+        input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor.
+        average (bool): whether to do average or sum
+
+    Returns:
+        a dict with the same keys as input_dict, after reduction.
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return input_dict
+    with torch.no_grad():
+        names = []
+        values = []
+        # sort the keys so that they are consistent across processes
+        for k in sorted(input_dict.keys()):
+            names.append(k)
+            values.append(input_dict[k])
+        values = torch.stack(values, dim=0)
+        dist.reduce(values, dst=0)
+        if dist.get_rank() == 0 and average:
+            # only main process gets accumulated, so only divide by
+            # world_size in this case
+            values /= world_size
+        reduced_dict = {k: v for k, v in zip(names, values)}
+    return reduced_dict
diff --git a/detectron2/utils/develop.py b/detectron2/utils/develop.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8416984954f7b32fc269100620e3c0d0d0f9585
--- /dev/null
+++ b/detectron2/utils/develop.py
@@ -0,0 +1,59 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+""" Utilities for developers only.
+These are not visible to users (not automatically imported). And should not
+appeared in docs."""
+# adapted from https://github.com/tensorpack/tensorpack/blob/master/tensorpack/utils/develop.py
+
+
+def create_dummy_class(klass, dependency, message=""):
+    """
+    When a dependency of a class is not available, create a dummy class which throws ImportError
+    when used.
+
+    Args:
+        klass (str): name of the class.
+        dependency (str): name of the dependency.
+        message: extra message to print
+    Returns:
+        class: a class object
+    """
+    err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass)
+    if message:
+        err = err + " " + message
+
+    class _DummyMetaClass(type):
+        # throw error on class attribute access
+        def __getattr__(_, __):  # noqa: B902
+            raise ImportError(err)
+
+    class _Dummy(object, metaclass=_DummyMetaClass):
+        # throw error on constructor
+        def __init__(self, *args, **kwargs):
+            raise ImportError(err)
+
+    return _Dummy
+
+
+def create_dummy_func(func, dependency, message=""):
+    """
+    When a dependency of a function is not available, create a dummy function which throws
+    ImportError when used.
+
+    Args:
+        func (str): name of the function.
+        dependency (str or list[str]): name(s) of the dependency.
+        message: extra message to print
+    Returns:
+        function: a function object
+    """
+    err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func)
+    if message:
+        err = err + " " + message
+
+    if isinstance(dependency, (list, tuple)):
+        dependency = ",".join(dependency)
+
+    def _dummy(*args, **kwargs):
+        raise ImportError(err)
+
+    return _dummy
diff --git a/detectron2/utils/env.py b/detectron2/utils/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..40634c17c73273ac8927632be164f466cfe7d1fa
--- /dev/null
+++ b/detectron2/utils/env.py
@@ -0,0 +1,170 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import importlib
+import importlib.util
+import logging
+import numpy as np
+import os
+import random
+import sys
+from datetime import datetime
+import torch
+
+__all__ = ["seed_all_rng"]
+
+
+TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2])
+"""
+PyTorch version as a tuple of 2 ints. Useful for comparison.
+"""
+
+
+DOC_BUILDING = os.getenv("_DOC_BUILDING", False)  # set in docs/conf.py
+"""
+Whether we're building documentation.
+"""
+
+
+def seed_all_rng(seed=None):
+    """
+    Set the random seed for the RNG in torch, numpy and python.
+
+    Args:
+        seed (int): if None, will use a strong random seed.
+    """
+    if seed is None:
+        seed = (
+            os.getpid()
+            + int(datetime.now().strftime("%S%f"))
+            + int.from_bytes(os.urandom(2), "big")
+        )
+        logger = logging.getLogger(__name__)
+        logger.info("Using a generated random seed {}".format(seed))
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    random.seed(seed)
+    os.environ["PYTHONHASHSEED"] = str(seed)
+
+
+# from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
+def _import_file(module_name, file_path, make_importable=False):
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    if make_importable:
+        sys.modules[module_name] = module
+    return module
+
+
+def _configure_libraries():
+    """
+    Configurations for some libraries.
+    """
+    # An environment option to disable `import cv2` globally,
+    # in case it leads to negative performance impact
+    disable_cv2 = int(os.environ.get("DETECTRON2_DISABLE_CV2", False))
+    if disable_cv2:
+        sys.modules["cv2"] = None
+    else:
+        # Disable opencl in opencv since its interaction with cuda often has negative effects
+        # This envvar is supported after OpenCV 3.4.0
+        os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
+        try:
+            import cv2
+
+            if int(cv2.__version__.split(".")[0]) >= 3:
+                cv2.ocl.setUseOpenCL(False)
+        except ModuleNotFoundError:
+            # Other types of ImportError, if happened, should not be ignored.
+            # Because a failed opencv import could mess up address space
+            # https://github.com/skvark/opencv-python/issues/381
+            pass
+
+    def get_version(module, digit=2):
+        return tuple(map(int, module.__version__.split(".")[:digit]))
+
+    # fmt: off
+    assert get_version(torch) >= (1, 4), "Requires torch>=1.4"
+    import fvcore
+    assert get_version(fvcore, 3) >= (0, 1, 2), "Requires fvcore>=0.1.2"
+    import yaml
+    assert get_version(yaml) >= (5, 1), "Requires pyyaml>=5.1"
+    # fmt: on
+
+
+_ENV_SETUP_DONE = False
+
+
+def setup_environment():
+    """Perform environment setup work. The default setup is a no-op, but this
+    function allows the user to specify a Python source file or a module in
+    the $DETECTRON2_ENV_MODULE environment variable, that performs
+    custom setup work that may be necessary to their computing environment.
+    """
+    global _ENV_SETUP_DONE
+    if _ENV_SETUP_DONE:
+        return
+    _ENV_SETUP_DONE = True
+
+    _configure_libraries()
+
+    custom_module_path = os.environ.get("DETECTRON2_ENV_MODULE")
+
+    if custom_module_path:
+        setup_custom_environment(custom_module_path)
+    else:
+        # The default setup is a no-op
+        pass
+
+
+def setup_custom_environment(custom_module):
+    """
+    Load custom environment setup by importing a Python source file or a
+    module, and run the setup function.
+    """
+    if custom_module.endswith(".py"):
+        module = _import_file("detectron2.utils.env.custom_module", custom_module)
+    else:
+        module = importlib.import_module(custom_module)
+    assert hasattr(module, "setup_environment") and callable(module.setup_environment), (
+        "Custom environment module defined in {} does not have the "
+        "required callable attribute 'setup_environment'."
+    ).format(custom_module)
+    module.setup_environment()
+
+
+def fixup_module_metadata(module_name, namespace, keys=None):
+    """
+    Fix the __qualname__ of module members to be their exported api name, so
+    when they are referenced in docs, sphinx can find them. Reference:
+    https://github.com/python-trio/trio/blob/6754c74eacfad9cc5c92d5c24727a2f3b620624e/trio/_util.py#L216-L241
+    """
+    if not DOC_BUILDING:
+        return
+    seen_ids = set()
+
+    def fix_one(qualname, name, obj):
+        # avoid infinite recursion (relevant when using
+        # typing.Generic, for example)
+        if id(obj) in seen_ids:
+            return
+        seen_ids.add(id(obj))
+
+        mod = getattr(obj, "__module__", None)
+        if mod is not None and (mod.startswith(module_name) or mod.startswith("fvcore.")):
+            obj.__module__ = module_name
+            # Modules, unlike everything else in Python, put fully-qualitied
+            # names into their __name__ attribute. We check for "." to avoid
+            # rewriting these.
+            if hasattr(obj, "__name__") and "." not in obj.__name__:
+                obj.__name__ = name
+                obj.__qualname__ = qualname
+            if isinstance(obj, type):
+                for attr_name, attr_value in obj.__dict__.items():
+                    fix_one(objname + "." + attr_name, attr_name, attr_value)
+
+    if keys is None:
+        keys = namespace.keys()
+    for objname in keys:
+        if not objname.startswith("_"):
+            obj = namespace[objname]
+            fix_one(objname, objname, obj)
diff --git a/detectron2/utils/events.py b/detectron2/utils/events.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d582a9a1683c2bf3a0452a81b7e1c869789e57e
--- /dev/null
+++ b/detectron2/utils/events.py
@@ -0,0 +1,551 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import datetime
+import json
+import logging
+import os
+import time
+from collections import defaultdict
+from contextlib import contextmanager
+from functools import cached_property
+from typing import Optional
+import torch
+from fvcore.common.history_buffer import HistoryBuffer
+
+from detectron2.utils.file_io import PathManager
+
+__all__ = [
+    "get_event_storage",
+    "has_event_storage",
+    "JSONWriter",
+    "TensorboardXWriter",
+    "CommonMetricPrinter",
+    "EventStorage",
+]
+
+_CURRENT_STORAGE_STACK = []
+
+
+def get_event_storage():
+    """
+    Returns:
+        The :class:`EventStorage` object that's currently being used.
+        Throws an error if no :class:`EventStorage` is currently enabled.
+    """
+    assert len(
+        _CURRENT_STORAGE_STACK
+    ), "get_event_storage() has to be called inside a 'with EventStorage(...)' context!"
+    return _CURRENT_STORAGE_STACK[-1]
+
+
+def has_event_storage():
+    """
+    Returns:
+        Check if there are EventStorage() context existed.
+    """
+    return len(_CURRENT_STORAGE_STACK) > 0
+
+
+class EventWriter:
+    """
+    Base class for writers that obtain events from :class:`EventStorage` and process them.
+    """
+
+    def write(self):
+        raise NotImplementedError
+
+    def close(self):
+        pass
+
+
+class JSONWriter(EventWriter):
+    """
+    Write scalars to a json file.
+
+    It saves scalars as one json per line (instead of a big json) for easy parsing.
+
+    Examples parsing such a json file:
+    ::
+        $ cat metrics.json | jq -s '.[0:2]'
+        [
+          {
+            "data_time": 0.008433341979980469,
+            "iteration": 19,
+            "loss": 1.9228371381759644,
+            "loss_box_reg": 0.050025828182697296,
+            "loss_classifier": 0.5316952466964722,
+            "loss_mask": 0.7236229181289673,
+            "loss_rpn_box": 0.0856662318110466,
+            "loss_rpn_cls": 0.48198649287223816,
+            "lr": 0.007173333333333333,
+            "time": 0.25401854515075684
+          },
+          {
+            "data_time": 0.007216215133666992,
+            "iteration": 39,
+            "loss": 1.282649278640747,
+            "loss_box_reg": 0.06222952902317047,
+            "loss_classifier": 0.30682939291000366,
+            "loss_mask": 0.6970193982124329,
+            "loss_rpn_box": 0.038663312792778015,
+            "loss_rpn_cls": 0.1471673548221588,
+            "lr": 0.007706666666666667,
+            "time": 0.2490077018737793
+          }
+        ]
+
+        $ cat metrics.json | jq '.loss_mask'
+        0.7126231789588928
+        0.689423680305481
+        0.6776131987571716
+        ...
+
+    """
+
+    def __init__(self, json_file, window_size=20):
+        """
+        Args:
+            json_file (str): path to the json file. New data will be appended if the file exists.
+            window_size (int): the window size of median smoothing for the scalars whose
+                `smoothing_hint` are True.
+        """
+        self._file_handle = PathManager.open(json_file, "a")
+        self._window_size = window_size
+        self._last_write = -1
+
+    def write(self):
+        storage = get_event_storage()
+        to_save = defaultdict(dict)
+
+        for k, (v, iter) in storage.latest_with_smoothing_hint(self._window_size).items():
+            # keep scalars that have not been written
+            if iter <= self._last_write:
+                continue
+            to_save[iter][k] = v
+        if len(to_save):
+            all_iters = sorted(to_save.keys())
+            self._last_write = max(all_iters)
+
+        for itr, scalars_per_iter in to_save.items():
+            scalars_per_iter["iteration"] = itr
+            self._file_handle.write(json.dumps(scalars_per_iter, sort_keys=True) + "\n")
+        self._file_handle.flush()
+        try:
+            os.fsync(self._file_handle.fileno())
+        except AttributeError:
+            pass
+
+    def close(self):
+        self._file_handle.close()
+
+
+class TensorboardXWriter(EventWriter):
+    """
+    Write all scalars to a tensorboard file.
+    """
+
+    def __init__(self, log_dir: str, window_size: int = 20, **kwargs):
+        """
+        Args:
+            log_dir (str): the directory to save the output events
+            window_size (int): the scalars will be median-smoothed by this window size
+
+            kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)`
+        """
+        self._window_size = window_size
+        self._writer_args = {"log_dir": log_dir, **kwargs}
+        self._last_write = -1
+
+    @cached_property
+    def _writer(self):
+        from torch.utils.tensorboard import SummaryWriter
+
+        return SummaryWriter(**self._writer_args)
+
+    def write(self):
+        storage = get_event_storage()
+        new_last_write = self._last_write
+        for k, (v, iter) in storage.latest_with_smoothing_hint(self._window_size).items():
+            if iter > self._last_write:
+                self._writer.add_scalar(k, v, iter)
+                new_last_write = max(new_last_write, iter)
+        self._last_write = new_last_write
+
+        # storage.put_{image,histogram} is only meant to be used by
+        # tensorboard writer. So we access its internal fields directly from here.
+        if len(storage._vis_data) >= 1:
+            for img_name, img, step_num in storage._vis_data:
+                self._writer.add_image(img_name, img, step_num)
+            # Storage stores all image data and rely on this writer to clear them.
+            # As a result it assumes only one writer will use its image data.
+            # An alternative design is to let storage store limited recent
+            # data (e.g. only the most recent image) that all writers can access.
+            # In that case a writer may not see all image data if its period is long.
+            storage.clear_images()
+
+        if len(storage._histograms) >= 1:
+            for params in storage._histograms:
+                self._writer.add_histogram_raw(**params)
+            storage.clear_histograms()
+
+    def close(self):
+        if "_writer" in self.__dict__:
+            self._writer.close()
+
+
+class CommonMetricPrinter(EventWriter):
+    """
+    Print **common** metrics to the terminal, including
+    iteration time, ETA, memory, all losses, and the learning rate.
+    It also applies smoothing using a window of 20 elements.
+
+    It's meant to print common metrics in common ways.
+    To print something in more customized ways, please implement a similar printer by yourself.
+    """
+
+    def __init__(self, max_iter: Optional[int] = None, window_size: int = 20):
+        """
+        Args:
+            max_iter: the maximum number of iterations to train.
+                Used to compute ETA. If not given, ETA will not be printed.
+            window_size (int): the losses will be median-smoothed by this window size
+        """
+        self.logger = logging.getLogger("detectron2.utils.events")
+        self._max_iter = max_iter
+        self._window_size = window_size
+        self._last_write = None  # (step, time) of last call to write(). Used to compute ETA
+
+    def _get_eta(self, storage) -> Optional[str]:
+        if self._max_iter is None:
+            return ""
+        iteration = storage.iter
+        try:
+            eta_seconds = storage.history("time").median(1000) * (self._max_iter - iteration - 1)
+            storage.put_scalar("eta_seconds", eta_seconds, smoothing_hint=False)
+            return str(datetime.timedelta(seconds=int(eta_seconds)))
+        except KeyError:
+            # estimate eta on our own - more noisy
+            eta_string = None
+            if self._last_write is not None:
+                estimate_iter_time = (time.perf_counter() - self._last_write[1]) / (
+                    iteration - self._last_write[0]
+                )
+                eta_seconds = estimate_iter_time * (self._max_iter - iteration - 1)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+            self._last_write = (iteration, time.perf_counter())
+            return eta_string
+
+    def write(self):
+        storage = get_event_storage()
+        iteration = storage.iter
+        if iteration == self._max_iter:
+            # This hook only reports training progress (loss, ETA, etc) but not other data,
+            # therefore do not write anything after training succeeds, even if this method
+            # is called.
+            return
+
+        try:
+            avg_data_time = storage.history("data_time").avg(
+                storage.count_samples("data_time", self._window_size)
+            )
+            last_data_time = storage.history("data_time").latest()
+        except KeyError:
+            # they may not exist in the first few iterations (due to warmup)
+            # or when SimpleTrainer is not used
+            avg_data_time = None
+            last_data_time = None
+        try:
+            avg_iter_time = storage.history("time").global_avg()
+            last_iter_time = storage.history("time").latest()
+        except KeyError:
+            avg_iter_time = None
+            last_iter_time = None
+        try:
+            lr = "{:.5g}".format(storage.history("lr").latest())
+        except KeyError:
+            lr = "N/A"
+
+        eta_string = self._get_eta(storage)
+
+        if torch.cuda.is_available():
+            max_mem_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
+        else:
+            max_mem_mb = None
+
+        # NOTE: max_mem is parsed by grep in "dev/parse_results.sh"
+        self.logger.info(
+            str.format(
+                " {eta}iter: {iter}  {losses}  {non_losses}  {avg_time}{last_time}"
+                + "{avg_data_time}{last_data_time} lr: {lr}  {memory}",
+                eta=f"eta: {eta_string}  " if eta_string else "",
+                iter=iteration,
+                losses="  ".join(
+                    [
+                        "{}: {:.4g}".format(
+                            k, v.median(storage.count_samples(k, self._window_size))
+                        )
+                        for k, v in storage.histories().items()
+                        if "loss" in k
+                    ]
+                ),
+                non_losses="  ".join(
+                    [
+                        "{}: {:.4g}".format(
+                            k, v.median(storage.count_samples(k, self._window_size))
+                        )
+                        for k, v in storage.histories().items()
+                        if "[metric]" in k
+                    ]
+                ),
+                avg_time="time: {:.4f}  ".format(avg_iter_time)
+                if avg_iter_time is not None
+                else "",
+                last_time="last_time: {:.4f}  ".format(last_iter_time)
+                if last_iter_time is not None
+                else "",
+                avg_data_time="data_time: {:.4f}  ".format(avg_data_time)
+                if avg_data_time is not None
+                else "",
+                last_data_time="last_data_time: {:.4f}  ".format(last_data_time)
+                if last_data_time is not None
+                else "",
+                lr=lr,
+                memory="max_mem: {:.0f}M".format(max_mem_mb) if max_mem_mb is not None else "",
+            )
+        )
+
+
+class EventStorage:
+    """
+    The user-facing class that provides metric storage functionalities.
+
+    In the future we may add support for storing / logging other types of data if needed.
+    """
+
+    def __init__(self, start_iter=0):
+        """
+        Args:
+            start_iter (int): the iteration number to start with
+        """
+        self._history = defaultdict(HistoryBuffer)
+        self._smoothing_hints = {}
+        self._latest_scalars = {}
+        self._iter = start_iter
+        self._current_prefix = ""
+        self._vis_data = []
+        self._histograms = []
+
+    def put_image(self, img_name, img_tensor):
+        """
+        Add an `img_tensor` associated with `img_name`, to be shown on
+        tensorboard.
+
+        Args:
+            img_name (str): The name of the image to put into tensorboard.
+            img_tensor (torch.Tensor or numpy.array): An `uint8` or `float`
+                Tensor of shape `[channel, height, width]` where `channel` is
+                3. The image format should be RGB. The elements in img_tensor
+                can either have values in [0, 1] (float32) or [0, 255] (uint8).
+                The `img_tensor` will be visualized in tensorboard.
+        """
+        self._vis_data.append((img_name, img_tensor, self._iter))
+
+    def put_scalar(self, name, value, smoothing_hint=True, cur_iter=None):
+        """
+        Add a scalar `value` to the `HistoryBuffer` associated with `name`.
+
+        Args:
+            smoothing_hint (bool): a 'hint' on whether this scalar is noisy and should be
+                smoothed when logged. The hint will be accessible through
+                :meth:`EventStorage.smoothing_hints`.  A writer may ignore the hint
+                and apply custom smoothing rule.
+
+                It defaults to True because most scalars we save need to be smoothed to
+                provide any useful signal.
+            cur_iter (int): an iteration number to set explicitly instead of current iteration
+        """
+        name = self._current_prefix + name
+        cur_iter = self._iter if cur_iter is None else cur_iter
+        history = self._history[name]
+        value = float(value)
+        history.update(value, cur_iter)
+        self._latest_scalars[name] = (value, cur_iter)
+
+        existing_hint = self._smoothing_hints.get(name)
+
+        if existing_hint is not None:
+            assert (
+                existing_hint == smoothing_hint
+            ), "Scalar {} was put with a different smoothing_hint!".format(name)
+        else:
+            self._smoothing_hints[name] = smoothing_hint
+
+    def put_scalars(self, *, smoothing_hint=True, cur_iter=None, **kwargs):
+        """
+        Put multiple scalars from keyword arguments.
+
+        Examples:
+
+            storage.put_scalars(loss=my_loss, accuracy=my_accuracy, smoothing_hint=True)
+        """
+        for k, v in kwargs.items():
+            self.put_scalar(k, v, smoothing_hint=smoothing_hint, cur_iter=cur_iter)
+
+    def put_histogram(self, hist_name, hist_tensor, bins=1000):
+        """
+        Create a histogram from a tensor.
+
+        Args:
+            hist_name (str): The name of the histogram to put into tensorboard.
+            hist_tensor (torch.Tensor): A Tensor of arbitrary shape to be converted
+                into a histogram.
+            bins (int): Number of histogram bins.
+        """
+        ht_min, ht_max = hist_tensor.min().item(), hist_tensor.max().item()
+
+        # Create a histogram with PyTorch
+        hist_counts = torch.histc(hist_tensor, bins=bins)
+        hist_edges = torch.linspace(start=ht_min, end=ht_max, steps=bins + 1, dtype=torch.float32)
+
+        # Parameter for the add_histogram_raw function of SummaryWriter
+        hist_params = dict(
+            tag=hist_name,
+            min=ht_min,
+            max=ht_max,
+            num=len(hist_tensor),
+            sum=float(hist_tensor.sum()),
+            sum_squares=float(torch.sum(hist_tensor**2)),
+            bucket_limits=hist_edges[1:].tolist(),
+            bucket_counts=hist_counts.tolist(),
+            global_step=self._iter,
+        )
+        self._histograms.append(hist_params)
+
+    def history(self, name):
+        """
+        Returns:
+            HistoryBuffer: the scalar history for name
+        """
+        ret = self._history.get(name, None)
+        if ret is None:
+            raise KeyError("No history metric available for {}!".format(name))
+        return ret
+
+    def histories(self):
+        """
+        Returns:
+            dict[name -> HistoryBuffer]: the HistoryBuffer for all scalars
+        """
+        return self._history
+
+    def latest(self):
+        """
+        Returns:
+            dict[str -> (float, int)]: mapping from the name of each scalar to the most
+                recent value and the iteration number its added.
+        """
+        return self._latest_scalars
+
+    def latest_with_smoothing_hint(self, window_size=20):
+        """
+        Similar to :meth:`latest`, but the returned values
+        are either the un-smoothed original latest value,
+        or a median of the given window_size,
+        depend on whether the smoothing_hint is True.
+
+        This provides a default behavior that other writers can use.
+
+        Note: All scalars saved in the past `window_size` iterations are used for smoothing.
+        This is different from the `window_size` definition in HistoryBuffer.
+        Use :meth:`get_history_window_size` to get the `window_size` used in HistoryBuffer.
+        """
+        result = {}
+        for k, (v, itr) in self._latest_scalars.items():
+            result[k] = (
+                self._history[k].median(self.count_samples(k, window_size))
+                if self._smoothing_hints[k]
+                else v,
+                itr,
+            )
+        return result
+
+    def count_samples(self, name, window_size=20):
+        """
+        Return the number of samples logged in the past `window_size` iterations.
+        """
+        samples = 0
+        data = self._history[name].values()
+        for _, iter_ in reversed(data):
+            if iter_ > data[-1][1] - window_size:
+                samples += 1
+            else:
+                break
+        return samples
+
+    def smoothing_hints(self):
+        """
+        Returns:
+            dict[name -> bool]: the user-provided hint on whether the scalar
+                is noisy and needs smoothing.
+        """
+        return self._smoothing_hints
+
+    def step(self):
+        """
+        User should either: (1) Call this function to increment storage.iter when needed. Or
+        (2) Set `storage.iter` to the correct iteration number before each iteration.
+
+        The storage will then be able to associate the new data with an iteration number.
+        """
+        self._iter += 1
+
+    @property
+    def iter(self):
+        """
+        Returns:
+            int: The current iteration number. When used together with a trainer,
+                this is ensured to be the same as trainer.iter.
+        """
+        return self._iter
+
+    @iter.setter
+    def iter(self, val):
+        self._iter = int(val)
+
+    @property
+    def iteration(self):
+        # for backward compatibility
+        return self._iter
+
+    def __enter__(self):
+        _CURRENT_STORAGE_STACK.append(self)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        assert _CURRENT_STORAGE_STACK[-1] == self
+        _CURRENT_STORAGE_STACK.pop()
+
+    @contextmanager
+    def name_scope(self, name):
+        """
+        Yields:
+            A context within which all the events added to this storage
+            will be prefixed by the name scope.
+        """
+        old_prefix = self._current_prefix
+        self._current_prefix = name.rstrip("/") + "/"
+        yield
+        self._current_prefix = old_prefix
+
+    def clear_images(self):
+        """
+        Delete all the stored images for visualization. This should be called
+        after images are written to tensorboard.
+        """
+        self._vis_data = []
+
+    def clear_histograms(self):
+        """
+        Delete all the stored histograms for visualization.
+        This should be called after histograms are written to tensorboard.
+        """
+        self._histograms = []
diff --git a/detectron2/utils/file_io.py b/detectron2/utils/file_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..09f7dffdb36199350bba57bd3b4e9e8babb40594
--- /dev/null
+++ b/detectron2/utils/file_io.py
@@ -0,0 +1,39 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from iopath.common.file_io import HTTPURLHandler, OneDrivePathHandler, PathHandler
+from iopath.common.file_io import PathManager as PathManagerBase
+
+__all__ = ["PathManager", "PathHandler"]
+
+
+PathManager = PathManagerBase()
+"""
+This is a detectron2 project-specific PathManager.
+We try to stay away from global PathManager in fvcore as it
+introduces potential conflicts among other libraries.
+"""
+
+
+class Detectron2Handler(PathHandler):
+    """
+    Resolve anything that's hosted under detectron2's namespace.
+    """
+
+    PREFIX = "detectron2://"
+    S3_DETECTRON2_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
+
+    def _get_supported_prefixes(self):
+        return [self.PREFIX]
+
+    def _get_local_path(self, path, **kwargs):
+        name = path[len(self.PREFIX) :]
+        return PathManager.get_local_path(self.S3_DETECTRON2_PREFIX + name, **kwargs)
+
+    def _open(self, path, mode="r", **kwargs):
+        return PathManager.open(
+            self.S3_DETECTRON2_PREFIX + path[len(self.PREFIX) :], mode, **kwargs
+        )
+
+
+PathManager.register_handler(HTTPURLHandler())
+PathManager.register_handler(OneDrivePathHandler())
+PathManager.register_handler(Detectron2Handler())
diff --git a/detectron2/utils/logger.py b/detectron2/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..85be03cb174a8802ff775842395fd30b4b5db61b
--- /dev/null
+++ b/detectron2/utils/logger.py
@@ -0,0 +1,261 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import atexit
+import functools
+import logging
+import os
+import sys
+import time
+from collections import Counter
+import torch
+from tabulate import tabulate
+from termcolor import colored
+
+from detectron2.utils.file_io import PathManager
+
+__all__ = ["setup_logger", "log_first_n", "log_every_n", "log_every_n_seconds"]
+
+D2_LOG_BUFFER_SIZE_KEY: str = "D2_LOG_BUFFER_SIZE"
+
+DEFAULT_LOG_BUFFER_SIZE: int = 1024 * 1024  # 1MB
+
+
+class _ColorfulFormatter(logging.Formatter):
+    def __init__(self, *args, **kwargs):
+        self._root_name = kwargs.pop("root_name") + "."
+        self._abbrev_name = kwargs.pop("abbrev_name", "")
+        if len(self._abbrev_name):
+            self._abbrev_name = self._abbrev_name + "."
+        super(_ColorfulFormatter, self).__init__(*args, **kwargs)
+
+    def formatMessage(self, record):
+        record.name = record.name.replace(self._root_name, self._abbrev_name)
+        log = super(_ColorfulFormatter, self).formatMessage(record)
+        if record.levelno == logging.WARNING:
+            prefix = colored("WARNING", "red", attrs=["blink"])
+        elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL:
+            prefix = colored("ERROR", "red", attrs=["blink", "underline"])
+        else:
+            return log
+        return prefix + " " + log
+
+
+@functools.lru_cache()  # so that calling setup_logger multiple times won't add many handlers
+def setup_logger(
+    output=None,
+    distributed_rank=0,
+    *,
+    color=True,
+    name="detectron2",
+    abbrev_name=None,
+    enable_propagation: bool = False,
+    configure_stdout: bool = True
+):
+    """
+    Initialize the detectron2 logger and set its verbosity level to "DEBUG".
+
+    Args:
+        output (str): a file name or a directory to save log. If None, will not save log file.
+            If ends with ".txt" or ".log", assumed to be a file name.
+            Otherwise, logs will be saved to `output/log.txt`.
+        name (str): the root module name of this logger
+        abbrev_name (str): an abbreviation of the module, to avoid long names in logs.
+            Set to "" to not log the root module in logs.
+            By default, will abbreviate "detectron2" to "d2" and leave other
+            modules unchanged.
+        enable_propagation (bool): whether to propagate logs to the parent logger.
+        configure_stdout (bool): whether to configure logging to stdout.
+
+
+    Returns:
+        logging.Logger: a logger
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    logger.propagate = enable_propagation
+
+    if abbrev_name is None:
+        abbrev_name = "d2" if name == "detectron2" else name
+
+    plain_formatter = logging.Formatter(
+        "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S"
+    )
+    # stdout logging: master only
+    if configure_stdout and distributed_rank == 0:
+        ch = logging.StreamHandler(stream=sys.stdout)
+        ch.setLevel(logging.DEBUG)
+        if color:
+            formatter = _ColorfulFormatter(
+                colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s",
+                datefmt="%m/%d %H:%M:%S",
+                root_name=name,
+                abbrev_name=str(abbrev_name),
+            )
+        else:
+            formatter = plain_formatter
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+
+    # file logging: all workers
+    if output is not None:
+        if output.endswith(".txt") or output.endswith(".log"):
+            filename = output
+        else:
+            filename = os.path.join(output, "log.txt")
+        if distributed_rank > 0:
+            filename = filename + ".rank{}".format(distributed_rank)
+        PathManager.mkdirs(os.path.dirname(filename))
+
+        fh = logging.StreamHandler(_cached_log_stream(filename))
+        fh.setLevel(logging.DEBUG)
+        fh.setFormatter(plain_formatter)
+        logger.addHandler(fh)
+
+    return logger
+
+
+# cache the opened file object, so that different calls to `setup_logger`
+# with the same file name can safely write to the same file.
+@functools.lru_cache(maxsize=None)
+def _cached_log_stream(filename):
+    # use 1K buffer if writing to cloud storage
+    io = PathManager.open(filename, "a", buffering=_get_log_stream_buffer_size(filename))
+    atexit.register(io.close)
+    return io
+
+
+def _get_log_stream_buffer_size(filename: str) -> int:
+    if "://" not in filename:
+        # Local file, no extra caching is necessary
+        return -1
+    # Remote file requires a larger cache to avoid many small writes.
+    if D2_LOG_BUFFER_SIZE_KEY in os.environ:
+        return int(os.environ[D2_LOG_BUFFER_SIZE_KEY])
+    return DEFAULT_LOG_BUFFER_SIZE
+
+
+"""
+Below are some other convenient logging methods.
+They are mainly adopted from
+https://github.com/abseil/abseil-py/blob/master/absl/logging/__init__.py
+"""
+
+
+def _find_caller():
+    """
+    Returns:
+        str: module name of the caller
+        tuple: a hashable key to be used to identify different callers
+    """
+    frame = sys._getframe(2)
+    while frame:
+        code = frame.f_code
+        if os.path.join("utils", "logger.") not in code.co_filename:
+            mod_name = frame.f_globals["__name__"]
+            if mod_name == "__main__":
+                mod_name = "detectron2"
+            return mod_name, (code.co_filename, frame.f_lineno, code.co_name)
+        frame = frame.f_back
+
+
+_LOG_COUNTER = Counter()
+_LOG_TIMER = {}
+
+
+def log_first_n(lvl, msg, n=1, *, name=None, key="caller"):
+    """
+    Log only for the first n times.
+
+    Args:
+        lvl (int): the logging level
+        msg (str):
+        n (int):
+        name (str): name of the logger to use. Will use the caller's module by default.
+        key (str or tuple[str]): the string(s) can be one of "caller" or
+            "message", which defines how to identify duplicated logs.
+            For example, if called with `n=1, key="caller"`, this function
+            will only log the first call from the same caller, regardless of
+            the message content.
+            If called with `n=1, key="message"`, this function will log the
+            same content only once, even if they are called from different places.
+            If called with `n=1, key=("caller", "message")`, this function
+            will not log only if the same caller has logged the same message before.
+    """
+    if isinstance(key, str):
+        key = (key,)
+    assert len(key) > 0
+
+    caller_module, caller_key = _find_caller()
+    hash_key = ()
+    if "caller" in key:
+        hash_key = hash_key + caller_key
+    if "message" in key:
+        hash_key = hash_key + (msg,)
+
+    _LOG_COUNTER[hash_key] += 1
+    if _LOG_COUNTER[hash_key] <= n:
+        logging.getLogger(name or caller_module).log(lvl, msg)
+
+
+def log_every_n(lvl, msg, n=1, *, name=None):
+    """
+    Log once per n times.
+
+    Args:
+        lvl (int): the logging level
+        msg (str):
+        n (int):
+        name (str): name of the logger to use. Will use the caller's module by default.
+    """
+    caller_module, key = _find_caller()
+    _LOG_COUNTER[key] += 1
+    if n == 1 or _LOG_COUNTER[key] % n == 1:
+        logging.getLogger(name or caller_module).log(lvl, msg)
+
+
+def log_every_n_seconds(lvl, msg, n=1, *, name=None):
+    """
+    Log no more than once per n seconds.
+
+    Args:
+        lvl (int): the logging level
+        msg (str):
+        n (int):
+        name (str): name of the logger to use. Will use the caller's module by default.
+    """
+    caller_module, key = _find_caller()
+    last_logged = _LOG_TIMER.get(key, None)
+    current_time = time.time()
+    if last_logged is None or current_time - last_logged >= n:
+        logging.getLogger(name or caller_module).log(lvl, msg)
+        _LOG_TIMER[key] = current_time
+
+
+def create_small_table(small_dict):
+    """
+    Create a small table using the keys of small_dict as headers. This is only
+    suitable for small dictionaries.
+
+    Args:
+        small_dict (dict): a result dictionary of only a few items.
+
+    Returns:
+        str: the table as a string.
+    """
+    keys, values = tuple(zip(*small_dict.items()))
+    table = tabulate(
+        [values],
+        headers=keys,
+        tablefmt="pipe",
+        floatfmt=".3f",
+        stralign="center",
+        numalign="center",
+    )
+    return table
+
+
+def _log_api_usage(identifier: str):
+    """
+    Internal function used to log the usage of different detectron2 components
+    inside facebook's infra.
+    """
+    torch._C._log_api_usage_once("detectron2." + identifier)
diff --git a/detectron2/utils/memory.py b/detectron2/utils/memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd494780b9dbbd1571688cd270bb9b53d113c13e
--- /dev/null
+++ b/detectron2/utils/memory.py
@@ -0,0 +1,84 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import logging
+from contextlib import contextmanager
+from functools import wraps
+import torch
+
+__all__ = ["retry_if_cuda_oom"]
+
+
+@contextmanager
+def _ignore_torch_cuda_oom():
+    """
+    A context which ignores CUDA OOM exception from pytorch.
+    """
+    try:
+        yield
+    except RuntimeError as e:
+        # NOTE: the string may change?
+        if "CUDA out of memory. " in str(e):
+            pass
+        else:
+            raise
+
+
+def retry_if_cuda_oom(func):
+    """
+    Makes a function retry itself after encountering
+    pytorch's CUDA OOM error.
+    It will first retry after calling `torch.cuda.empty_cache()`.
+
+    If that still fails, it will then retry by trying to convert inputs to CPUs.
+    In this case, it expects the function to dispatch to CPU implementation.
+    The return values may become CPU tensors as well and it's user's
+    responsibility to convert it back to CUDA tensor if needed.
+
+    Args:
+        func: a stateless callable that takes tensor-like objects as arguments
+
+    Returns:
+        a callable which retries `func` if OOM is encountered.
+
+    Examples:
+    ::
+        output = retry_if_cuda_oom(some_torch_function)(input1, input2)
+        # output may be on CPU even if inputs are on GPU
+
+    Note:
+        1. When converting inputs to CPU, it will only look at each argument and check
+           if it has `.device` and `.to` for conversion. Nested structures of tensors
+           are not supported.
+
+        2. Since the function might be called more than once, it has to be
+           stateless.
+    """
+
+    def maybe_to_cpu(x):
+        try:
+            like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to")
+        except AttributeError:
+            like_gpu_tensor = False
+        if like_gpu_tensor:
+            return x.to(device="cpu")
+        else:
+            return x
+
+    @wraps(func)
+    def wrapped(*args, **kwargs):
+        with _ignore_torch_cuda_oom():
+            return func(*args, **kwargs)
+
+        # Clear cache and retry
+        torch.cuda.empty_cache()
+        with _ignore_torch_cuda_oom():
+            return func(*args, **kwargs)
+
+        # Try on CPU. This slows down the code significantly, therefore print a notice.
+        logger = logging.getLogger(__name__)
+        logger.info("Attempting to copy inputs of {} to CPU due to CUDA OOM".format(str(func)))
+        new_args = (maybe_to_cpu(x) for x in args)
+        new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()}
+        return func(*new_args, **new_kwargs)
+
+    return wrapped
diff --git a/detectron2/utils/registry.py b/detectron2/utils/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b01e9007c2578a7b5ae555c926cc06c8a3010f9
--- /dev/null
+++ b/detectron2/utils/registry.py
@@ -0,0 +1,60 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from typing import Any
+import pydoc
+from fvcore.common.registry import Registry  # for backward compatibility.
+
+"""
+``Registry`` and `locate` provide ways to map a string (typically found
+in config files) to callable objects.
+"""
+
+__all__ = ["Registry", "locate"]
+
+
+def _convert_target_to_string(t: Any) -> str:
+    """
+    Inverse of ``locate()``.
+
+    Args:
+        t: any object with ``__module__`` and ``__qualname__``
+    """
+    module, qualname = t.__module__, t.__qualname__
+
+    # Compress the path to this object, e.g. ``module.submodule._impl.class``
+    # may become ``module.submodule.class``, if the later also resolves to the same
+    # object. This simplifies the string, and also is less affected by moving the
+    # class implementation.
+    module_parts = module.split(".")
+    for k in range(1, len(module_parts)):
+        prefix = ".".join(module_parts[:k])
+        candidate = f"{prefix}.{qualname}"
+        try:
+            if locate(candidate) is t:
+                return candidate
+        except ImportError:
+            pass
+    return f"{module}.{qualname}"
+
+
+def locate(name: str) -> Any:
+    """
+    Locate and return an object ``x`` using an input string ``{x.__module__}.{x.__qualname__}``,
+    such as "module.submodule.class_name".
+
+    Raise Exception if it cannot be found.
+    """
+    obj = pydoc.locate(name)
+
+    # Some cases (e.g. torch.optim.sgd.SGD) not handled correctly
+    # by pydoc.locate. Try a private function from hydra.
+    if obj is None:
+        try:
+            # from hydra.utils import get_method - will print many errors
+            from hydra.utils import _locate
+        except ImportError as e:
+            raise ImportError(f"Cannot dynamically locate object {name}!") from e
+        else:
+            obj = _locate(name)  # it raises if fails
+
+    return obj
diff --git a/detectron2/utils/serialize.py b/detectron2/utils/serialize.py
new file mode 100644
index 0000000000000000000000000000000000000000..611903d287c6ccd4195f391bfb134ac2a7b5ddec
--- /dev/null
+++ b/detectron2/utils/serialize.py
@@ -0,0 +1,32 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import cloudpickle
+
+
+class PicklableWrapper:
+    """
+    Wrap an object to make it more picklable, note that it uses
+    heavy weight serialization libraries that are slower than pickle.
+    It's best to use it only on closures (which are usually not picklable).
+
+    This is a simplified version of
+    https://github.com/joblib/joblib/blob/master/joblib/externals/loky/cloudpickle_wrapper.py
+    """
+
+    def __init__(self, obj):
+        while isinstance(obj, PicklableWrapper):
+            # Wrapping an object twice is no-op
+            obj = obj._obj
+        self._obj = obj
+
+    def __reduce__(self):
+        s = cloudpickle.dumps(self._obj)
+        return cloudpickle.loads, (s,)
+
+    def __call__(self, *args, **kwargs):
+        return self._obj(*args, **kwargs)
+
+    def __getattr__(self, attr):
+        # Ensure that the wrapped object can be used seamlessly as the previous object.
+        if attr not in ["_obj"]:
+            return getattr(self._obj, attr)
+        return getattr(self, attr)
diff --git a/detectron2/utils/testing.py b/detectron2/utils/testing.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f5b9dbe4438e1f5c6976b45bafed8966aee2dd9
--- /dev/null
+++ b/detectron2/utils/testing.py
@@ -0,0 +1,478 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import io
+import numpy as np
+import os
+import re
+import tempfile
+import unittest
+from typing import Callable
+import torch
+import torch.onnx.symbolic_helper as sym_help
+from packaging import version
+from torch._C import ListType
+from torch.onnx import register_custom_op_symbolic
+
+from detectron2 import model_zoo
+from detectron2.config import CfgNode, LazyConfig, instantiate
+from detectron2.data import DatasetCatalog
+from detectron2.data.detection_utils import read_image
+from detectron2.modeling import build_model
+from detectron2.structures import Boxes, Instances, ROIMasks
+from detectron2.utils.file_io import PathManager
+
+
+"""
+Internal utilities for tests. Don't use except for writing tests.
+"""
+
+
+def get_model_no_weights(config_path):
+    """
+    Like model_zoo.get, but do not load any weights (even pretrained)
+    """
+    cfg = model_zoo.get_config(config_path)
+    if isinstance(cfg, CfgNode):
+        if not torch.cuda.is_available():
+            cfg.MODEL.DEVICE = "cpu"
+        return build_model(cfg)
+    else:
+        return instantiate(cfg.model)
+
+
+def random_boxes(num_boxes, max_coord=100, device="cpu"):
+    """
+    Create a random Nx4 boxes tensor, with coordinates < max_coord.
+    """
+    boxes = torch.rand(num_boxes, 4, device=device) * (max_coord * 0.5)
+    boxes.clamp_(min=1.0)  # tiny boxes cause numerical instability in box regression
+    # Note: the implementation of this function in torchvision is:
+    # boxes[:, 2:] += torch.rand(N, 2) * 100
+    # but it does not guarantee non-negative widths/heights constraints:
+    # boxes[:, 2] >= boxes[:, 0] and boxes[:, 3] >= boxes[:, 1]:
+    boxes[:, 2:] += boxes[:, :2]
+    return boxes
+
+
+def get_sample_coco_image(tensor=True):
+    """
+    Args:
+        tensor (bool): if True, returns 3xHxW tensor.
+            else, returns a HxWx3 numpy array.
+
+    Returns:
+        an image, in BGR color.
+    """
+    try:
+        file_name = DatasetCatalog.get("coco_2017_val_100")[0]["file_name"]
+        if not PathManager.exists(file_name):
+            raise FileNotFoundError()
+    except IOError:
+        # for public CI to run
+        file_name = PathManager.get_local_path(
+            "http://images.cocodataset.org/train2017/000000000009.jpg"
+        )
+    ret = read_image(file_name, format="BGR")
+    if tensor:
+        ret = torch.from_numpy(np.ascontiguousarray(ret.transpose(2, 0, 1)))
+    return ret
+
+
+def convert_scripted_instances(instances):
+    """
+    Convert a scripted Instances object to a regular :class:`Instances` object
+    """
+    assert hasattr(
+        instances, "image_size"
+    ), f"Expect an Instances object, but got {type(instances)}!"
+    ret = Instances(instances.image_size)
+    for name in instances._field_names:
+        val = getattr(instances, "_" + name, None)
+        if val is not None:
+            ret.set(name, val)
+    return ret
+
+
+def assert_instances_allclose(input, other, *, rtol=1e-5, msg="", size_as_tensor=False):
+    """
+    Args:
+        input, other (Instances):
+        size_as_tensor: compare image_size of the Instances as tensors (instead of tuples).
+             Useful for comparing outputs of tracing.
+    """
+    if not isinstance(input, Instances):
+        input = convert_scripted_instances(input)
+    if not isinstance(other, Instances):
+        other = convert_scripted_instances(other)
+
+    if not msg:
+        msg = "Two Instances are different! "
+    else:
+        msg = msg.rstrip() + " "
+
+    size_error_msg = msg + f"image_size is {input.image_size} vs. {other.image_size}!"
+    if size_as_tensor:
+        assert torch.equal(
+            torch.tensor(input.image_size), torch.tensor(other.image_size)
+        ), size_error_msg
+    else:
+        assert input.image_size == other.image_size, size_error_msg
+    fields = sorted(input.get_fields().keys())
+    fields_other = sorted(other.get_fields().keys())
+    assert fields == fields_other, msg + f"Fields are {fields} vs {fields_other}!"
+
+    for f in fields:
+        val1, val2 = input.get(f), other.get(f)
+        if isinstance(val1, (Boxes, ROIMasks)):
+            # boxes in the range of O(100) and can have a larger tolerance
+            assert torch.allclose(val1.tensor, val2.tensor, atol=100 * rtol), (
+                msg + f"Field {f} differs too much!"
+            )
+        elif isinstance(val1, torch.Tensor):
+            if val1.dtype.is_floating_point:
+                mag = torch.abs(val1).max().cpu().item()
+                assert torch.allclose(val1, val2, atol=mag * rtol), (
+                    msg + f"Field {f} differs too much!"
+                )
+            else:
+                assert torch.equal(val1, val2), msg + f"Field {f} is different!"
+        else:
+            raise ValueError(f"Don't know how to compare type {type(val1)}")
+
+
+def reload_script_model(module):
+    """
+    Save a jit module and load it back.
+    Similar to the `getExportImportCopy` function in torch/testing/
+    """
+    buffer = io.BytesIO()
+    torch.jit.save(module, buffer)
+    buffer.seek(0)
+    return torch.jit.load(buffer)
+
+
+def reload_lazy_config(cfg):
+    """
+    Save an object by LazyConfig.save and load it back.
+    This is used to test that a config still works the same after
+    serialization/deserialization.
+    """
+    with tempfile.TemporaryDirectory(prefix="detectron2") as d:
+        fname = os.path.join(d, "d2_cfg_test.yaml")
+        LazyConfig.save(cfg, fname)
+        return LazyConfig.load(fname)
+
+
+def min_torch_version(min_version: str) -> bool:
+    """
+    Returns True when torch's  version is at least `min_version`.
+    """
+    try:
+        import torch
+    except ImportError:
+        return False
+
+    installed_version = version.parse(torch.__version__.split("+")[0])
+    min_version = version.parse(min_version)
+    return installed_version >= min_version
+
+
+def has_dynamic_axes(onnx_model):
+    """
+    Return True when all ONNX input/output have only dynamic axes for all ranks
+    """
+    return all(
+        not dim.dim_param.isnumeric()
+        for inp in onnx_model.graph.input
+        for dim in inp.type.tensor_type.shape.dim
+    ) and all(
+        not dim.dim_param.isnumeric()
+        for out in onnx_model.graph.output
+        for dim in out.type.tensor_type.shape.dim
+    )
+
+
+def register_custom_op_onnx_export(
+    opname: str, symbolic_fn: Callable, opset_version: int, min_version: str
+) -> None:
+    """
+    Register `symbolic_fn` as PyTorch's symbolic `opname`-`opset_version` for ONNX export.
+    The registration is performed only when current PyTorch's version is < `min_version.`
+    IMPORTANT: symbolic must be manually unregistered after the caller function returns
+    """
+    if min_torch_version(min_version):
+        return
+    register_custom_op_symbolic(opname, symbolic_fn, opset_version)
+    print(f"_register_custom_op_onnx_export({opname}, {opset_version}) succeeded.")
+
+
+def unregister_custom_op_onnx_export(opname: str, opset_version: int, min_version: str) -> None:
+    """
+    Unregister PyTorch's symbolic `opname`-`opset_version` for ONNX export.
+    The un-registration is performed only when PyTorch's version is < `min_version`
+    IMPORTANT: The symbolic must have been manually registered by the caller, otherwise
+               the incorrect symbolic may be unregistered instead.
+    """
+
+    # TODO: _unregister_custom_op_symbolic is introduced PyTorch>=1.10
+    #       Remove after PyTorch 1.10+ is used by ALL detectron2's CI
+    try:
+        from torch.onnx import unregister_custom_op_symbolic as _unregister_custom_op_symbolic
+    except ImportError:
+
+        def _unregister_custom_op_symbolic(symbolic_name, opset_version):
+            import torch.onnx.symbolic_registry as sym_registry
+            from torch.onnx.symbolic_helper import _onnx_main_opset, _onnx_stable_opsets
+
+            def _get_ns_op_name_from_custom_op(symbolic_name):
+                try:
+                    from torch.onnx.utils import get_ns_op_name_from_custom_op
+
+                    ns, op_name = get_ns_op_name_from_custom_op(symbolic_name)
+                except ImportError as import_error:
+                    if not bool(
+                        re.match(r"^[a-zA-Z0-9-_]*::[a-zA-Z-_]+[a-zA-Z0-9-_]*$", symbolic_name)
+                    ):
+                        raise ValueError(
+                            f"Invalid symbolic name {symbolic_name}. Must be `domain::name`"
+                        ) from import_error
+
+                    ns, op_name = symbolic_name.split("::")
+                    if ns == "onnx":
+                        raise ValueError(f"{ns} domain cannot be modified.") from import_error
+
+                    if ns == "aten":
+                        ns = ""
+
+                return ns, op_name
+
+            def _unregister_op(opname: str, domain: str, version: int):
+                try:
+                    sym_registry.unregister_op(op_name, ns, ver)
+                except AttributeError as attribute_error:
+                    if sym_registry.is_registered_op(opname, domain, version):
+                        del sym_registry._registry[(domain, version)][opname]
+                        if not sym_registry._registry[(domain, version)]:
+                            del sym_registry._registry[(domain, version)]
+                    else:
+                        raise RuntimeError(
+                            f"The opname {opname} is not registered."
+                        ) from attribute_error
+
+            ns, op_name = _get_ns_op_name_from_custom_op(symbolic_name)
+            for ver in _onnx_stable_opsets + [_onnx_main_opset]:
+                if ver >= opset_version:
+                    _unregister_op(op_name, ns, ver)
+
+    if min_torch_version(min_version):
+        return
+    _unregister_custom_op_symbolic(opname, opset_version)
+    print(f"_unregister_custom_op_onnx_export({opname}, {opset_version}) succeeded.")
+
+
+skipIfOnCPUCI = unittest.skipIf(
+    os.environ.get("CI") and not torch.cuda.is_available(),
+    "The test is too slow on CPUs and will be executed on CircleCI's GPU jobs.",
+)
+
+
+def skipIfUnsupportedMinOpsetVersion(min_opset_version, current_opset_version=None):
+    """
+    Skips tests for ONNX Opset versions older than min_opset_version.
+    """
+
+    def skip_dec(func):
+        def wrapper(self):
+            try:
+                opset_version = self.opset_version
+            except AttributeError:
+                opset_version = current_opset_version
+            if opset_version < min_opset_version:
+                raise unittest.SkipTest(
+                    f"Unsupported opset_version {opset_version}"
+                    f", required is {min_opset_version}"
+                )
+            return func(self)
+
+        return wrapper
+
+    return skip_dec
+
+
+def skipIfUnsupportedMinTorchVersion(min_version):
+    """
+    Skips tests for PyTorch versions older than min_version.
+    """
+    reason = f"module 'torch' has __version__ {torch.__version__}" f", required is: {min_version}"
+    return unittest.skipIf(not min_torch_version(min_version), reason)
+
+
+# TODO: Remove after PyTorch 1.11.1+ is used by detectron2's CI
+def _pytorch1111_symbolic_opset9_to(g, self, *args):
+    """aten::to() symbolic that must be used for testing with PyTorch < 1.11.1."""
+
+    def is_aten_to_device_only(args):
+        if len(args) == 4:
+            # aten::to(Tensor, Device, bool, bool, memory_format)
+            return (
+                args[0].node().kind() == "prim::device"
+                or args[0].type().isSubtypeOf(ListType.ofInts())
+                or (
+                    sym_help._is_value(args[0])
+                    and args[0].node().kind() == "onnx::Constant"
+                    and isinstance(args[0].node()["value"], str)
+                )
+            )
+        elif len(args) == 5:
+            # aten::to(Tensor, Device, ScalarType, bool, bool, memory_format)
+            # When dtype is None, this is a aten::to(device) call
+            dtype = sym_help._get_const(args[1], "i", "dtype")
+            return dtype is None
+        elif len(args) in (6, 7):
+            # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, memory_format)
+            # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, bool, memory_format)
+            # When dtype is None, this is a aten::to(device) call
+            dtype = sym_help._get_const(args[0], "i", "dtype")
+            return dtype is None
+        return False
+
+    # ONNX doesn't have a concept of a device, so we ignore device-only casts
+    if is_aten_to_device_only(args):
+        return self
+
+    if len(args) == 4:
+        # TestONNXRuntime::test_ones_bool shows args[0] of aten::to can be onnx::Constant[Tensor]
+        # In this case, the constant value is a tensor not int,
+        # so sym_help._maybe_get_const(args[0], 'i') would not work.
+        dtype = args[0]
+        if sym_help._is_value(args[0]) and args[0].node().kind() == "onnx::Constant":
+            tval = args[0].node()["value"]
+            if isinstance(tval, torch.Tensor):
+                if len(tval.shape) == 0:
+                    tval = tval.item()
+                    dtype = int(tval)
+                else:
+                    dtype = tval
+
+        if sym_help._is_value(dtype) or isinstance(dtype, torch.Tensor):
+            # aten::to(Tensor, Tensor, bool, bool, memory_format)
+            dtype = args[0].type().scalarType()
+            return g.op("Cast", self, to_i=sym_help.cast_pytorch_to_onnx[dtype])
+        else:
+            # aten::to(Tensor, ScalarType, bool, bool, memory_format)
+            # memory_format is ignored
+            return g.op("Cast", self, to_i=sym_help.scalar_type_to_onnx[dtype])
+    elif len(args) == 5:
+        # aten::to(Tensor, Device, ScalarType, bool, bool, memory_format)
+        dtype = sym_help._get_const(args[1], "i", "dtype")
+        # memory_format is ignored
+        return g.op("Cast", self, to_i=sym_help.scalar_type_to_onnx[dtype])
+    elif len(args) == 6:
+        # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, memory_format)
+        dtype = sym_help._get_const(args[0], "i", "dtype")
+        # Layout, device and memory_format are ignored
+        return g.op("Cast", self, to_i=sym_help.scalar_type_to_onnx[dtype])
+    elif len(args) == 7:
+        # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, bool, memory_format)
+        dtype = sym_help._get_const(args[0], "i", "dtype")
+        # Layout, device and memory_format are ignored
+        return g.op("Cast", self, to_i=sym_help.scalar_type_to_onnx[dtype])
+    else:
+        return sym_help._onnx_unsupported("Unknown aten::to signature")
+
+
+# TODO: Remove after PyTorch 1.11.1+ is used by detectron2's CI
+def _pytorch1111_symbolic_opset9_repeat_interleave(g, self, repeats, dim=None, output_size=None):
+
+    # from torch.onnx.symbolic_helper import ScalarType
+    from torch.onnx.symbolic_opset9 import expand, unsqueeze
+
+    input = self
+    # if dim is None flatten
+    # By default, use the flattened input array, and return a flat output array
+    if sym_help._is_none(dim):
+        input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1])))
+        dim = 0
+    else:
+        dim = sym_help._maybe_get_scalar(dim)
+
+    repeats_dim = sym_help._get_tensor_rank(repeats)
+    repeats_sizes = sym_help._get_tensor_sizes(repeats)
+    input_sizes = sym_help._get_tensor_sizes(input)
+    if repeats_dim is None:
+        raise RuntimeError(
+            "Unsupported: ONNX export of repeat_interleave for unknown " "repeats rank."
+        )
+    if repeats_sizes is None:
+        raise RuntimeError(
+            "Unsupported: ONNX export of repeat_interleave for unknown " "repeats size."
+        )
+    if input_sizes is None:
+        raise RuntimeError(
+            "Unsupported: ONNX export of repeat_interleave for unknown " "input size."
+        )
+
+    input_sizes_temp = input_sizes.copy()
+    for idx, input_size in enumerate(input_sizes):
+        if input_size is None:
+            input_sizes[idx], input_sizes_temp[idx] = 0, -1
+
+    # Cases where repeats is an int or single value tensor
+    if repeats_dim == 0 or (repeats_dim == 1 and repeats_sizes[0] == 1):
+        if not sym_help._is_tensor(repeats):
+            repeats = g.op("Constant", value_t=torch.LongTensor(repeats))
+        if input_sizes[dim] == 0:
+            return sym_help._onnx_opset_unsupported_detailed(
+                "repeat_interleave",
+                9,
+                13,
+                "Unsupported along dimension with unknown input size",
+            )
+        else:
+            reps = input_sizes[dim]
+            repeats = expand(g, repeats, g.op("Constant", value_t=torch.tensor([reps])), None)
+
+    # Cases where repeats is a 1 dim Tensor
+    elif repeats_dim == 1:
+        if input_sizes[dim] == 0:
+            return sym_help._onnx_opset_unsupported_detailed(
+                "repeat_interleave",
+                9,
+                13,
+                "Unsupported along dimension with unknown input size",
+            )
+        if repeats_sizes[0] is None:
+            return sym_help._onnx_opset_unsupported_detailed(
+                "repeat_interleave", 9, 13, "Unsupported for cases with dynamic repeats"
+            )
+        assert (
+            repeats_sizes[0] == input_sizes[dim]
+        ), "repeats must have the same size as input along dim"
+        reps = repeats_sizes[0]
+    else:
+        raise RuntimeError("repeats must be 0-dim or 1-dim tensor")
+
+    final_splits = list()
+    r_splits = sym_help._repeat_interleave_split_helper(g, repeats, reps, 0)
+    if isinstance(r_splits, torch._C.Value):
+        r_splits = [r_splits]
+    i_splits = sym_help._repeat_interleave_split_helper(g, input, reps, dim)
+    if isinstance(i_splits, torch._C.Value):
+        i_splits = [i_splits]
+    input_sizes[dim], input_sizes_temp[dim] = -1, 1
+    for idx, r_split in enumerate(r_splits):
+        i_split = unsqueeze(g, i_splits[idx], dim + 1)
+        r_concat = [
+            g.op("Constant", value_t=torch.LongTensor(input_sizes_temp[: dim + 1])),
+            r_split,
+            g.op("Constant", value_t=torch.LongTensor(input_sizes_temp[dim + 1 :])),
+        ]
+        r_concat = g.op("Concat", *r_concat, axis_i=0)
+        i_split = expand(g, i_split, r_concat, None)
+        i_split = sym_help._reshape_helper(
+            g,
+            i_split,
+            g.op("Constant", value_t=torch.LongTensor(input_sizes)),
+            allowzero=0,
+        )
+        final_splits.append(i_split)
+    return g.op("Concat", *final_splits, axis_i=dim)
diff --git a/detectron2/utils/tracing.py b/detectron2/utils/tracing.py
new file mode 100644
index 0000000000000000000000000000000000000000..002c0a3ee4892be23dfbcd103028d99911a32461
--- /dev/null
+++ b/detectron2/utils/tracing.py
@@ -0,0 +1,73 @@
+import inspect
+import torch
+
+from detectron2.utils.env import TORCH_VERSION
+
+try:
+    from torch.fx._symbolic_trace import is_fx_tracing as is_fx_tracing_current
+
+    tracing_current_exists = True
+except ImportError:
+    tracing_current_exists = False
+
+try:
+    from torch.fx._symbolic_trace import _orig_module_call
+
+    tracing_legacy_exists = True
+except ImportError:
+    tracing_legacy_exists = False
+
+
+@torch.jit.ignore
+def is_fx_tracing_legacy() -> bool:
+    """
+    Returns a bool indicating whether torch.fx is currently symbolically tracing a module.
+    Can be useful for gating module logic that is incompatible with symbolic tracing.
+    """
+    return torch.nn.Module.__call__ is not _orig_module_call
+
+
+def is_fx_tracing() -> bool:
+    """Returns whether execution is currently in
+    Torch FX tracing mode"""
+    if torch.jit.is_scripting():
+        return False
+    if TORCH_VERSION >= (1, 10) and tracing_current_exists:
+        return is_fx_tracing_current()
+    elif tracing_legacy_exists:
+        return is_fx_tracing_legacy()
+    else:
+        # Can't find either current or legacy tracing indication code.
+        # Enabling this assert_fx_safe() call regardless of tracing status.
+        return False
+
+
+def assert_fx_safe(condition: bool, message: str) -> torch.Tensor:
+    """An FX-tracing safe version of assert.
+    Avoids erroneous type assertion triggering when types are masked inside
+    an fx.proxy.Proxy object during tracing.
+    Args: condition - either a boolean expression or a string representing
+    the condition to test. If this assert triggers an exception when tracing
+    due to dynamic control flow, try encasing the expression in quotation
+    marks and supplying it as a string."""
+    # Must return a concrete tensor for compatibility with PyTorch <=1.8.
+    # If <=1.8 compatibility is not needed, return type can be converted to None
+    if torch.jit.is_scripting() or is_fx_tracing():
+        return torch.zeros(1)
+    return _do_assert_fx_safe(condition, message)
+
+
+def _do_assert_fx_safe(condition: bool, message: str) -> torch.Tensor:
+    try:
+        if isinstance(condition, str):
+            caller_frame = inspect.currentframe().f_back
+            torch._assert(eval(condition, caller_frame.f_globals, caller_frame.f_locals), message)
+            return torch.ones(1)
+        else:
+            torch._assert(condition, message)
+            return torch.ones(1)
+    except torch.fx.proxy.TraceError as e:
+        print(
+            "Found a non-FX compatible assertion. Skipping the check. Failure is shown below"
+            + str(e)
+        )
diff --git a/detectron2/utils/video_visualizer.py b/detectron2/utils/video_visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..42685be53bb09bab8420b1bcd4d63d8dc6ba7cab
--- /dev/null
+++ b/detectron2/utils/video_visualizer.py
@@ -0,0 +1,287 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+from typing import List
+import pycocotools.mask as mask_util
+
+from detectron2.structures import Instances
+from detectron2.utils.visualizer import (
+    ColorMode,
+    Visualizer,
+    _create_text_labels,
+    _PanopticPrediction,
+)
+
+from .colormap import random_color, random_colors
+
+
+class _DetectedInstance:
+    """
+    Used to store data about detected objects in video frame,
+    in order to transfer color to objects in the future frames.
+
+    Attributes:
+        label (int):
+        bbox (tuple[float]):
+        mask_rle (dict):
+        color (tuple[float]): RGB colors in range (0, 1)
+        ttl (int): time-to-live for the instance. For example, if ttl=2,
+            the instance color can be transferred to objects in the next two frames.
+    """
+
+    __slots__ = ["label", "bbox", "mask_rle", "color", "ttl"]
+
+    def __init__(self, label, bbox, mask_rle, color, ttl):
+        self.label = label
+        self.bbox = bbox
+        self.mask_rle = mask_rle
+        self.color = color
+        self.ttl = ttl
+
+
+class VideoVisualizer:
+    def __init__(self, metadata, instance_mode=ColorMode.IMAGE):
+        """
+        Args:
+            metadata (MetadataCatalog): image metadata.
+        """
+        self.metadata = metadata
+        self._old_instances = []
+        assert instance_mode in [
+            ColorMode.IMAGE,
+            ColorMode.IMAGE_BW,
+        ], "Other mode not supported yet."
+        self._instance_mode = instance_mode
+        self._max_num_instances = self.metadata.get("max_num_instances", 74)
+        self._assigned_colors = {}
+        self._color_pool = random_colors(self._max_num_instances, rgb=True, maximum=1)
+        self._color_idx_set = set(range(len(self._color_pool)))
+
+    def draw_instance_predictions(self, frame, predictions):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            frame (ndarray): an RGB image of shape (H, W, C), in the range [0, 255].
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        frame_visualizer = Visualizer(frame, self.metadata)
+        num_instances = len(predictions)
+        if num_instances == 0:
+            return frame_visualizer.output
+
+        boxes = predictions.pred_boxes.tensor.numpy() if predictions.has("pred_boxes") else None
+        scores = predictions.scores if predictions.has("scores") else None
+        classes = predictions.pred_classes.numpy() if predictions.has("pred_classes") else None
+        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
+        colors = predictions.COLOR if predictions.has("COLOR") else [None] * len(predictions)
+        periods = predictions.ID_period if predictions.has("ID_period") else None
+        period_threshold = self.metadata.get("period_threshold", 0)
+        visibilities = (
+            [True] * len(predictions)
+            if periods is None
+            else [x > period_threshold for x in periods]
+        )
+
+        if predictions.has("pred_masks"):
+            masks = predictions.pred_masks
+            # mask IOU is not yet enabled
+            # masks_rles = mask_util.encode(np.asarray(masks.permute(1, 2, 0), order="F"))
+            # assert len(masks_rles) == num_instances
+        else:
+            masks = None
+
+        if not predictions.has("COLOR"):
+            if predictions.has("ID"):
+                colors = self._assign_colors_by_id(predictions)
+            else:
+                # ToDo: clean old assign color method and use a default tracker to assign id
+                detected = [
+                    _DetectedInstance(classes[i], boxes[i], mask_rle=None, color=colors[i], ttl=8)
+                    for i in range(num_instances)
+                ]
+                colors = self._assign_colors(detected)
+
+        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            # any() returns uint8 tensor
+            frame_visualizer.output.reset_image(
+                frame_visualizer._create_grayscale_image(
+                    (masks.any(dim=0) > 0).numpy() if masks is not None else None
+                )
+            )
+            alpha = 0.3
+        else:
+            alpha = 0.5
+
+        labels = (
+            None
+            if labels is None
+            else [y[0] for y in filter(lambda x: x[1], zip(labels, visibilities))]
+        )  # noqa
+        assigned_colors = (
+            None
+            if colors is None
+            else [y[0] for y in filter(lambda x: x[1], zip(colors, visibilities))]
+        )  # noqa
+        frame_visualizer.overlay_instances(
+            boxes=None if masks is not None else boxes[visibilities],  # boxes are a bit distracting
+            masks=None if masks is None else masks[visibilities],
+            labels=labels,
+            keypoints=None if keypoints is None else keypoints[visibilities],
+            assigned_colors=assigned_colors,
+            alpha=alpha,
+        )
+
+        return frame_visualizer.output
+
+    def draw_sem_seg(self, frame, sem_seg, area_threshold=None):
+        """
+        Args:
+            sem_seg (ndarray or Tensor): semantic segmentation of shape (H, W),
+                each value is the integer label.
+            area_threshold (Optional[int]): only draw segmentations larger than the threshold
+        """
+        # don't need to do anything special
+        frame_visualizer = Visualizer(frame, self.metadata)
+        frame_visualizer.draw_sem_seg(sem_seg, area_threshold=None)
+        return frame_visualizer.output
+
+    def draw_panoptic_seg_predictions(
+        self, frame, panoptic_seg, segments_info, area_threshold=None, alpha=0.5
+    ):
+        frame_visualizer = Visualizer(frame, self.metadata)
+        pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            frame_visualizer.output.reset_image(
+                frame_visualizer._create_grayscale_image(pred.non_empty_mask())
+            )
+
+        # draw mask for all semantic segments first i.e. "stuff"
+        for mask, sinfo in pred.semantic_masks():
+            category_idx = sinfo["category_id"]
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
+            except AttributeError:
+                mask_color = None
+
+            frame_visualizer.draw_binary_mask(
+                mask,
+                color=mask_color,
+                text=self.metadata.stuff_classes[category_idx],
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+
+        all_instances = list(pred.instance_masks())
+        if len(all_instances) == 0:
+            return frame_visualizer.output
+        # draw mask for all instances second
+        masks, sinfo = list(zip(*all_instances))
+        num_instances = len(masks)
+        masks_rles = mask_util.encode(
+            np.asarray(np.asarray(masks).transpose(1, 2, 0), dtype=np.uint8, order="F")
+        )
+        assert len(masks_rles) == num_instances
+
+        category_ids = [x["category_id"] for x in sinfo]
+        detected = [
+            _DetectedInstance(category_ids[i], bbox=None, mask_rle=masks_rles[i], color=None, ttl=8)
+            for i in range(num_instances)
+        ]
+        colors = self._assign_colors(detected)
+        labels = [self.metadata.thing_classes[k] for k in category_ids]
+
+        frame_visualizer.overlay_instances(
+            boxes=None,
+            masks=masks,
+            labels=labels,
+            keypoints=None,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return frame_visualizer.output
+
+    def _assign_colors(self, instances):
+        """
+        Naive tracking heuristics to assign same color to the same instance,
+        will update the internal state of tracked instances.
+
+        Returns:
+            list[tuple[float]]: list of colors.
+        """
+
+        # Compute iou with either boxes or masks:
+        is_crowd = np.zeros((len(instances),), dtype=bool)
+        if instances[0].bbox is None:
+            assert instances[0].mask_rle is not None
+            # use mask iou only when box iou is None
+            # because box seems good enough
+            rles_old = [x.mask_rle for x in self._old_instances]
+            rles_new = [x.mask_rle for x in instances]
+            ious = mask_util.iou(rles_old, rles_new, is_crowd)
+            threshold = 0.5
+        else:
+            boxes_old = [x.bbox for x in self._old_instances]
+            boxes_new = [x.bbox for x in instances]
+            ious = mask_util.iou(boxes_old, boxes_new, is_crowd)
+            threshold = 0.6
+        if len(ious) == 0:
+            ious = np.zeros((len(self._old_instances), len(instances)), dtype="float32")
+
+        # Only allow matching instances of the same label:
+        for old_idx, old in enumerate(self._old_instances):
+            for new_idx, new in enumerate(instances):
+                if old.label != new.label:
+                    ious[old_idx, new_idx] = 0
+
+        matched_new_per_old = np.asarray(ious).argmax(axis=1)
+        max_iou_per_old = np.asarray(ious).max(axis=1)
+
+        # Try to find match for each old instance:
+        extra_instances = []
+        for idx, inst in enumerate(self._old_instances):
+            if max_iou_per_old[idx] > threshold:
+                newidx = matched_new_per_old[idx]
+                if instances[newidx].color is None:
+                    instances[newidx].color = inst.color
+                    continue
+            # If an old instance does not match any new instances,
+            # keep it for the next frame in case it is just missed by the detector
+            inst.ttl -= 1
+            if inst.ttl > 0:
+                extra_instances.append(inst)
+
+        # Assign random color to newly-detected instances:
+        for inst in instances:
+            if inst.color is None:
+                inst.color = random_color(rgb=True, maximum=1)
+        self._old_instances = instances[:] + extra_instances
+        return [d.color for d in instances]
+
+    def _assign_colors_by_id(self, instances: Instances) -> List:
+        colors = []
+        untracked_ids = set(self._assigned_colors.keys())
+        for id in instances.ID:
+            if id in self._assigned_colors:
+                colors.append(self._color_pool[self._assigned_colors[id]])
+                untracked_ids.remove(id)
+            else:
+                assert (
+                    len(self._color_idx_set) >= 1
+                ), f"Number of id exceeded maximum, \
+                    max = {self._max_num_instances}"
+                idx = self._color_idx_set.pop()
+                color = self._color_pool[idx]
+                self._assigned_colors[id] = idx
+                colors.append(color)
+        for id in untracked_ids:
+            self._color_idx_set.add(self._assigned_colors[id])
+            del self._assigned_colors[id]
+        return colors
diff --git a/detectron2/utils/visualizer.py b/detectron2/utils/visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d2cc1762d9b7c018b1f2cb32481485594d1d397
--- /dev/null
+++ b/detectron2/utils/visualizer.py
@@ -0,0 +1,1267 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import colorsys
+import logging
+import math
+import numpy as np
+from enum import Enum, unique
+import cv2
+import matplotlib as mpl
+import matplotlib.colors as mplc
+import matplotlib.figure as mplfigure
+import pycocotools.mask as mask_util
+import torch
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from PIL import Image
+
+from detectron2.data import MetadataCatalog
+from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
+from detectron2.utils.file_io import PathManager
+
+from .colormap import random_color
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["ColorMode", "VisImage", "Visualizer"]
+
+
+_SMALL_OBJECT_AREA_THRESH = 1000
+_LARGE_MASK_AREA_THRESH = 120000
+_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
+_BLACK = (0, 0, 0)
+_RED = (1.0, 0, 0)
+
+_KEYPOINT_THRESHOLD = 0.05
+
+
+@unique
+class ColorMode(Enum):
+    """
+    Enum of different color modes to use for instance visualizations.
+    """
+
+    IMAGE = 0
+    """
+    Picks a random color for every instance and overlay segmentations with low opacity.
+    """
+    SEGMENTATION = 1
+    """
+    Let instances of the same category have similar colors
+    (from metadata.thing_colors), and overlay them with
+    high opacity. This provides more attention on the quality of segmentation.
+    """
+    IMAGE_BW = 2
+    """
+    Same as IMAGE, but convert all areas without masks to gray-scale.
+    Only available for drawing per-instance mask predictions.
+    """
+
+
+class GenericMask:
+    """
+    Attribute:
+        polygons (list[ndarray]): list[ndarray]: polygons for this mask.
+            Each ndarray has format [x, y, x, y, ...]
+        mask (ndarray): a binary mask
+    """
+
+    def __init__(self, mask_or_polygons, height, width):
+        self._mask = self._polygons = self._has_holes = None
+        self.height = height
+        self.width = width
+
+        m = mask_or_polygons
+        if isinstance(m, dict):
+            # RLEs
+            assert "counts" in m and "size" in m
+            if isinstance(m["counts"], list):  # uncompressed RLEs
+                h, w = m["size"]
+                assert h == height and w == width
+                m = mask_util.frPyObjects(m, h, w)
+            self._mask = mask_util.decode(m)[:, :]
+            return
+
+        if isinstance(m, list):  # list[ndarray]
+            self._polygons = [np.asarray(x).reshape(-1) for x in m]
+            return
+
+        if isinstance(m, np.ndarray):  # assumed to be a binary mask
+            assert m.shape[1] != 2, m.shape
+            assert m.shape == (
+                height,
+                width,
+            ), f"mask shape: {m.shape}, target dims: {height}, {width}"
+            self._mask = m.astype("uint8")
+            return
+
+        raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
+
+    @property
+    def mask(self):
+        if self._mask is None:
+            self._mask = self.polygons_to_mask(self._polygons)
+        return self._mask
+
+    @property
+    def polygons(self):
+        if self._polygons is None:
+            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+        return self._polygons
+
+    @property
+    def has_holes(self):
+        if self._has_holes is None:
+            if self._mask is not None:
+                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+            else:
+                self._has_holes = False  # if original format is polygon, does not have holes
+        return self._has_holes
+
+    def mask_to_polygons(self, mask):
+        # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
+        # hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
+        # Internal contours (holes) are placed in hierarchy-2.
+        # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
+        mask = np.ascontiguousarray(mask)  # some versions of cv2 does not support incontiguous arr
+        res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
+        hierarchy = res[-1]
+        if hierarchy is None:  # empty mask
+            return [], False
+        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
+        res = res[-2]
+        res = [x.flatten() for x in res]
+        # These coordinates from OpenCV are integers in range [0, W-1 or H-1].
+        # We add 0.5 to turn them into real-value coordinate space. A better solution
+        # would be to first +0.5 and then dilate the returned polygon by 0.5.
+        res = [x + 0.5 for x in res if len(x) >= 6]
+        return res, has_holes
+
+    def polygons_to_mask(self, polygons):
+        rle = mask_util.frPyObjects(polygons, self.height, self.width)
+        rle = mask_util.merge(rle)
+        return mask_util.decode(rle)[:, :]
+
+    def area(self):
+        return self.mask.sum()
+
+    def bbox(self):
+        p = mask_util.frPyObjects(self.polygons, self.height, self.width)
+        p = mask_util.merge(p)
+        bbox = mask_util.toBbox(p)
+        bbox[2] += bbox[0]
+        bbox[3] += bbox[1]
+        return bbox
+
+
+class _PanopticPrediction:
+    """
+    Unify different panoptic annotation/prediction formats
+    """
+
+    def __init__(self, panoptic_seg, segments_info, metadata=None):
+        if segments_info is None:
+            assert metadata is not None
+            # If "segments_info" is None, we assume "panoptic_img" is a
+            # H*W int32 image storing the panoptic_id in the format of
+            # category_id * label_divisor + instance_id. We reserve -1 for
+            # VOID label.
+            label_divisor = metadata.label_divisor
+            segments_info = []
+            for panoptic_label in np.unique(panoptic_seg.numpy()):
+                if panoptic_label == -1:
+                    # VOID region.
+                    continue
+                pred_class = panoptic_label // label_divisor
+                isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
+                segments_info.append(
+                    {
+                        "id": int(panoptic_label),
+                        "category_id": int(pred_class),
+                        "isthing": bool(isthing),
+                    }
+                )
+        del metadata
+
+        self._seg = panoptic_seg
+
+        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
+        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
+        areas = areas.numpy()
+        sorted_idxs = np.argsort(-areas)
+        self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
+        self._seg_ids = self._seg_ids.tolist()
+        for sid, area in zip(self._seg_ids, self._seg_areas):
+            if sid in self._sinfo:
+                self._sinfo[sid]["area"] = float(area)
+
+    def non_empty_mask(self):
+        """
+        Returns:
+            (H, W) array, a mask for all pixels that have a prediction
+        """
+        empty_ids = []
+        for id in self._seg_ids:
+            if id not in self._sinfo:
+                empty_ids.append(id)
+        if len(empty_ids) == 0:
+            return np.zeros(self._seg.shape, dtype=np.uint8)
+        assert (
+            len(empty_ids) == 1
+        ), ">1 ids corresponds to no labels. This is currently not supported"
+        return (self._seg != empty_ids[0]).numpy().astype(bool)
+
+    def semantic_masks(self):
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or sinfo["isthing"]:
+                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
+                continue
+            yield (self._seg == sid).numpy().astype(bool), sinfo
+
+    def instance_masks(self):
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or not sinfo["isthing"]:
+                continue
+            mask = (self._seg == sid).numpy().astype(bool)
+            if mask.sum() > 0:
+                yield mask, sinfo
+
+
+def _create_text_labels(classes, scores, class_names, is_crowd=None):
+    """
+    Args:
+        classes (list[int] or None):
+        scores (list[float] or None):
+        class_names (list[str] or None):
+        is_crowd (list[bool] or None):
+
+    Returns:
+        list[str] or None
+    """
+    labels = None
+    if classes is not None:
+        if class_names is not None and len(class_names) > 0:
+            labels = [class_names[i] for i in classes]
+        else:
+            labels = [str(i) for i in classes]
+    if scores is not None:
+        if labels is None:
+            labels = ["{:.0f}%".format(s * 100) for s in scores]
+        else:
+            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
+    if labels is not None and is_crowd is not None:
+        labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
+    return labels
+
+
+class VisImage:
+    def __init__(self, img, scale=1.0):
+        """
+        Args:
+            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
+            scale (float): scale the input image
+        """
+        self.img = img
+        self.scale = scale
+        self.width, self.height = img.shape[1], img.shape[0]
+        self._setup_figure(img)
+
+    def _setup_figure(self, img):
+        """
+        Args:
+            Same as in :meth:`__init__()`.
+
+        Returns:
+            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
+            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
+        """
+        fig = mplfigure.Figure(frameon=False)
+        self.dpi = fig.get_dpi()
+        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
+        # (https://github.com/matplotlib/matplotlib/issues/15363)
+        fig.set_size_inches(
+            (self.width * self.scale + 1e-2) / self.dpi,
+            (self.height * self.scale + 1e-2) / self.dpi,
+        )
+        self.canvas = FigureCanvasAgg(fig)
+        # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
+        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
+        ax.axis("off")
+        self.fig = fig
+        self.ax = ax
+        self.reset_image(img)
+
+    def reset_image(self, img):
+        """
+        Args:
+            img: same as in __init__
+        """
+        img = img.astype("uint8")
+        self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
+
+    def save(self, filepath):
+        """
+        Args:
+            filepath (str): a string that contains the absolute path, including the file name, where
+                the visualized image will be saved.
+        """
+        self.fig.savefig(filepath)
+
+    def get_image(self):
+        """
+        Returns:
+            ndarray:
+                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
+                The shape is scaled w.r.t the input image using the given `scale` argument.
+        """
+        canvas = self.canvas
+        s, (width, height) = canvas.print_to_buffer()
+        # buf = io.BytesIO()  # works for cairo backend
+        # canvas.print_rgba(buf)
+        # width, height = self.width, self.height
+        # s = buf.getvalue()
+
+        buffer = np.frombuffer(s, dtype="uint8")
+
+        img_rgba = buffer.reshape(height, width, 4)
+        rgb, alpha = np.split(img_rgba, [3], axis=2)
+        return rgb.astype("uint8")
+
+
+class Visualizer:
+    """
+    Visualizer that draws data about detection/segmentation on images.
+
+    It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
+    that draw primitive objects to images, as well as high-level wrappers like
+    `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
+    that draw composite data in some pre-defined style.
+
+    Note that the exact visualization style for the high-level wrappers are subject to change.
+    Style such as color, opacity, label contents, visibility of labels, or even the visibility
+    of objects themselves (e.g. when the object is too small) may change according
+    to different heuristics, as long as the results still look visually reasonable.
+
+    To obtain a consistent style, you can implement custom drawing functions with the
+    abovementioned primitive methods instead. If you need more customized visualization
+    styles, you can process the data yourself following their format documented in
+    tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
+    intend to satisfy everyone's preference on drawing styles.
+
+    This visualizer focuses on high rendering quality rather than performance. It is not
+    designed to be used for real-time applications.
+    """
+
+    # TODO implement a fast, rasterized version using OpenCV
+
+    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
+        """
+        Args:
+            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
+                the height and width of the image respectively. C is the number of
+                color channels. The image is required to be in RGB format since that
+                is a requirement of the Matplotlib library. The image is also expected
+                to be in the range [0, 255].
+            metadata (Metadata): dataset metadata (e.g. class names and colors)
+            instance_mode (ColorMode): defines one of the pre-defined style for drawing
+                instances on an image.
+        """
+        self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
+        if metadata is None:
+            metadata = MetadataCatalog.get("__nonexist__")
+        self.metadata = metadata
+        self.output = VisImage(self.img, scale=scale)
+        self.cpu_device = torch.device("cpu")
+
+        # too small texts are useless, therefore clamp to 9
+        self._default_font_size = max(
+            np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
+        )
+        self._instance_mode = instance_mode
+        self.keypoint_threshold = _KEYPOINT_THRESHOLD
+
+    def draw_instance_predictions(self, predictions):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
+        scores = predictions.scores if predictions.has("scores") else None
+        classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
+        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
+        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
+
+        if predictions.has("pred_masks"):
+            masks = np.asarray(predictions.pred_masks)
+            masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
+        else:
+            masks = None
+
+        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
+            ]
+            alpha = 0.8
+        else:
+            colors = None
+            alpha = 0.5
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.reset_image(
+                self._create_grayscale_image(
+                    (predictions.pred_masks.any(dim=0) > 0).numpy()
+                    if predictions.has("pred_masks")
+                    else None
+                )
+            )
+            alpha = 0.3
+
+        self.overlay_instances(
+            masks=masks,
+            boxes=boxes,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return self.output
+
+    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
+        """
+        Draw semantic segmentation predictions/labels.
+
+        Args:
+            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
+                Each value is the integer label of the pixel.
+            area_threshold (int): segments with less than `area_threshold` are not drawn.
+            alpha (float): the larger it is, the more opaque the segmentations are.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        if isinstance(sem_seg, torch.Tensor):
+            sem_seg = sem_seg.numpy()
+        labels, areas = np.unique(sem_seg, return_counts=True)
+        sorted_idxs = np.argsort(-areas).tolist()
+        labels = labels[sorted_idxs]
+        for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
+            except (AttributeError, IndexError):
+                mask_color = None
+
+            binary_mask = (sem_seg == label).astype(np.uint8)
+            text = self.metadata.stuff_classes[label]
+            self.draw_binary_mask(
+                binary_mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+        return self.output
+
+    def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7):
+        """
+        Draw panoptic prediction annotations or results.
+
+        Args:
+            panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
+                segment.
+            segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
+                If it is a ``list[dict]``, each dict contains keys "id", "category_id".
+                If None, category id of each pixel is computed by
+                ``pixel // metadata.label_divisor``.
+            area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))
+
+        # draw mask for all semantic segments first i.e. "stuff"
+        for mask, sinfo in pred.semantic_masks():
+            category_idx = sinfo["category_id"]
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
+            except AttributeError:
+                mask_color = None
+
+            text = self.metadata.stuff_classes[category_idx]
+            self.draw_binary_mask(
+                mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+
+        # draw mask for all instances second
+        all_instances = list(pred.instance_masks())
+        if len(all_instances) == 0:
+            return self.output
+        masks, sinfo = list(zip(*all_instances))
+        category_ids = [x["category_id"] for x in sinfo]
+
+        try:
+            scores = [x["score"] for x in sinfo]
+        except KeyError:
+            scores = None
+        labels = _create_text_labels(
+            category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo]
+        )
+
+        try:
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids
+            ]
+        except AttributeError:
+            colors = None
+        self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
+
+        return self.output
+
+    draw_panoptic_seg_predictions = draw_panoptic_seg  # backward compatibility
+
+    def draw_dataset_dict(self, dic):
+        """
+        Draw annotations/segmentations in Detectron2 Dataset format.
+
+        Args:
+            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        annos = dic.get("annotations", None)
+        if annos:
+            if "segmentation" in annos[0]:
+                masks = [x["segmentation"] for x in annos]
+            else:
+                masks = None
+            if "keypoints" in annos[0]:
+                keypts = [x["keypoints"] for x in annos]
+                keypts = np.array(keypts).reshape(len(annos), -1, 3)
+            else:
+                keypts = None
+
+            boxes = [
+                BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
+                if len(x["bbox"]) == 4
+                else x["bbox"]
+                for x in annos
+            ]
+
+            colors = None
+            category_ids = [x["category_id"] for x in annos]
+            if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+                colors = [
+                    self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
+                    for c in category_ids
+                ]
+            names = self.metadata.get("thing_classes", None)
+            labels = _create_text_labels(
+                category_ids,
+                scores=None,
+                class_names=names,
+                is_crowd=[x.get("iscrowd", 0) for x in annos],
+            )
+            self.overlay_instances(
+                labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
+            )
+
+        sem_seg = dic.get("sem_seg", None)
+        if sem_seg is None and "sem_seg_file_name" in dic:
+            with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
+                sem_seg = Image.open(f)
+                sem_seg = np.asarray(sem_seg, dtype="uint8")
+        if sem_seg is not None:
+            self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
+
+        pan_seg = dic.get("pan_seg", None)
+        if pan_seg is None and "pan_seg_file_name" in dic:
+            with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
+                pan_seg = Image.open(f)
+                pan_seg = np.asarray(pan_seg)
+                from panopticapi.utils import rgb2id
+
+                pan_seg = rgb2id(pan_seg)
+        if pan_seg is not None:
+            segments_info = dic["segments_info"]
+            pan_seg = torch.tensor(pan_seg)
+            self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5)
+        return self.output
+
+    def overlay_instances(
+        self,
+        *,
+        boxes=None,
+        labels=None,
+        masks=None,
+        keypoints=None,
+        assigned_colors=None,
+        alpha=0.5,
+    ):
+        """
+        Args:
+            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
+                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
+                or a :class:`RotatedBoxes`,
+                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image,
+            labels (list[str]): the text to be displayed for each instance.
+            masks (masks-like object): Supported types are:
+
+                * :class:`detectron2.structures.PolygonMasks`,
+                  :class:`detectron2.structures.BitMasks`.
+                * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
+                  The first level of the list corresponds to individual instances. The second
+                  level to all the polygon that compose the instance, and the third level
+                  to the polygon coordinates. The third level should have the format of
+                  [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
+                * list[ndarray]: each ndarray is a binary mask of shape (H, W).
+                * list[dict]: each dict is a COCO-style RLE.
+            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
+                where the N is the number of instances and K is the number of keypoints.
+                The last dimension corresponds to (x, y, visibility or score).
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = 0
+        if boxes is not None:
+            boxes = self._convert_boxes(boxes)
+            num_instances = len(boxes)
+        if masks is not None:
+            masks = self._convert_masks(masks)
+            if num_instances:
+                assert len(masks) == num_instances
+            else:
+                num_instances = len(masks)
+        if keypoints is not None:
+            if num_instances:
+                assert len(keypoints) == num_instances
+            else:
+                num_instances = len(keypoints)
+            keypoints = self._convert_keypoints(keypoints)
+        if labels is not None:
+            assert len(labels) == num_instances
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+        if boxes is not None and boxes.shape[1] == 5:
+            return self.overlay_rotated_instances(
+                boxes=boxes, labels=labels, assigned_colors=assigned_colors
+            )
+
+        # Display in largest to smallest order to reduce occlusion.
+        areas = None
+        if boxes is not None:
+            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+        elif masks is not None:
+            areas = np.asarray([x.area() for x in masks])
+
+        if areas is not None:
+            sorted_idxs = np.argsort(-areas).tolist()
+            # Re-order overlapped instances in descending order.
+            boxes = boxes[sorted_idxs] if boxes is not None else None
+            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
+            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+            keypoints = keypoints[sorted_idxs] if keypoints is not None else None
+
+        for i in range(num_instances):
+            color = assigned_colors[i]
+            if boxes is not None:
+                self.draw_box(boxes[i], edge_color=color)
+
+            if masks is not None:
+                for segment in masks[i].polygons:
+                    self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
+
+            if labels is not None:
+                # first get a box
+                if boxes is not None:
+                    x0, y0, x1, y1 = boxes[i]
+                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
+                    horiz_align = "left"
+                elif masks is not None:
+                    # skip small mask without polygon
+                    if len(masks[i].polygons) == 0:
+                        continue
+
+                    x0, y0, x1, y1 = masks[i].bbox()
+
+                    # draw text in the center (defined by median) when box is not drawn
+                    # median is less sensitive to outliers.
+                    text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
+                    horiz_align = "center"
+                else:
+                    continue  # drawing the box confidence for keypoints isn't very useful.
+                # for small objects, draw text at the side to avoid occlusion
+                instance_area = (y1 - y0) * (x1 - x0)
+                if (
+                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
+                    or y1 - y0 < 40 * self.output.scale
+                ):
+                    if y1 >= self.output.height - 5:
+                        text_pos = (x1, y0)
+                    else:
+                        text_pos = (x0, y1)
+
+                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                font_size = (
+                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
+                    * 0.5
+                    * self._default_font_size
+                )
+                self.draw_text(
+                    labels[i],
+                    text_pos,
+                    color=lighter_color,
+                    horizontal_alignment=horiz_align,
+                    font_size=font_size,
+                )
+
+        # draw keypoints
+        if keypoints is not None:
+            for keypoints_per_instance in keypoints:
+                self.draw_and_connect_keypoints(keypoints_per_instance)
+
+        return self.output
+
+    def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
+        """
+        Args:
+            boxes (ndarray): an Nx5 numpy array of
+                (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image.
+            labels (list[str]): the text to be displayed for each instance.
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = len(boxes)
+
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+
+        # Display in largest to smallest order to reduce occlusion.
+        if boxes is not None:
+            areas = boxes[:, 2] * boxes[:, 3]
+
+        sorted_idxs = np.argsort(-areas).tolist()
+        # Re-order overlapped instances in descending order.
+        boxes = boxes[sorted_idxs]
+        labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+        colors = [assigned_colors[idx] for idx in sorted_idxs]
+
+        for i in range(num_instances):
+            self.draw_rotated_box_with_label(
+                boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
+            )
+
+        return self.output
+
+    def draw_and_connect_keypoints(self, keypoints):
+        """
+        Draws keypoints of an instance and follows the rules for keypoint connections
+        to draw lines between appropriate keypoints. This follows color heuristics for
+        line color.
+
+        Args:
+            keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
+                and the last dimension corresponds to (x, y, probability).
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        visible = {}
+        keypoint_names = self.metadata.get("keypoint_names")
+        for idx, keypoint in enumerate(keypoints):
+
+            # draw keypoint
+            x, y, prob = keypoint
+            if prob > self.keypoint_threshold:
+                self.draw_circle((x, y), color=_RED)
+                if keypoint_names:
+                    keypoint_name = keypoint_names[idx]
+                    visible[keypoint_name] = (x, y)
+
+        if self.metadata.get("keypoint_connection_rules"):
+            for kp0, kp1, color in self.metadata.keypoint_connection_rules:
+                if kp0 in visible and kp1 in visible:
+                    x0, y0 = visible[kp0]
+                    x1, y1 = visible[kp1]
+                    color = tuple(x / 255.0 for x in color)
+                    self.draw_line([x0, x1], [y0, y1], color=color)
+
+        # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
+        # Note that this strategy is specific to person keypoints.
+        # For other keypoints, it should just do nothing
+        try:
+            ls_x, ls_y = visible["left_shoulder"]
+            rs_x, rs_y = visible["right_shoulder"]
+            mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
+        except KeyError:
+            pass
+        else:
+            # draw line from nose to mid-shoulder
+            nose_x, nose_y = visible.get("nose", (None, None))
+            if nose_x is not None:
+                self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
+
+            try:
+                # draw line from mid-shoulder to mid-hip
+                lh_x, lh_y = visible["left_hip"]
+                rh_x, rh_y = visible["right_hip"]
+            except KeyError:
+                pass
+            else:
+                mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
+                self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
+        return self.output
+
+    """
+    Primitive drawing functions:
+    """
+
+    def draw_text(
+        self,
+        text,
+        position,
+        *,
+        font_size=None,
+        color="g",
+        horizontal_alignment="center",
+        rotation=0,
+    ):
+        """
+        Args:
+            text (str): class label
+            position (tuple): a tuple of the x and y coordinates to place text on image.
+            font_size (int, optional): font of the text. If not provided, a font size
+                proportional to the image width is calculated and used.
+            color: color of the text. Refer to `matplotlib.colors` for full list
+                of formats that are accepted.
+            horizontal_alignment (str): see `matplotlib.text.Text`
+            rotation: rotation angle in degrees CCW
+
+        Returns:
+            output (VisImage): image object with text drawn.
+        """
+        if not font_size:
+            font_size = self._default_font_size
+
+        # since the text background is dark, we don't want the text to be dark
+        color = np.maximum(list(mplc.to_rgb(color)), 0.2)
+        color[np.argmax(color)] = max(0.8, np.max(color))
+
+        x, y = position
+        self.output.ax.text(
+            x,
+            y,
+            text,
+            size=font_size * self.output.scale,
+            family="sans-serif",
+            bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
+            verticalalignment="top",
+            horizontalalignment=horizontal_alignment,
+            color=color,
+            zorder=10,
+            rotation=rotation,
+        )
+        return self.output
+
+    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
+        """
+        Args:
+            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
+                are the coordinates of the image's top left corner. x1 and y1 are the
+                coordinates of the image's bottom right corner.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x0, y0, x1, y1 = box_coord
+        width = x1 - x0
+        height = y1 - y0
+
+        linewidth = max(self._default_font_size / 4, 1)
+
+        self.output.ax.add_patch(
+            mpl.patches.Rectangle(
+                (x0, y0),
+                width,
+                height,
+                fill=False,
+                edgecolor=edge_color,
+                linewidth=linewidth * self.output.scale,
+                alpha=alpha,
+                linestyle=line_style,
+            )
+        )
+        return self.output
+
+    def draw_rotated_box_with_label(
+        self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
+    ):
+        """
+        Draw a rotated box with label on its top-left corner.
+
+        Args:
+            rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
+                where cnt_x and cnt_y are the center coordinates of the box.
+                w and h are the width and height of the box. angle represents how
+                many degrees the box is rotated CCW with regard to the 0-degree box.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+            label (string): label for rotated box. It will not be rendered when set to None.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        cnt_x, cnt_y, w, h, angle = rotated_box
+        area = w * h
+        # use thinner lines when the box is small
+        linewidth = self._default_font_size / (
+            6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
+        )
+
+        theta = angle * math.pi / 180.0
+        c = math.cos(theta)
+        s = math.sin(theta)
+        rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
+        # x: left->right ; y: top->down
+        rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
+        for k in range(4):
+            j = (k + 1) % 4
+            self.draw_line(
+                [rotated_rect[k][0], rotated_rect[j][0]],
+                [rotated_rect[k][1], rotated_rect[j][1]],
+                color=edge_color,
+                linestyle="--" if k == 1 else line_style,
+                linewidth=linewidth,
+            )
+
+        if label is not None:
+            text_pos = rotated_rect[1]  # topleft corner
+
+            height_ratio = h / np.sqrt(self.output.height * self.output.width)
+            label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
+            font_size = (
+                np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
+            )
+            self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
+
+        return self.output
+
+    def draw_circle(self, circle_coord, color, radius=3):
+        """
+        Args:
+            circle_coord (list(int) or tuple(int)): contains the x and y coordinates
+                of the center of the circle.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            radius (int): radius of the circle.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x, y = circle_coord
+        self.output.ax.add_patch(
+            mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
+        )
+        return self.output
+
+    def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
+        """
+        Args:
+            x_data (list[int]): a list containing x values of all the points being drawn.
+                Length of list should match the length of y_data.
+            y_data (list[int]): a list containing y values of all the points being drawn.
+                Length of list should match the length of x_data.
+            color: color of the line. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
+                for a full list of formats that are accepted.
+            linewidth (float or None): width of the line. When it's None,
+                a default value will be computed and used.
+
+        Returns:
+            output (VisImage): image object with line drawn.
+        """
+        if linewidth is None:
+            linewidth = self._default_font_size / 3
+        linewidth = max(linewidth, 1)
+        self.output.ax.add_line(
+            mpl.lines.Line2D(
+                x_data,
+                y_data,
+                linewidth=linewidth * self.output.scale,
+                color=color,
+                linestyle=linestyle,
+            )
+        )
+        return self.output
+
+    def draw_binary_mask(
+        self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=10
+    ):
+        """
+        Args:
+            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
+                W is the image width. Each value in the array is either a 0 or 1 value of uint8
+                type.
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, will pick a random color.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted.
+            text (str): if None, will be drawn on the object
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            area_threshold (float): a connected component smaller than this area will not be shown.
+
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        if color is None:
+            color = random_color(rgb=True, maximum=1)
+        color = mplc.to_rgb(color)
+
+        has_valid_segment = False
+        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
+        mask = GenericMask(binary_mask, self.output.height, self.output.width)
+        shape2d = (binary_mask.shape[0], binary_mask.shape[1])
+
+        if not mask.has_holes:
+            # draw polygons for regular masks
+            for segment in mask.polygons:
+                area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
+                if area < (area_threshold or 0):
+                    continue
+                has_valid_segment = True
+                segment = segment.reshape(-1, 2)
+                self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
+        else:
+            # TODO: Use Path/PathPatch to draw vector graphics:
+            # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
+            rgba = np.zeros(shape2d + (4,), dtype="float32")
+            rgba[:, :, :3] = color
+            rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
+            has_valid_segment = True
+            self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
+
+        if text is not None and has_valid_segment:
+            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+            self._draw_text_in_mask(binary_mask, text, lighter_color)
+        return self.output
+
+    def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5):
+        """
+        Args:
+            soft_mask (ndarray): float array of shape (H, W), each value in [0, 1].
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, will pick a random color.
+            text (str): if None, will be drawn on the object
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        if color is None:
+            color = random_color(rgb=True, maximum=1)
+        color = mplc.to_rgb(color)
+
+        shape2d = (soft_mask.shape[0], soft_mask.shape[1])
+        rgba = np.zeros(shape2d + (4,), dtype="float32")
+        rgba[:, :, :3] = color
+        rgba[:, :, 3] = soft_mask * alpha
+        self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
+
+        if text is not None:
+            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+            binary_mask = (soft_mask > 0.5).astype("uint8")
+            self._draw_text_in_mask(binary_mask, text, lighter_color)
+        return self.output
+
+    def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
+        """
+        Args:
+            segment: numpy array of shape Nx2, containing all the points in the polygon.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted. If not provided, a darker shade
+                of the polygon color will be used instead.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+
+        Returns:
+            output (VisImage): image object with polygon drawn.
+        """
+        if edge_color is None:
+            # make edge color darker than the polygon color
+            if alpha > 0.8:
+                edge_color = self._change_color_brightness(color, brightness_factor=-0.7)
+            else:
+                edge_color = color
+        edge_color = mplc.to_rgb(edge_color) + (1,)
+
+        polygon = mpl.patches.Polygon(
+            segment,
+            fill=True,
+            facecolor=mplc.to_rgb(color) + (alpha,),
+            edgecolor=edge_color,
+            linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
+        )
+        self.output.ax.add_patch(polygon)
+        return self.output
+
+    """
+    Internal methods:
+    """
+
+    def _jitter(self, color):
+        """
+        Randomly modifies given color to produce a slightly different color than the color given.
+
+        Args:
+            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
+                picked. The values in the list are in the [0.0, 1.0] range.
+
+        Returns:
+            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
+                color after being jittered. The values in the list are in the [0.0, 1.0] range.
+        """
+        color = mplc.to_rgb(color)
+        vec = np.random.rand(3)
+        # better to do it in another color space
+        vec = vec / np.linalg.norm(vec) * 0.5
+        res = np.clip(vec + color, 0, 1)
+        return tuple(res)
+
+    def _create_grayscale_image(self, mask=None):
+        """
+        Create a grayscale version of the original image.
+        The colors in masked area, if given, will be kept.
+        """
+        img_bw = self.img.astype("f4").mean(axis=2)
+        img_bw = np.stack([img_bw] * 3, axis=2)
+        if mask is not None:
+            img_bw[mask] = self.img[mask]
+        return img_bw
+
+    def _change_color_brightness(self, color, brightness_factor):
+        """
+        Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
+        less or more saturation than the original color.
+
+        Args:
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
+                0 will correspond to no change, a factor in [-1.0, 0) range will result in
+                a darker color and a factor in (0, 1.0] range will result in a lighter color.
+
+        Returns:
+            modified_color (tuple[double]): a tuple containing the RGB values of the
+                modified color. Each value in the tuple is in the [0.0, 1.0] range.
+        """
+        assert brightness_factor >= -1.0 and brightness_factor <= 1.0
+        color = mplc.to_rgb(color)
+        polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
+        modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
+        modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
+        modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
+        modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
+        return tuple(np.clip(modified_color, 0.0, 1.0))
+
+    def _convert_boxes(self, boxes):
+        """
+        Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
+        """
+        if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
+            return boxes.tensor.detach().numpy()
+        else:
+            return np.asarray(boxes)
+
+    def _convert_masks(self, masks_or_polygons):
+        """
+        Convert different format of masks or polygons to a tuple of masks and polygons.
+
+        Returns:
+            list[GenericMask]:
+        """
+
+        m = masks_or_polygons
+        if isinstance(m, PolygonMasks):
+            m = m.polygons
+        if isinstance(m, BitMasks):
+            m = m.tensor.numpy()
+        if isinstance(m, torch.Tensor):
+            m = m.numpy()
+        ret = []
+        for x in m:
+            if isinstance(x, GenericMask):
+                ret.append(x)
+            else:
+                ret.append(GenericMask(x, self.output.height, self.output.width))
+        return ret
+
+    def _draw_text_in_mask(self, binary_mask, text, color):
+        """
+        Find proper places to draw text given a binary mask.
+        """
+        # TODO sometimes drawn on wrong objects. the heuristics here can improve.
+        _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
+        if stats[1:, -1].size == 0:
+            return
+        largest_component_id = np.argmax(stats[1:, -1]) + 1
+
+        # draw text on the largest component, as well as other very large components.
+        for cid in range(1, _num_cc):
+            if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
+                # median is more stable than centroid
+                # center = centroids[largest_component_id]
+                center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
+                self.draw_text(text, center, color=color)
+
+    def _convert_keypoints(self, keypoints):
+        if isinstance(keypoints, Keypoints):
+            keypoints = keypoints.tensor
+        keypoints = np.asarray(keypoints)
+        return keypoints
+
+    def get_output(self):
+        """
+        Returns:
+            output (VisImage): the image output containing the visualizations added
+            to the image.
+        """
+        return self.output
diff --git a/example/cloth/04469_00.jpg b/example/cloth/04469_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..88f4a5bfa02a8013e2a0bae9681610fc60998b4b
Binary files /dev/null and b/example/cloth/04469_00.jpg differ
diff --git a/example/cloth/04479_00.jpg b/example/cloth/04479_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9a8ba2fa7b91476b73a94afd2edb283b5c501568
Binary files /dev/null and b/example/cloth/04479_00.jpg differ
diff --git a/example/cloth/04743_00.jpg b/example/cloth/04743_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fd848a2cbf5152d94656a78614b31bb070dc12f6
Binary files /dev/null and b/example/cloth/04743_00.jpg differ
diff --git a/example/cloth/09133_00.jpg b/example/cloth/09133_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8646d603bd3a9d9448b1020a0e75254da5ea3262
Binary files /dev/null and b/example/cloth/09133_00.jpg differ
diff --git a/example/cloth/09134_00.jpg b/example/cloth/09134_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7c1c86040bdd711f8a9a530124e33c32da48acfb
Binary files /dev/null and b/example/cloth/09134_00.jpg differ
diff --git a/example/cloth/09163_00.jpg b/example/cloth/09163_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ed3210e26927fbfb941e6a2ceb73868535f6bc7f
Binary files /dev/null and b/example/cloth/09163_00.jpg differ
diff --git a/example/cloth/09164_00.jpg b/example/cloth/09164_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..84f6706c5b6dbe1dacc90d055bc97f10b0081938
Binary files /dev/null and b/example/cloth/09164_00.jpg differ
diff --git a/example/cloth/09166_00.jpg b/example/cloth/09166_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d96e82ad42fc1e9cf57ff4bd809da6a7dc92e13b
Binary files /dev/null and b/example/cloth/09166_00.jpg differ
diff --git a/example/cloth/09176_00.jpg b/example/cloth/09176_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..41f0bd6e86369f8cda5f795c7a1fd3a968c362a1
Binary files /dev/null and b/example/cloth/09176_00.jpg differ
diff --git a/example/cloth/09236_00.jpg b/example/cloth/09236_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..26fcc92e1c49adf3130119a01bfc06b181c1e38d
Binary files /dev/null and b/example/cloth/09236_00.jpg differ
diff --git a/example/cloth/09256_00.jpg b/example/cloth/09256_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e2d7f917ec4c866d284a0d948975b701e1b191d6
Binary files /dev/null and b/example/cloth/09256_00.jpg differ
diff --git a/example/cloth/09263_00.jpg b/example/cloth/09263_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4f8591390f45e351f640f396442a47c4237f9293
Binary files /dev/null and b/example/cloth/09263_00.jpg differ
diff --git a/example/cloth/09266_00.jpg b/example/cloth/09266_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..19183263ca66e2e710352832791172df7964ff5a
Binary files /dev/null and b/example/cloth/09266_00.jpg differ
diff --git a/example/cloth/09290_00.jpg b/example/cloth/09290_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1955f9ec628e5235aba424b4640475d59af68545
Binary files /dev/null and b/example/cloth/09290_00.jpg differ
diff --git a/example/cloth/09305_00.jpg b/example/cloth/09305_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4773e38a617fa706d4042f37875c0f9ef0bb65f2
Binary files /dev/null and b/example/cloth/09305_00.jpg differ
diff --git a/example/cloth/10165_00.jpg b/example/cloth/10165_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b6268dbf6ef26c9f4c5e2bca700faba3933a7675
Binary files /dev/null and b/example/cloth/10165_00.jpg differ
diff --git a/example/cloth/14627_00.jpg b/example/cloth/14627_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f16cbf859d821ec3554fdc2ac9c27a767d30f230
Binary files /dev/null and b/example/cloth/14627_00.jpg differ
diff --git a/example/cloth/14673_00.jpg b/example/cloth/14673_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..939e99a052f3439809734defa2b349c5d3bf03a8
Binary files /dev/null and b/example/cloth/14673_00.jpg differ
diff --git a/example/human/00034_00.jpg b/example/human/00034_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4e239c48dd89ed814bce2482ad03b4f9b7ab50ec
Binary files /dev/null and b/example/human/00034_00.jpg differ
diff --git a/example/human/00035_00.jpg b/example/human/00035_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..08b1f025d31a8191a4c5d13853f6a7b51dad8892
Binary files /dev/null and b/example/human/00035_00.jpg differ
diff --git a/example/human/00055_00.jpg b/example/human/00055_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..37962f9a69f9f427b33695bbbc9266bde73f6657
Binary files /dev/null and b/example/human/00055_00.jpg differ
diff --git a/example/human/00121_00.jpg b/example/human/00121_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..536e81f784a5c90108e3980d73b884cc46eae106
Binary files /dev/null and b/example/human/00121_00.jpg differ
diff --git a/example/human/01992_00.jpg b/example/human/01992_00.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7ed43e65422ed42a157efff45f072e66f911a506
Binary files /dev/null and b/example/human/01992_00.jpg differ
diff --git a/ip_adapter/__init__.py b/ip_adapter/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b1f1ff4e54e93ada7e85abc0f6687c5ecd3a338
--- /dev/null
+++ b/ip_adapter/__init__.py
@@ -0,0 +1,9 @@
+from .ip_adapter import IPAdapter, IPAdapterPlus, IPAdapterPlusXL, IPAdapterXL, IPAdapterFull
+
+__all__ = [
+    "IPAdapter",
+    "IPAdapterPlus",
+    "IPAdapterPlusXL",
+    "IPAdapterXL",
+    "IPAdapterFull",
+]
diff --git a/ip_adapter/__pycache__/__init__.cpython-310.pyc b/ip_adapter/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2c7a2d71e006e9c3f8361ddbd012132862f79b0
Binary files /dev/null and b/ip_adapter/__pycache__/__init__.cpython-310.pyc differ
diff --git a/ip_adapter/__pycache__/attention_processor.cpython-310.pyc b/ip_adapter/__pycache__/attention_processor.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b62abfc220d7340b42d499119653f704266492f
Binary files /dev/null and b/ip_adapter/__pycache__/attention_processor.cpython-310.pyc differ
diff --git a/ip_adapter/__pycache__/ip_adapter.cpython-310.pyc b/ip_adapter/__pycache__/ip_adapter.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..480b211276235a57595640df4b9f309f60c666e1
Binary files /dev/null and b/ip_adapter/__pycache__/ip_adapter.cpython-310.pyc differ
diff --git a/ip_adapter/__pycache__/resampler.cpython-310.pyc b/ip_adapter/__pycache__/resampler.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61e8bedcfa5239af90876abc844fbd9e7de3cdf3
Binary files /dev/null and b/ip_adapter/__pycache__/resampler.cpython-310.pyc differ
diff --git a/ip_adapter/__pycache__/utils.cpython-310.pyc b/ip_adapter/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c62fea673050935e0f7b108ed18c25c026d28d0
Binary files /dev/null and b/ip_adapter/__pycache__/utils.cpython-310.pyc differ
diff --git a/ip_adapter/attention_processor.py b/ip_adapter/attention_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..07fd6d8e2fc622b1e5a2bdd52119566b8b5219ef
--- /dev/null
+++ b/ip_adapter/attention_processor.py
@@ -0,0 +1,558 @@
+# modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class AttnProcessor(nn.Module):
+    r"""
+    Default processor for performing attention-related computations.
+    """
+
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+    ):
+        super().__init__()
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class IPAttnProcessor(nn.Module):
+    r"""
+    Attention processor for IP-Adapater.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+
+    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+
+        ip_key = attn.head_to_batch_dim(ip_key)
+        ip_value = attn.head_to_batch_dim(ip_value)
+
+        ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+        self.attn_map = ip_attention_probs
+        ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
+
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class AttnProcessor2_0(torch.nn.Module):
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+    ):
+        super().__init__()
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class IPAttnProcessor2_0(torch.nn.Module):
+    r"""
+    Attention processor for IP-Adapater for PyTorch 2.0.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+
+    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+        super().__init__()
+
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+
+        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        ip_hidden_states = F.scaled_dot_product_attention(
+            query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+        )
+        with torch.no_grad():
+            self.attn_map = query @ ip_key.transpose(-2, -1).softmax(dim=-1)
+            #print(self.attn_map.shape)
+
+        ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        ip_hidden_states = ip_hidden_states.to(query.dtype)
+
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+## for controlnet
+class CNAttnProcessor:
+    r"""
+    Default processor for performing attention-related computations.
+    """
+
+    def __init__(self, num_tokens=4):
+        self.num_tokens = num_tokens
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states = encoder_hidden_states[:, :end_pos]  # only use text
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class CNAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(self, num_tokens=4):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        self.num_tokens = num_tokens
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states = encoder_hidden_states[:, :end_pos]  # only use text
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
diff --git a/ip_adapter/attention_processor_faceid.py b/ip_adapter/attention_processor_faceid.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebc1ca1e5f45e1da21be543f8d28d5de2bb0eba9
--- /dev/null
+++ b/ip_adapter/attention_processor_faceid.py
@@ -0,0 +1,427 @@
+# modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from diffusers.models.lora import LoRALinearLayer
+
+
+class LoRAAttnProcessor(nn.Module):
+    r"""
+    Default processor for performing attention-related computations.
+    """
+
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+        rank=4,
+        network_alpha=None,
+        lora_scale=1.0,
+    ):
+        super().__init__()
+
+        self.rank = rank
+        self.lora_scale = lora_scale
+        
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class LoRAIPAttnProcessor(nn.Module):
+    r"""
+    Attention processor for IP-Adapater.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+
+    def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None, lora_scale=1.0, scale=1.0, num_tokens=4):
+        super().__init__()
+
+        self.rank = rank
+        self.lora_scale = lora_scale
+        
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+
+        ip_key = attn.head_to_batch_dim(ip_key)
+        ip_value = attn.head_to_batch_dim(ip_value)
+
+        ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+        self.attn_map = ip_attention_probs
+        ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
+
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class LoRAAttnProcessor2_0(nn.Module):
+    
+    r"""
+    Default processor for performing attention-related computations.
+    """
+    
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+        rank=4,
+        network_alpha=None,
+        lora_scale=1.0,
+    ):
+        super().__init__()
+        
+        self.rank = rank
+        self.lora_scale = lora_scale
+        
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class LoRAIPAttnProcessor2_0(nn.Module):
+    r"""
+    Processor for implementing the LoRA attention mechanism.
+
+    Args:
+        hidden_size (`int`, *optional*):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        network_alpha (`int`, *optional*):
+            Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
+    """
+
+    def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None, lora_scale=1.0, scale=1.0, num_tokens=4):
+        super().__init__()
+        
+        self.rank = rank
+        self.lora_scale = lora_scale
+        self.num_tokens = num_tokens
+        
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        
+        
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+
+    def __call__(
+        self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None
+    ):
+        residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+        #query = attn.head_to_batch_dim(query)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        # for text
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        
+        # for ip
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+        
+        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        ip_hidden_states = F.scaled_dot_product_attention(
+            query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+        )
+        
+
+        ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        ip_hidden_states = ip_hidden_states.to(query.dtype)
+        
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
diff --git a/ip_adapter/custom_pipelines.py b/ip_adapter/custom_pipelines.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d43d2c34db9b83f6148fac53425a9fd4c60fc93
--- /dev/null
+++ b/ip_adapter/custom_pipelines.py
@@ -0,0 +1,394 @@
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import rescale_noise_cfg
+
+from .utils import is_torch2_available
+
+if is_torch2_available():
+    from .attention_processor import IPAttnProcessor2_0 as IPAttnProcessor
+else:
+    from .attention_processor import IPAttnProcessor
+
+
+class StableDiffusionXLCustomPipeline(StableDiffusionXLPipeline):
+    def set_scale(self, scale):
+        for attn_processor in self.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                attn_processor.scale = scale
+
+    @torch.no_grad()
+    def __call__(  # noqa: C901
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        control_guidance_start: float = 0.0,
+        control_guidance_end: float = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co./stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co./stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co./docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            control_guidance_start (`float`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        # 7.1 Apply denoising_end
+        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # get init conditioning scale
+        for attn_processor in self.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                conditioning_scale = attn_processor.scale
+                break
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if (i / len(timesteps) < control_guidance_start) or ((i + 1) / len(timesteps) > control_guidance_end):
+                    self.set_scale(0.0)
+                else:
+                    self.set_scale(conditioning_scale)
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+
+        if output_type != "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return StableDiffusionXLPipelineOutput(images=image)
diff --git a/ip_adapter/ip_adapter.py b/ip_adapter/ip_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcb8824aebae0554ee51b363d56a40d022edacdc
--- /dev/null
+++ b/ip_adapter/ip_adapter.py
@@ -0,0 +1,417 @@
+import os
+from typing import List
+
+import torch
+from diffusers import StableDiffusionPipeline
+from diffusers.pipelines.controlnet import MultiControlNetModel
+from PIL import Image
+from safetensors import safe_open
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from .utils import is_torch2_available, get_generator
+
+if is_torch2_available():
+    from .attention_processor import (
+        AttnProcessor2_0 as AttnProcessor,
+    )
+    from .attention_processor import (
+        CNAttnProcessor2_0 as CNAttnProcessor,
+    )
+    from .attention_processor import (
+        IPAttnProcessor2_0 as IPAttnProcessor,
+    )
+else:
+    from .attention_processor import AttnProcessor, CNAttnProcessor, IPAttnProcessor
+from .resampler import Resampler
+
+
+class ImageProjModel(torch.nn.Module):
+    """Projection Model"""
+
+    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
+        super().__init__()
+
+        self.generator = None
+        self.cross_attention_dim = cross_attention_dim
+        self.clip_extra_context_tokens = clip_extra_context_tokens
+        self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+
+    def forward(self, image_embeds):
+        embeds = image_embeds
+        clip_extra_context_tokens = self.proj(embeds).reshape(
+            -1, self.clip_extra_context_tokens, self.cross_attention_dim
+        )
+        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
+        return clip_extra_context_tokens
+
+
+class MLPProjModel(torch.nn.Module):
+    """SD model with image prompt"""
+    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024):
+        super().__init__()
+        
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(clip_embeddings_dim, clip_embeddings_dim),
+            torch.nn.GELU(),
+            torch.nn.Linear(clip_embeddings_dim, cross_attention_dim),
+            torch.nn.LayerNorm(cross_attention_dim)
+        )
+        
+    def forward(self, image_embeds):
+        clip_extra_context_tokens = self.proj(image_embeds)
+        return clip_extra_context_tokens
+
+
+class IPAdapter:
+    def __init__(self, sd_pipe, image_encoder_path, ip_ckpt, device, num_tokens=4):
+        self.device = device
+        self.image_encoder_path = image_encoder_path
+        self.ip_ckpt = ip_ckpt
+        self.num_tokens = num_tokens
+
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+
+        # load image encoder
+        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
+            self.device, dtype=torch.float16
+        )
+        self.clip_image_processor = CLIPImageProcessor()
+        # image proj model
+        self.image_proj_model = self.init_proj()
+
+        self.load_ip_adapter()
+
+    def init_proj(self):
+        image_proj_model = ImageProjModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            clip_embeddings_dim=self.image_encoder.config.projection_dim,
+            clip_extra_context_tokens=self.num_tokens,
+        ).to(self.device, dtype=torch.float16)
+        return image_proj_model
+
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor()
+            else:
+                attn_procs[name] = IPAttnProcessor(
+                    hidden_size=hidden_size,
+                    cross_attention_dim=cross_attention_dim,
+                    scale=1.0,
+                    num_tokens=self.num_tokens,
+                ).to(self.device, dtype=torch.float16)
+        unet.set_attn_processor(attn_procs)
+        if hasattr(self.pipe, "controlnet"):
+            if isinstance(self.pipe.controlnet, MultiControlNetModel):
+                for controlnet in self.pipe.controlnet.nets:
+                    controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens))
+            else:
+                self.pipe.controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens))
+
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"])
+
+    @torch.inference_mode()
+    def get_image_embeds(self, pil_image=None, clip_image_embeds=None):
+        if pil_image is not None:
+            if isinstance(pil_image, Image.Image):
+                pil_image = [pil_image]
+            clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
+            clip_image_embeds = self.image_encoder(clip_image.to(self.device, dtype=torch.float16)).image_embeds
+        else:
+            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
+        image_prompt_embeds = self.image_proj_model(clip_image_embeds)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(clip_image_embeds))
+        return image_prompt_embeds, uncond_image_prompt_embeds
+
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                attn_processor.scale = scale
+
+    def generate(
+        self,
+        pil_image=None,
+        clip_image_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+        if pil_image is not None:
+            num_prompts = 1 if isinstance(pil_image, Image.Image) else len(pil_image)
+        else:
+            num_prompts = clip_image_embeds.size(0)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(
+            pil_image=pil_image, clip_image_embeds=clip_image_embeds
+        )
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+
+        return images
+
+
+class IPAdapterXL(IPAdapter):
+    """SDXL"""
+
+    def generate(
+        self,
+        pil_image,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+        num_prompts = 1 if isinstance(pil_image, Image.Image) else len(pil_image)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+
+        self.generator = get_generator(seed, self.device)
+        
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=self.generator,
+            **kwargs,
+        ).images
+
+        return images
+
+
+class IPAdapterPlus(IPAdapter):
+    """IP-Adapter with fine-grained features"""
+
+    def init_proj(self):
+        image_proj_model = Resampler(
+            dim=self.pipe.unet.config.cross_attention_dim,
+            depth=4,
+            dim_head=64,
+            heads=12,
+            num_queries=self.num_tokens,
+            embedding_dim=self.image_encoder.config.hidden_size,
+            output_dim=self.pipe.unet.config.cross_attention_dim,
+            ff_mult=4,
+        ).to(self.device, dtype=torch.float16)
+        return image_proj_model
+
+    @torch.inference_mode()
+    def get_image_embeds(self, pil_image=None, clip_image_embeds=None):
+        if isinstance(pil_image, Image.Image):
+            pil_image = [pil_image]
+        clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
+        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
+        image_prompt_embeds = self.image_proj_model(clip_image_embeds)
+        uncond_clip_image_embeds = self.image_encoder(
+            torch.zeros_like(clip_image), output_hidden_states=True
+        ).hidden_states[-2]
+        uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds)
+        return image_prompt_embeds, uncond_image_prompt_embeds
+
+
+class IPAdapterFull(IPAdapterPlus):
+    """IP-Adapter with full features"""
+
+    def init_proj(self):
+        image_proj_model = MLPProjModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            clip_embeddings_dim=self.image_encoder.config.hidden_size,
+        ).to(self.device, dtype=torch.float16)
+        return image_proj_model
+
+
+class IPAdapterPlusXL(IPAdapter):
+    """SDXL"""
+
+    def init_proj(self):
+        image_proj_model = Resampler(
+            dim=1280,
+            depth=4,
+            dim_head=64,
+            heads=20,
+            num_queries=self.num_tokens,
+            embedding_dim=self.image_encoder.config.hidden_size,
+            output_dim=self.pipe.unet.config.cross_attention_dim,
+            ff_mult=4,
+        ).to(self.device, dtype=torch.float16)
+        return image_proj_model
+
+    @torch.inference_mode()
+    def get_image_embeds(self, pil_image):
+        if isinstance(pil_image, Image.Image):
+            pil_image = [pil_image]
+        clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
+        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
+        image_prompt_embeds = self.image_proj_model(clip_image_embeds)
+        uncond_clip_image_embeds = self.image_encoder(
+            torch.zeros_like(clip_image), output_hidden_states=True
+        ).hidden_states[-2]
+        uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds)
+        return image_prompt_embeds, uncond_image_prompt_embeds
+
+    def generate(
+        self,
+        pil_image,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+        num_prompts = 1 if isinstance(pil_image, Image.Image) else len(pil_image)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+
+        return images
diff --git a/ip_adapter/ip_adapter_faceid.py b/ip_adapter/ip_adapter_faceid.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe98ad540648d429a3a227733c8607626fe0784e
--- /dev/null
+++ b/ip_adapter/ip_adapter_faceid.py
@@ -0,0 +1,542 @@
+import os
+from typing import List
+
+import torch
+from diffusers import StableDiffusionPipeline
+from diffusers.pipelines.controlnet import MultiControlNetModel
+from PIL import Image
+from safetensors import safe_open
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from .attention_processor_faceid import LoRAAttnProcessor, LoRAIPAttnProcessor
+from .utils import is_torch2_available, get_generator
+
+USE_DAFAULT_ATTN = False # should be True for visualization_attnmap
+if is_torch2_available() and (not USE_DAFAULT_ATTN):
+    from .attention_processor_faceid import (
+        LoRAAttnProcessor2_0 as LoRAAttnProcessor,
+    )
+    from .attention_processor_faceid import (
+        LoRAIPAttnProcessor2_0 as LoRAIPAttnProcessor,
+    )
+else:
+    from .attention_processor_faceid import LoRAAttnProcessor, LoRAIPAttnProcessor
+from .resampler import PerceiverAttention, FeedForward
+
+
+class FacePerceiverResampler(torch.nn.Module):
+    def __init__(
+        self,
+        *,
+        dim=768,
+        depth=4,
+        dim_head=64,
+        heads=16,
+        embedding_dim=1280,
+        output_dim=768,
+        ff_mult=4,
+    ):
+        super().__init__()
+        
+        self.proj_in = torch.nn.Linear(embedding_dim, dim)
+        self.proj_out = torch.nn.Linear(dim, output_dim)
+        self.norm_out = torch.nn.LayerNorm(output_dim)
+        self.layers = torch.nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                torch.nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+
+    def forward(self, latents, x):
+        x = self.proj_in(x)
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+
+
+class MLPProjModel(torch.nn.Module):
+    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
+        super().__init__()
+        
+        self.cross_attention_dim = cross_attention_dim
+        self.num_tokens = num_tokens
+        
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
+        )
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+        
+    def forward(self, id_embeds):
+        x = self.proj(id_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        x = self.norm(x)
+        return x
+
+
+class ProjPlusModel(torch.nn.Module):
+    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, clip_embeddings_dim=1280, num_tokens=4):
+        super().__init__()
+        
+        self.cross_attention_dim = cross_attention_dim
+        self.num_tokens = num_tokens
+        
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
+        )
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+        
+        self.perceiver_resampler = FacePerceiverResampler(
+            dim=cross_attention_dim,
+            depth=4,
+            dim_head=64,
+            heads=cross_attention_dim // 64,
+            embedding_dim=clip_embeddings_dim,
+            output_dim=cross_attention_dim,
+            ff_mult=4,
+        )
+        
+    def forward(self, id_embeds, clip_embeds, shortcut=False, scale=1.0):
+        
+        x = self.proj(id_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        x = self.norm(x)
+        out = self.perceiver_resampler(x, clip_embeds)
+        if shortcut:
+            out = x + scale * out
+        return out
+
+
+class IPAdapterFaceID:
+    def __init__(self, sd_pipe, ip_ckpt, device, lora_rank=128, num_tokens=4, torch_dtype=torch.float16):
+        self.device = device
+        self.ip_ckpt = ip_ckpt
+        self.lora_rank = lora_rank
+        self.num_tokens = num_tokens
+        self.torch_dtype = torch_dtype
+
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+
+        # image proj model
+        self.image_proj_model = self.init_proj()
+
+        self.load_ip_adapter()
+
+    def init_proj(self):
+        image_proj_model = MLPProjModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            id_embeddings_dim=512,
+            num_tokens=self.num_tokens,
+        ).to(self.device, dtype=self.torch_dtype)
+        return image_proj_model
+
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = LoRAAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=self.lora_rank,
+                ).to(self.device, dtype=self.torch_dtype)
+            else:
+                attn_procs[name] = LoRAIPAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, rank=self.lora_rank, num_tokens=self.num_tokens,
+                ).to(self.device, dtype=self.torch_dtype)
+        unet.set_attn_processor(attn_procs)
+
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"])
+
+    @torch.inference_mode()
+    def get_image_embeds(self, faceid_embeds):
+        
+        faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
+        image_prompt_embeds = self.image_proj_model(faceid_embeds)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds))
+        return image_prompt_embeds, uncond_image_prompt_embeds
+
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, LoRAIPAttnProcessor):
+                attn_processor.scale = scale
+
+    def generate(
+        self,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+       
+        num_prompts = faceid_embeds.size(0)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds)
+
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+
+        return images
+
+
+class IPAdapterFaceIDPlus:
+    def __init__(self, sd_pipe, image_encoder_path, ip_ckpt, device, lora_rank=128, num_tokens=4, torch_dtype=torch.float16):
+        self.device = device
+        self.image_encoder_path = image_encoder_path
+        self.ip_ckpt = ip_ckpt
+        self.lora_rank = lora_rank
+        self.num_tokens = num_tokens
+        self.torch_dtype = torch_dtype
+
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+
+        # load image encoder
+        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
+            self.device, dtype=self.torch_dtype
+        )
+        self.clip_image_processor = CLIPImageProcessor()
+        # image proj model
+        self.image_proj_model = self.init_proj()
+
+        self.load_ip_adapter()
+
+    def init_proj(self):
+        image_proj_model = ProjPlusModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            id_embeddings_dim=512,
+            clip_embeddings_dim=self.image_encoder.config.hidden_size,
+            num_tokens=self.num_tokens,
+        ).to(self.device, dtype=self.torch_dtype)
+        return image_proj_model
+
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = LoRAAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=self.lora_rank,
+                ).to(self.device, dtype=self.torch_dtype)
+            else:
+                attn_procs[name] = LoRAIPAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, rank=self.lora_rank, num_tokens=self.num_tokens,
+                ).to(self.device, dtype=self.torch_dtype)
+        unet.set_attn_processor(attn_procs)
+
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"])
+
+    @torch.inference_mode()
+    def get_image_embeds(self, faceid_embeds, face_image, s_scale, shortcut):
+        if isinstance(face_image, Image.Image):
+            pil_image = [face_image]
+        clip_image = self.clip_image_processor(images=face_image, return_tensors="pt").pixel_values
+        clip_image = clip_image.to(self.device, dtype=self.torch_dtype)
+        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
+        uncond_clip_image_embeds = self.image_encoder(
+            torch.zeros_like(clip_image), output_hidden_states=True
+        ).hidden_states[-2]
+        
+        faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
+        image_prompt_embeds = self.image_proj_model(faceid_embeds, clip_image_embeds, shortcut=shortcut, scale=s_scale)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds), uncond_clip_image_embeds, shortcut=shortcut, scale=s_scale)
+        return image_prompt_embeds, uncond_image_prompt_embeds
+
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, LoRAIPAttnProcessor):
+                attn_processor.scale = scale
+
+    def generate(
+        self,
+        face_image=None,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        s_scale=1.0,
+        shortcut=False,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+       
+        num_prompts = faceid_embeds.size(0)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds, face_image, s_scale, shortcut)
+
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+
+        return images
+
+
+class IPAdapterFaceIDXL(IPAdapterFaceID):
+    """SDXL"""
+
+    def generate(
+        self,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+        num_prompts = faceid_embeds.size(0)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds)
+
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+
+        return images
+
+
+class IPAdapterFaceIDPlusXL(IPAdapterFaceIDPlus):
+    """SDXL"""
+
+    def generate(
+        self,
+        face_image=None,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        s_scale=1.0,
+        shortcut=True,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+        num_prompts = faceid_embeds.size(0)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds, face_image, s_scale, shortcut)
+
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            **kwargs,
+        ).images
+
+        return images
diff --git a/ip_adapter/ip_adapter_faceid_separate.py b/ip_adapter/ip_adapter_faceid_separate.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ca84c4fdb56e9b0dfb56195426427b31f07fb8
--- /dev/null
+++ b/ip_adapter/ip_adapter_faceid_separate.py
@@ -0,0 +1,547 @@
+import os
+from typing import List
+
+import torch
+from diffusers import StableDiffusionPipeline
+from diffusers.pipelines.controlnet import MultiControlNetModel
+from PIL import Image
+from safetensors import safe_open
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+
+from .utils import is_torch2_available, get_generator
+
+USE_DAFAULT_ATTN = False # should be True for visualization_attnmap
+if is_torch2_available() and (not USE_DAFAULT_ATTN):
+    from .attention_processor import (
+        AttnProcessor2_0 as AttnProcessor,
+    )
+    from .attention_processor import (
+        IPAttnProcessor2_0 as IPAttnProcessor,
+    )
+else:
+    from .attention_processor import AttnProcessor, IPAttnProcessor
+from .resampler import PerceiverAttention, FeedForward
+
+
+class FacePerceiverResampler(torch.nn.Module):
+    def __init__(
+        self,
+        *,
+        dim=768,
+        depth=4,
+        dim_head=64,
+        heads=16,
+        embedding_dim=1280,
+        output_dim=768,
+        ff_mult=4,
+    ):
+        super().__init__()
+        
+        self.proj_in = torch.nn.Linear(embedding_dim, dim)
+        self.proj_out = torch.nn.Linear(dim, output_dim)
+        self.norm_out = torch.nn.LayerNorm(output_dim)
+        self.layers = torch.nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                torch.nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+
+    def forward(self, latents, x):
+        x = self.proj_in(x)
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+
+
+class MLPProjModel(torch.nn.Module):
+    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
+        super().__init__()
+        
+        self.cross_attention_dim = cross_attention_dim
+        self.num_tokens = num_tokens
+        
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
+        )
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+        
+    def forward(self, id_embeds):
+        x = self.proj(id_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        x = self.norm(x)
+        return x
+
+
+class ProjPlusModel(torch.nn.Module):
+    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, clip_embeddings_dim=1280, num_tokens=4):
+        super().__init__()
+        
+        self.cross_attention_dim = cross_attention_dim
+        self.num_tokens = num_tokens
+        
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
+        )
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+        
+        self.perceiver_resampler = FacePerceiverResampler(
+            dim=cross_attention_dim,
+            depth=4,
+            dim_head=64,
+            heads=cross_attention_dim // 64,
+            embedding_dim=clip_embeddings_dim,
+            output_dim=cross_attention_dim,
+            ff_mult=4,
+        )
+        
+    def forward(self, id_embeds, clip_embeds, shortcut=False, scale=1.0):
+        
+        x = self.proj(id_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        x = self.norm(x)
+        out = self.perceiver_resampler(x, clip_embeds)
+        if shortcut:
+            out = x + scale * out
+        return out
+
+
+class IPAdapterFaceID:
+    def __init__(self, sd_pipe, ip_ckpt, device, num_tokens=4, n_cond=1, torch_dtype=torch.float16):
+        self.device = device
+        self.ip_ckpt = ip_ckpt
+        self.num_tokens = num_tokens
+        self.n_cond = n_cond
+        self.torch_dtype = torch_dtype
+
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+
+        # image proj model
+        self.image_proj_model = self.init_proj()
+
+        self.load_ip_adapter()
+
+    def init_proj(self):
+        image_proj_model = MLPProjModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            id_embeddings_dim=512,
+            num_tokens=self.num_tokens,
+        ).to(self.device, dtype=self.torch_dtype)
+        return image_proj_model
+
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor()
+            else:
+                attn_procs[name] = IPAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, num_tokens=self.num_tokens*self.n_cond,
+                ).to(self.device, dtype=self.torch_dtype)
+        unet.set_attn_processor(attn_procs)
+
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"], strict=False)
+
+    @torch.inference_mode()
+    def get_image_embeds(self, faceid_embeds):
+
+        multi_face = False
+        if faceid_embeds.dim() == 3:
+            multi_face = True
+            b, n, c = faceid_embeds.shape
+            faceid_embeds = faceid_embeds.reshape(b*n, c)
+
+        faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
+        image_prompt_embeds = self.image_proj_model(faceid_embeds)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds))
+        if multi_face:
+            c = image_prompt_embeds.size(-1)
+            image_prompt_embeds = image_prompt_embeds.reshape(b, -1, c)
+            uncond_image_prompt_embeds = uncond_image_prompt_embeds.reshape(b, -1, c)
+        
+        return image_prompt_embeds, uncond_image_prompt_embeds
+
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                attn_processor.scale = scale
+
+    def generate(
+        self,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+       
+        num_prompts = faceid_embeds.size(0)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds)
+
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+
+        return images
+
+
+class IPAdapterFaceIDPlus:
+    def __init__(self, sd_pipe, image_encoder_path, ip_ckpt, device, num_tokens=4, torch_dtype=torch.float16):
+        self.device = device
+        self.image_encoder_path = image_encoder_path
+        self.ip_ckpt = ip_ckpt
+        self.num_tokens = num_tokens
+        self.torch_dtype = torch_dtype
+
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+
+        # load image encoder
+        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
+            self.device, dtype=self.torch_dtype
+        )
+        self.clip_image_processor = CLIPImageProcessor()
+        # image proj model
+        self.image_proj_model = self.init_proj()
+
+        self.load_ip_adapter()
+
+    def init_proj(self):
+        image_proj_model = ProjPlusModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            id_embeddings_dim=512,
+            clip_embeddings_dim=self.image_encoder.config.hidden_size,
+            num_tokens=self.num_tokens,
+        ).to(self.device, dtype=self.torch_dtype)
+        return image_proj_model
+
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor()
+            else:
+                attn_procs[name] = IPAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, num_tokens=self.num_tokens,
+                ).to(self.device, dtype=self.torch_dtype)
+        unet.set_attn_processor(attn_procs)
+
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"], strict=False)
+
+    @torch.inference_mode()
+    def get_image_embeds(self, faceid_embeds, face_image, s_scale, shortcut):
+        if isinstance(face_image, Image.Image):
+            pil_image = [face_image]
+        clip_image = self.clip_image_processor(images=face_image, return_tensors="pt").pixel_values
+        clip_image = clip_image.to(self.device, dtype=self.torch_dtype)
+        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
+        uncond_clip_image_embeds = self.image_encoder(
+            torch.zeros_like(clip_image), output_hidden_states=True
+        ).hidden_states[-2]
+        
+        faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
+        image_prompt_embeds = self.image_proj_model(faceid_embeds, clip_image_embeds, shortcut=shortcut, scale=s_scale)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds), uncond_clip_image_embeds, shortcut=shortcut, scale=s_scale)
+        return image_prompt_embeds, uncond_image_prompt_embeds
+
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, LoRAIPAttnProcessor):
+                attn_processor.scale = scale
+
+    def generate(
+        self,
+        face_image=None,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        s_scale=1.0,
+        shortcut=False,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+       
+        num_prompts = faceid_embeds.size(0)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds, face_image, s_scale, shortcut)
+
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+
+        return images
+
+
+class IPAdapterFaceIDXL(IPAdapterFaceID):
+    """SDXL"""
+
+    def generate(
+        self,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+        num_prompts = faceid_embeds.size(0)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds)
+
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+
+        return images
+
+
+class IPAdapterFaceIDPlusXL(IPAdapterFaceIDPlus):
+    """SDXL"""
+
+    def generate(
+        self,
+        face_image=None,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        s_scale=1.0,
+        shortcut=True,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+
+        num_prompts = faceid_embeds.size(0)
+
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds, face_image, s_scale, shortcut)
+
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+
+        generator = get_generator(seed, self.device)
+
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            **kwargs,
+        ).images
+
+        return images
diff --git a/ip_adapter/resampler.py b/ip_adapter/resampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..24266671d02092438ae6576336a59659fef9c054
--- /dev/null
+++ b/ip_adapter/resampler.py
@@ -0,0 +1,158 @@
+# modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
+# and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py
+
+import math
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from einops.layers.torch import Rearrange
+
+
+# FFN
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+
+
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+
+
+class PerceiverAttention(nn.Module):
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+
+        b, l, _ = latents.shape
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+        return self.to_out(out)
+
+
+class Resampler(nn.Module):
+    def __init__(
+        self,
+        dim=1024,
+        depth=8,
+        dim_head=64,
+        heads=16,
+        num_queries=8,
+        embedding_dim=768,
+        output_dim=1024,
+        ff_mult=4,
+        max_seq_len: int = 257,  # CLIP tokens + CLS token
+        apply_pos_emb: bool = False,
+        num_latents_mean_pooled: int = 0,  # number of latents derived from mean pooled representation of the sequence
+    ):
+        super().__init__()
+        self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) if apply_pos_emb else None
+
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+
+        self.proj_in = nn.Linear(embedding_dim, dim)
+
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+
+        self.to_latents_from_mean_pooled_seq = (
+            nn.Sequential(
+                nn.LayerNorm(dim),
+                nn.Linear(dim, dim * num_latents_mean_pooled),
+                Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled),
+            )
+            if num_latents_mean_pooled > 0
+            else None
+        )
+
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+
+    def forward(self, x):
+        if self.pos_emb is not None:
+            n, device = x.shape[1], x.device
+            pos_emb = self.pos_emb(torch.arange(n, device=device))
+            x = x + pos_emb
+
+        latents = self.latents.repeat(x.size(0), 1, 1)
+
+        x = self.proj_in(x)
+
+        if self.to_latents_from_mean_pooled_seq:
+            meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool))
+            meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq)
+            latents = torch.cat((meanpooled_latents, latents), dim=-2)
+
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+
+
+def masked_mean(t, *, dim, mask=None):
+    if mask is None:
+        return t.mean(dim=dim)
+
+    denom = mask.sum(dim=dim, keepdim=True)
+    mask = rearrange(mask, "b n -> b n 1")
+    masked_t = t.masked_fill(~mask, 0.0)
+
+    return masked_t.sum(dim=dim) / denom.clamp(min=1e-5)
diff --git a/ip_adapter/test_resampler.py b/ip_adapter/test_resampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..8978c8e19c0f6326fc849930086253db53a8a17b
--- /dev/null
+++ b/ip_adapter/test_resampler.py
@@ -0,0 +1,44 @@
+import torch
+from resampler import Resampler
+from transformers import CLIPVisionModel
+
+BATCH_SIZE = 2
+OUTPUT_DIM = 1280
+NUM_QUERIES = 8
+NUM_LATENTS_MEAN_POOLED = 4  # 0 for no mean pooling (previous behavior)
+APPLY_POS_EMB = True  # False for no positional embeddings (previous behavior)
+IMAGE_ENCODER_NAME_OR_PATH = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
+
+
+def main():
+    image_encoder = CLIPVisionModel.from_pretrained(IMAGE_ENCODER_NAME_OR_PATH)
+    embedding_dim = image_encoder.config.hidden_size
+    print(f"image_encoder hidden size: ", embedding_dim)
+
+    image_proj_model = Resampler(
+        dim=1024,
+        depth=2,
+        dim_head=64,
+        heads=16,
+        num_queries=NUM_QUERIES,
+        embedding_dim=embedding_dim,
+        output_dim=OUTPUT_DIM,
+        ff_mult=2,
+        max_seq_len=257,
+        apply_pos_emb=APPLY_POS_EMB,
+        num_latents_mean_pooled=NUM_LATENTS_MEAN_POOLED,
+    )
+
+    dummy_images = torch.randn(BATCH_SIZE, 3, 224, 224)
+    with torch.no_grad():
+        image_embeds = image_encoder(dummy_images, output_hidden_states=True).hidden_states[-2]
+    print("image_embds shape: ", image_embeds.shape)
+
+    with torch.no_grad():
+        ip_tokens = image_proj_model(image_embeds)
+    print("ip_tokens shape:", ip_tokens.shape)
+    assert ip_tokens.shape == (BATCH_SIZE, NUM_QUERIES + NUM_LATENTS_MEAN_POOLED, OUTPUT_DIM)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ip_adapter/utils.py b/ip_adapter/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a273358585962fdf383d0bb7a0e1c654b4999b8
--- /dev/null
+++ b/ip_adapter/utils.py
@@ -0,0 +1,93 @@
+import torch
+import torch.nn.functional as F
+import numpy as np
+from PIL import Image
+
+attn_maps = {}
+def hook_fn(name):
+    def forward_hook(module, input, output):
+        if hasattr(module.processor, "attn_map"):
+            attn_maps[name] = module.processor.attn_map
+            del module.processor.attn_map
+
+    return forward_hook
+
+def register_cross_attention_hook(unet):
+    for name, module in unet.named_modules():
+        if name.split('.')[-1].startswith('attn2'):
+            module.register_forward_hook(hook_fn(name))
+
+    return unet
+
+def upscale(attn_map, target_size):
+    attn_map = torch.mean(attn_map, dim=0)
+    attn_map = attn_map.permute(1,0)
+    temp_size = None
+
+    for i in range(0,5):
+        scale = 2 ** i
+        if ( target_size[0] // scale ) * ( target_size[1] // scale) == attn_map.shape[1]*64:
+            temp_size = (target_size[0]//(scale*8), target_size[1]//(scale*8))
+            break
+
+    assert temp_size is not None, "temp_size cannot is None"
+
+    attn_map = attn_map.view(attn_map.shape[0], *temp_size)
+
+    attn_map = F.interpolate(
+        attn_map.unsqueeze(0).to(dtype=torch.float32),
+        size=target_size,
+        mode='bilinear',
+        align_corners=False
+    )[0]
+
+    attn_map = torch.softmax(attn_map, dim=0)
+    return attn_map
+def get_net_attn_map(image_size, batch_size=2, instance_or_negative=False, detach=True):
+
+    idx = 0 if instance_or_negative else 1
+    net_attn_maps = []
+
+    for name, attn_map in attn_maps.items():
+        attn_map = attn_map.cpu() if detach else attn_map
+        attn_map = torch.chunk(attn_map, batch_size)[idx].squeeze()
+        attn_map = upscale(attn_map, image_size) 
+        net_attn_maps.append(attn_map) 
+
+    net_attn_maps = torch.mean(torch.stack(net_attn_maps,dim=0),dim=0)
+
+    return net_attn_maps
+
+def attnmaps2images(net_attn_maps):
+
+    #total_attn_scores = 0
+    images = []
+
+    for attn_map in net_attn_maps:
+        attn_map = attn_map.cpu().numpy()
+        #total_attn_scores += attn_map.mean().item()
+
+        normalized_attn_map = (attn_map - np.min(attn_map)) / (np.max(attn_map) - np.min(attn_map)) * 255
+        normalized_attn_map = normalized_attn_map.astype(np.uint8)
+        #print("norm: ", normalized_attn_map.shape)
+        image = Image.fromarray(normalized_attn_map)
+
+        #image = fix_save_attn_map(attn_map)
+        images.append(image)
+
+    #print(total_attn_scores)
+    return images
+def is_torch2_available():
+    return hasattr(F, "scaled_dot_product_attention")
+
+def get_generator(seed, device):
+
+    if seed is not None:
+        if isinstance(seed, list):
+            generator = [torch.Generator(device).manual_seed(seed_item) for seed_item in seed]
+        else:
+            generator = torch.Generator(device).manual_seed(seed)
+    else:
+        generator = None
+
+    return generator
\ No newline at end of file
diff --git a/preprocess/humanparsing/__pycache__/parsing_api.cpython-310.pyc b/preprocess/humanparsing/__pycache__/parsing_api.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cbf9c3dd90407c07014417ed1cc8693bdc7c1184
Binary files /dev/null and b/preprocess/humanparsing/__pycache__/parsing_api.cpython-310.pyc differ
diff --git a/preprocess/humanparsing/__pycache__/run_parsing.cpython-310.pyc b/preprocess/humanparsing/__pycache__/run_parsing.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df2ee1f6cb9661faa419f5e9f56044ee3bb8131b
Binary files /dev/null and b/preprocess/humanparsing/__pycache__/run_parsing.cpython-310.pyc differ
diff --git a/preprocess/humanparsing/datasets/__init__.py b/preprocess/humanparsing/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/preprocess/humanparsing/datasets/__pycache__/__init__.cpython-310.pyc b/preprocess/humanparsing/datasets/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d4655ec369ad51a60be72c63cad60b24a1006de
Binary files /dev/null and b/preprocess/humanparsing/datasets/__pycache__/__init__.cpython-310.pyc differ
diff --git a/preprocess/humanparsing/datasets/__pycache__/simple_extractor_dataset.cpython-310.pyc b/preprocess/humanparsing/datasets/__pycache__/simple_extractor_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd0586cefb3a859e41097f5b29893c440a3f8474
Binary files /dev/null and b/preprocess/humanparsing/datasets/__pycache__/simple_extractor_dataset.cpython-310.pyc differ
diff --git a/preprocess/humanparsing/datasets/datasets.py b/preprocess/humanparsing/datasets/datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..433f15af93029538b3b039f8f207764fcfe426d9
--- /dev/null
+++ b/preprocess/humanparsing/datasets/datasets.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   datasets.py
+@Time    :   8/4/19 3:35 PM
+@Desc    :
+@License :   This source code is licensed under the license found in the
+             LICENSE file in the root directory of this source tree.
+"""
+
+import os
+import numpy as np
+import random
+import torch
+import cv2
+from torch.utils import data
+from utils.transforms import get_affine_transform
+
+
+class LIPDataSet(data.Dataset):
+    def __init__(self, root, dataset, crop_size=[473, 473], scale_factor=0.25,
+                 rotation_factor=30, ignore_label=255, transform=None):
+        self.root = root
+        self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
+        self.crop_size = np.asarray(crop_size)
+        self.ignore_label = ignore_label
+        self.scale_factor = scale_factor
+        self.rotation_factor = rotation_factor
+        self.flip_prob = 0.5
+        self.transform = transform
+        self.dataset = dataset
+
+        list_path = os.path.join(self.root, self.dataset + '_id.txt')
+        train_list = [i_id.strip() for i_id in open(list_path)]
+
+        self.train_list = train_list
+        self.number_samples = len(self.train_list)
+
+    def __len__(self):
+        return self.number_samples
+
+    def _box2cs(self, box):
+        x, y, w, h = box[:4]
+        return self._xywh2cs(x, y, w, h)
+
+    def _xywh2cs(self, x, y, w, h):
+        center = np.zeros((2), dtype=np.float32)
+        center[0] = x + w * 0.5
+        center[1] = y + h * 0.5
+        if w > self.aspect_ratio * h:
+            h = w * 1.0 / self.aspect_ratio
+        elif w < self.aspect_ratio * h:
+            w = h * self.aspect_ratio
+        scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
+        return center, scale
+
+    def __getitem__(self, index):
+        train_item = self.train_list[index]
+
+        im_path = os.path.join(self.root, self.dataset + '_images', train_item + '.jpg')
+        parsing_anno_path = os.path.join(self.root, self.dataset + '_segmentations', train_item + '.png')
+
+        im = cv2.imread(im_path, cv2.IMREAD_COLOR)
+        h, w, _ = im.shape
+        parsing_anno = np.zeros((h, w), dtype=np.long)
+
+        # Get person center and scale
+        person_center, s = self._box2cs([0, 0, w - 1, h - 1])
+        r = 0
+
+        if self.dataset != 'test':
+            # Get pose annotation
+            parsing_anno = cv2.imread(parsing_anno_path, cv2.IMREAD_GRAYSCALE)
+            if self.dataset == 'train' or self.dataset == 'trainval':
+                sf = self.scale_factor
+                rf = self.rotation_factor
+                s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
+                r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0
+
+                if random.random() <= self.flip_prob:
+                    im = im[:, ::-1, :]
+                    parsing_anno = parsing_anno[:, ::-1]
+                    person_center[0] = im.shape[1] - person_center[0] - 1
+                    right_idx = [15, 17, 19]
+                    left_idx = [14, 16, 18]
+                    for i in range(0, 3):
+                        right_pos = np.where(parsing_anno == right_idx[i])
+                        left_pos = np.where(parsing_anno == left_idx[i])
+                        parsing_anno[right_pos[0], right_pos[1]] = left_idx[i]
+                        parsing_anno[left_pos[0], left_pos[1]] = right_idx[i]
+
+        trans = get_affine_transform(person_center, s, r, self.crop_size)
+        input = cv2.warpAffine(
+            im,
+            trans,
+            (int(self.crop_size[1]), int(self.crop_size[0])),
+            flags=cv2.INTER_LINEAR,
+            borderMode=cv2.BORDER_CONSTANT,
+            borderValue=(0, 0, 0))
+
+        if self.transform:
+            input = self.transform(input)
+
+        meta = {
+            'name': train_item,
+            'center': person_center,
+            'height': h,
+            'width': w,
+            'scale': s,
+            'rotation': r
+        }
+
+        if self.dataset == 'val' or self.dataset == 'test':
+            return input, meta
+        else:
+            label_parsing = cv2.warpAffine(
+                parsing_anno,
+                trans,
+                (int(self.crop_size[1]), int(self.crop_size[0])),
+                flags=cv2.INTER_NEAREST,
+                borderMode=cv2.BORDER_CONSTANT,
+                borderValue=(255))
+
+            label_parsing = torch.from_numpy(label_parsing)
+
+            return input, label_parsing, meta
+
+
+class LIPDataValSet(data.Dataset):
+    def __init__(self, root, dataset='val', crop_size=[473, 473], transform=None, flip=False):
+        self.root = root
+        self.crop_size = crop_size
+        self.transform = transform
+        self.flip = flip
+        self.dataset = dataset
+        self.root = root
+        self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
+        self.crop_size = np.asarray(crop_size)
+
+        list_path = os.path.join(self.root, self.dataset + '_id.txt')
+        val_list = [i_id.strip() for i_id in open(list_path)]
+
+        self.val_list = val_list
+        self.number_samples = len(self.val_list)
+
+    def __len__(self):
+        return len(self.val_list)
+
+    def _box2cs(self, box):
+        x, y, w, h = box[:4]
+        return self._xywh2cs(x, y, w, h)
+
+    def _xywh2cs(self, x, y, w, h):
+        center = np.zeros((2), dtype=np.float32)
+        center[0] = x + w * 0.5
+        center[1] = y + h * 0.5
+        if w > self.aspect_ratio * h:
+            h = w * 1.0 / self.aspect_ratio
+        elif w < self.aspect_ratio * h:
+            w = h * self.aspect_ratio
+        scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
+
+        return center, scale
+
+    def __getitem__(self, index):
+        val_item = self.val_list[index]
+        # Load training image
+        im_path = os.path.join(self.root, self.dataset + '_images', val_item + '.jpg')
+        im = cv2.imread(im_path, cv2.IMREAD_COLOR)
+        h, w, _ = im.shape
+        # Get person center and scale
+        person_center, s = self._box2cs([0, 0, w - 1, h - 1])
+        r = 0
+        trans = get_affine_transform(person_center, s, r, self.crop_size)
+        input = cv2.warpAffine(
+            im,
+            trans,
+            (int(self.crop_size[1]), int(self.crop_size[0])),
+            flags=cv2.INTER_LINEAR,
+            borderMode=cv2.BORDER_CONSTANT,
+            borderValue=(0, 0, 0))
+        input = self.transform(input)
+        flip_input = input.flip(dims=[-1])
+        if self.flip:
+            batch_input_im = torch.stack([input, flip_input])
+        else:
+            batch_input_im = input
+
+        meta = {
+            'name': val_item,
+            'center': person_center,
+            'height': h,
+            'width': w,
+            'scale': s,
+            'rotation': r
+        }
+
+        return batch_input_im, meta
diff --git a/preprocess/humanparsing/datasets/simple_extractor_dataset.py b/preprocess/humanparsing/datasets/simple_extractor_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5e85240701231f9789b822219c8b9eda47be4de
--- /dev/null
+++ b/preprocess/humanparsing/datasets/simple_extractor_dataset.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   dataset.py
+@Time    :   8/30/19 9:12 PM
+@Desc    :   Dataset Definition
+@License :   This source code is licensed under the license found in the
+             LICENSE file in the root directory of this source tree.
+"""
+
+import os
+import pdb
+
+import cv2
+import numpy as np
+from PIL import Image
+from torch.utils import data
+from utils.transforms import get_affine_transform
+
+
+class SimpleFolderDataset(data.Dataset):
+    def __init__(self, root, input_size=[512, 512], transform=None):
+        self.root = root
+        self.input_size = input_size
+        self.transform = transform
+        self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
+        self.input_size = np.asarray(input_size)
+        self.is_pil_image = False
+        if isinstance(root, Image.Image):
+            self.file_list = [root]
+            self.is_pil_image = True
+        elif os.path.isfile(root):
+            self.file_list = [os.path.basename(root)]
+            self.root = os.path.dirname(root)
+        else:
+            self.file_list = os.listdir(self.root)
+
+    def __len__(self):
+        return len(self.file_list)
+
+    def _box2cs(self, box):
+        x, y, w, h = box[:4]
+        return self._xywh2cs(x, y, w, h)
+
+    def _xywh2cs(self, x, y, w, h):
+        center = np.zeros((2), dtype=np.float32)
+        center[0] = x + w * 0.5
+        center[1] = y + h * 0.5
+        if w > self.aspect_ratio * h:
+            h = w * 1.0 / self.aspect_ratio
+        elif w < self.aspect_ratio * h:
+            w = h * self.aspect_ratio
+        scale = np.array([w, h], dtype=np.float32)
+        return center, scale
+
+    def __getitem__(self, index):
+        if self.is_pil_image:
+            img = np.asarray(self.file_list[index])[:, :, [2, 1, 0]]
+        else:
+            img_name = self.file_list[index]
+            img_path = os.path.join(self.root, img_name)
+            img = cv2.imread(img_path, cv2.IMREAD_COLOR)
+        h, w, _ = img.shape
+
+        # Get person center and scale
+        person_center, s = self._box2cs([0, 0, w - 1, h - 1])
+        r = 0
+        trans = get_affine_transform(person_center, s, r, self.input_size)
+        input = cv2.warpAffine(
+            img,
+            trans,
+            (int(self.input_size[1]), int(self.input_size[0])),
+            flags=cv2.INTER_LINEAR,
+            borderMode=cv2.BORDER_CONSTANT,
+            borderValue=(0, 0, 0))
+
+        input = self.transform(input)
+        meta = {
+            'center': person_center,
+            'height': h,
+            'width': w,
+            'scale': s,
+            'rotation': r
+        }
+
+        return input, meta
diff --git a/preprocess/humanparsing/datasets/target_generation.py b/preprocess/humanparsing/datasets/target_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8524db4427755c12ce71a4292d87ebb3e91762c1
--- /dev/null
+++ b/preprocess/humanparsing/datasets/target_generation.py
@@ -0,0 +1,40 @@
+import torch
+from torch.nn import functional as F
+
+
+def generate_edge_tensor(label, edge_width=3):
+    label = label.type(torch.cuda.FloatTensor)
+    if len(label.shape) == 2:
+        label = label.unsqueeze(0)
+    n, h, w = label.shape
+    edge = torch.zeros(label.shape, dtype=torch.float).cuda()
+    # right
+    edge_right = edge[:, 1:h, :]
+    edge_right[(label[:, 1:h, :] != label[:, :h - 1, :]) & (label[:, 1:h, :] != 255)
+               & (label[:, :h - 1, :] != 255)] = 1
+
+    # up
+    edge_up = edge[:, :, :w - 1]
+    edge_up[(label[:, :, :w - 1] != label[:, :, 1:w])
+            & (label[:, :, :w - 1] != 255)
+            & (label[:, :, 1:w] != 255)] = 1
+
+    # upright
+    edge_upright = edge[:, :h - 1, :w - 1]
+    edge_upright[(label[:, :h - 1, :w - 1] != label[:, 1:h, 1:w])
+                 & (label[:, :h - 1, :w - 1] != 255)
+                 & (label[:, 1:h, 1:w] != 255)] = 1
+
+    # bottomright
+    edge_bottomright = edge[:, :h - 1, 1:w]
+    edge_bottomright[(label[:, :h - 1, 1:w] != label[:, 1:h, :w - 1])
+                     & (label[:, :h - 1, 1:w] != 255)
+                     & (label[:, 1:h, :w - 1] != 255)] = 1
+
+    kernel = torch.ones((1, 1, edge_width, edge_width), dtype=torch.float).cuda()
+    with torch.no_grad():
+        edge = edge.unsqueeze(1)
+        edge = F.conv2d(edge, kernel, stride=1, padding=1)
+    edge[edge!=0] = 1
+    edge = edge.squeeze()
+    return edge
diff --git a/preprocess/humanparsing/mhp_extension/coco_style_annotation_creator/human_to_coco.py b/preprocess/humanparsing/mhp_extension/coco_style_annotation_creator/human_to_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8eccb3a8f63e9b76eade5b2036526d91b8483dc2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/coco_style_annotation_creator/human_to_coco.py
@@ -0,0 +1,166 @@
+import argparse
+import datetime
+import json
+import os
+from PIL import Image
+import numpy as np
+
+import pycococreatortools
+
+
+def get_arguments():
+    parser = argparse.ArgumentParser(description="transform mask annotation to coco annotation")
+    parser.add_argument("--dataset", type=str, default='CIHP', help="name of dataset (CIHP, MHPv2 or VIP)")
+    parser.add_argument("--json_save_dir", type=str, default='../data/msrcnn_finetune_annotations',
+                        help="path to save coco-style annotation json file")
+    parser.add_argument("--use_val", type=bool, default=False,
+                        help="use train+val set for finetuning or not")
+    parser.add_argument("--train_img_dir", type=str, default='../data/instance-level_human_parsing/Training/Images',
+                        help="train image path")
+    parser.add_argument("--train_anno_dir", type=str,
+                        default='../data/instance-level_human_parsing/Training/Human_ids',
+                        help="train human mask path")
+    parser.add_argument("--val_img_dir", type=str, default='../data/instance-level_human_parsing/Validation/Images',
+                        help="val image path")
+    parser.add_argument("--val_anno_dir", type=str,
+                        default='../data/instance-level_human_parsing/Validation/Human_ids',
+                        help="val human mask path")
+    return parser.parse_args()
+
+
+def main(args):
+    INFO = {
+        "description": args.split_name + " Dataset",
+        "url": "",
+        "version": "",
+        "year": 2019,
+        "contributor": "xyq",
+        "date_created": datetime.datetime.utcnow().isoformat(' ')
+    }
+
+    LICENSES = [
+        {
+            "id": 1,
+            "name": "",
+            "url": ""
+        }
+    ]
+
+    CATEGORIES = [
+        {
+            'id': 1,
+            'name': 'person',
+            'supercategory': 'person',
+        },
+    ]
+
+    coco_output = {
+        "info": INFO,
+        "licenses": LICENSES,
+        "categories": CATEGORIES,
+        "images": [],
+        "annotations": []
+    }
+
+    image_id = 1
+    segmentation_id = 1
+
+    for image_name in os.listdir(args.train_img_dir):
+        image = Image.open(os.path.join(args.train_img_dir, image_name))
+        image_info = pycococreatortools.create_image_info(
+            image_id, image_name, image.size
+        )
+        coco_output["images"].append(image_info)
+
+        human_mask_name = os.path.splitext(image_name)[0] + '.png'
+        human_mask = np.asarray(Image.open(os.path.join(args.train_anno_dir, human_mask_name)))
+        human_gt_labels = np.unique(human_mask)
+
+        for i in range(1, len(human_gt_labels)):
+            category_info = {'id': 1, 'is_crowd': 0}
+            binary_mask = np.uint8(human_mask == i)
+            annotation_info = pycococreatortools.create_annotation_info(
+                segmentation_id, image_id, category_info, binary_mask,
+                image.size, tolerance=10
+            )
+            if annotation_info is not None:
+                coco_output["annotations"].append(annotation_info)
+
+            segmentation_id += 1
+        image_id += 1
+
+    if not os.path.exists(args.json_save_dir):
+        os.makedirs(args.json_save_dir)
+    if not args.use_val:
+        with open('{}/{}_train.json'.format(args.json_save_dir, args.split_name), 'w') as output_json_file:
+            json.dump(coco_output, output_json_file)
+    else:
+        for image_name in os.listdir(args.val_img_dir):
+            image = Image.open(os.path.join(args.val_img_dir, image_name))
+            image_info = pycococreatortools.create_image_info(
+                image_id, image_name, image.size
+            )
+            coco_output["images"].append(image_info)
+
+            human_mask_name = os.path.splitext(image_name)[0] + '.png'
+            human_mask = np.asarray(Image.open(os.path.join(args.val_anno_dir, human_mask_name)))
+            human_gt_labels = np.unique(human_mask)
+
+            for i in range(1, len(human_gt_labels)):
+                category_info = {'id': 1, 'is_crowd': 0}
+                binary_mask = np.uint8(human_mask == i)
+                annotation_info = pycococreatortools.create_annotation_info(
+                    segmentation_id, image_id, category_info, binary_mask,
+                    image.size, tolerance=10
+                )
+                if annotation_info is not None:
+                    coco_output["annotations"].append(annotation_info)
+
+                segmentation_id += 1
+            image_id += 1
+
+        with open('{}/{}_trainval.json'.format(args.json_save_dir, args.split_name), 'w') as output_json_file:
+            json.dump(coco_output, output_json_file)
+
+    coco_output_val = {
+        "info": INFO,
+        "licenses": LICENSES,
+        "categories": CATEGORIES,
+        "images": [],
+        "annotations": []
+    }
+
+    image_id_val = 1
+    segmentation_id_val = 1
+
+    for image_name in os.listdir(args.val_img_dir):
+        image = Image.open(os.path.join(args.val_img_dir, image_name))
+        image_info = pycococreatortools.create_image_info(
+            image_id_val, image_name, image.size
+        )
+        coco_output_val["images"].append(image_info)
+
+        human_mask_name = os.path.splitext(image_name)[0] + '.png'
+        human_mask = np.asarray(Image.open(os.path.join(args.val_anno_dir, human_mask_name)))
+        human_gt_labels = np.unique(human_mask)
+
+        for i in range(1, len(human_gt_labels)):
+            category_info = {'id': 1, 'is_crowd': 0}
+            binary_mask = np.uint8(human_mask == i)
+            annotation_info = pycococreatortools.create_annotation_info(
+                segmentation_id_val, image_id_val, category_info, binary_mask,
+                image.size, tolerance=10
+            )
+            if annotation_info is not None:
+                coco_output_val["annotations"].append(annotation_info)
+
+            segmentation_id_val += 1
+        image_id_val += 1
+
+    with open('{}/{}_val.json'.format(args.json_save_dir, args.split_name), 'w') as output_json_file_val:
+        json.dump(coco_output_val, output_json_file_val)
+
+
+if __name__ == "__main__":
+    args = get_arguments()
+    main(args)
diff --git a/preprocess/humanparsing/mhp_extension/coco_style_annotation_creator/pycococreatortools.py b/preprocess/humanparsing/mhp_extension/coco_style_annotation_creator/pycococreatortools.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f3d8332ceda5fa4409095a0ec56d181ea162273
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/coco_style_annotation_creator/pycococreatortools.py
@@ -0,0 +1,114 @@
+import re
+import datetime
+import numpy as np
+from itertools import groupby
+from skimage import measure
+from PIL import Image
+from pycocotools import mask
+
+convert = lambda text: int(text) if text.isdigit() else text.lower()
+natrual_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
+
+
+def resize_binary_mask(array, new_size):
+    image = Image.fromarray(array.astype(np.uint8) * 255)
+    image = image.resize(new_size)
+    return np.asarray(image).astype(np.bool_)
+
+
+def close_contour(contour):
+    if not np.array_equal(contour[0], contour[-1]):
+        contour = np.vstack((contour, contour[0]))
+    return contour
+
+
+def binary_mask_to_rle(binary_mask):
+    rle = {'counts': [], 'size': list(binary_mask.shape)}
+    counts = rle.get('counts')
+    for i, (value, elements) in enumerate(groupby(binary_mask.ravel(order='F'))):
+        if i == 0 and value == 1:
+            counts.append(0)
+        counts.append(len(list(elements)))
+
+    return rle
+
+
+def binary_mask_to_polygon(binary_mask, tolerance=0):
+    """Converts a binary mask to COCO polygon representation
+    Args:
+        binary_mask: a 2D binary numpy array where '1's represent the object
+        tolerance: Maximum distance from original points of polygon to approximated
+            polygonal chain. If tolerance is 0, the original coordinate array is returned.
+    """
+    polygons = []
+    # pad mask to close contours of shapes which start and end at an edge
+    padded_binary_mask = np.pad(binary_mask, pad_width=1, mode='constant', constant_values=0)
+    contours = measure.find_contours(padded_binary_mask, 0.5)
+    contours = np.subtract(contours, 1)
+    for contour in contours:
+        contour = close_contour(contour)
+        contour = measure.approximate_polygon(contour, tolerance)
+        if len(contour) < 3:
+            continue
+        contour = np.flip(contour, axis=1)
+        segmentation = contour.ravel().tolist()
+        # after padding and subtracting 1 we may get -0.5 points in our segmentation 
+        segmentation = [0 if i < 0 else i for i in segmentation]
+        polygons.append(segmentation)
+
+    return polygons
+
+
+def create_image_info(image_id, file_name, image_size,
+                      date_captured=datetime.datetime.utcnow().isoformat(' '),
+                      license_id=1, coco_url="", flickr_url=""):
+    image_info = {
+        "id": image_id,
+        "file_name": file_name,
+        "width": image_size[0],
+        "height": image_size[1],
+        "date_captured": date_captured,
+        "license": license_id,
+        "coco_url": coco_url,
+        "flickr_url": flickr_url
+    }
+
+    return image_info
+
+
+def create_annotation_info(annotation_id, image_id, category_info, binary_mask,
+                           image_size=None, tolerance=2, bounding_box=None):
+    if image_size is not None:
+        binary_mask = resize_binary_mask(binary_mask, image_size)
+
+    binary_mask_encoded = mask.encode(np.asfortranarray(binary_mask.astype(np.uint8)))
+
+    area = mask.area(binary_mask_encoded)
+    if area < 1:
+        return None
+
+    if bounding_box is None:
+        bounding_box = mask.toBbox(binary_mask_encoded)
+
+    if category_info["is_crowd"]:
+        is_crowd = 1
+        segmentation = binary_mask_to_rle(binary_mask)
+    else:
+        is_crowd = 0
+        segmentation = binary_mask_to_polygon(binary_mask, tolerance)
+        if not segmentation:
+            return None
+
+    annotation_info = {
+        "id": annotation_id,
+        "image_id": image_id,
+        "category_id": category_info["id"],
+        "iscrowd": is_crowd,
+        "area": area.tolist(),
+        "bbox": bounding_box.tolist(),
+        "segmentation": segmentation,
+        "width": binary_mask.shape[1],
+        "height": binary_mask.shape[0],
+    }
+
+    return annotation_info
diff --git a/preprocess/humanparsing/mhp_extension/coco_style_annotation_creator/test_human2coco_format.py b/preprocess/humanparsing/mhp_extension/coco_style_annotation_creator/test_human2coco_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..17339187305a97fa7ab198cf1d8127a76ebdf854
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/coco_style_annotation_creator/test_human2coco_format.py
@@ -0,0 +1,74 @@
+import argparse
+import datetime
+import json
+import os
+from PIL import Image
+
+import pycococreatortools
+
+
+def get_arguments():
+    parser = argparse.ArgumentParser(description="transform mask annotation to coco annotation")
+    parser.add_argument("--dataset", type=str, default='CIHP', help="name of dataset (CIHP, MHPv2 or VIP)")
+    parser.add_argument("--json_save_dir", type=str, default='../data/CIHP/annotations',
+                        help="path to save coco-style annotation json file")
+    parser.add_argument("--test_img_dir", type=str, default='../data/CIHP/Testing/Images',
+                        help="test image path")
+    return parser.parse_args()
+
+args = get_arguments()
+
+INFO = {
+    "description": args.dataset + "Dataset",
+    "url": "",
+    "version": "",
+    "year": 2020,
+    "contributor": "yunqiuxu",
+    "date_created": datetime.datetime.utcnow().isoformat(' ')
+}
+
+LICENSES = [
+    {
+        "id": 1,
+        "name": "",
+        "url": ""
+    }
+]
+
+CATEGORIES = [
+    {
+        'id': 1,
+        'name': 'person',
+        'supercategory': 'person',
+    },
+]
+
+
+def main(args):
+    coco_output = {
+        "info": INFO,
+        "licenses": LICENSES,
+        "categories": CATEGORIES,
+        "images": [],
+        "annotations": []
+    }
+
+    image_id = 1
+
+    for image_name in os.listdir(args.test_img_dir):
+        image = Image.open(os.path.join(args.test_img_dir, image_name))
+        image_info = pycococreatortools.create_image_info(
+            image_id, image_name, image.size
+        )
+        coco_output["images"].append(image_info)
+        image_id += 1
+
+    if not os.path.exists(os.path.join(args.json_save_dir)):
+        os.mkdir(os.path.join(args.json_save_dir))
+
+    with open('{}/{}.json'.format(args.json_save_dir, args.dataset), 'w') as output_json_file:
+        json.dump(coco_output, output_json_file)
+
+
+if __name__ == "__main__":
+    main(args)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.circleci/config.yml b/preprocess/humanparsing/mhp_extension/detectron2/.circleci/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6c605889cf4ac01d3ed63f62d65a0d6ae1f6edd0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.circleci/config.yml
@@ -0,0 +1,179 @@
+# Python CircleCI 2.0 configuration file
+#
+# Check https://circleci.com/docs/2.0/language-python/ for more details
+#
+version: 2
+
+# -------------------------------------------------------------------------------------
+# Environments to run the jobs in
+# -------------------------------------------------------------------------------------
+cpu: &cpu
+  docker:
+    - image: circleci/python:3.6.8-stretch
+  resource_class: medium
+
+gpu: &gpu
+  machine:
+    image: ubuntu-1604:201903-01
+    docker_layer_caching: true
+  resource_class: gpu.small
+
+# -------------------------------------------------------------------------------------
+# Re-usable commands
+# -------------------------------------------------------------------------------------
+install_python: &install_python
+  - run:
+      name: Install Python
+      working_directory: ~/
+      command: |
+        pyenv install 3.6.1
+        pyenv global 3.6.1
+
+setup_venv: &setup_venv
+  - run:
+      name: Setup Virtual Env
+      working_directory: ~/
+      command: |
+        python -m venv ~/venv
+        echo ". ~/venv/bin/activate" >> $BASH_ENV
+        . ~/venv/bin/activate
+        python --version
+        which python
+        which pip
+        pip install --upgrade pip
+
+install_dep: &install_dep
+  - run:
+      name: Install Dependencies
+      command: |
+        pip install --progress-bar off -U 'git+https://github.com/facebookresearch/fvcore'
+        pip install --progress-bar off cython opencv-python
+        pip install --progress-bar off 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
+        pip install --progress-bar off torch torchvision
+
+install_detectron2: &install_detectron2
+  - run:
+      name: Install Detectron2
+      command: |
+        gcc --version
+        pip install -U --progress-bar off -e .[dev]
+        python -m detectron2.utils.collect_env
+
+install_nvidia_driver: &install_nvidia_driver
+  - run:
+      name: Install nvidia driver
+      working_directory: ~/
+      command: |
+        wget -q 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-430.40.run'
+        sudo /bin/bash ./NVIDIA-Linux-x86_64-430.40.run -s --no-drm
+        nvidia-smi
+
+run_unittests: &run_unittests
+  - run:
+      name: Run Unit Tests
+      command: |
+        python -m unittest discover -v -s tests
+
+# -------------------------------------------------------------------------------------
+# Jobs to run
+# -------------------------------------------------------------------------------------
+jobs:
+  cpu_tests:
+    <<: *cpu
+
+    working_directory: ~/detectron2
+
+    steps:
+      - checkout
+      - <<: *setup_venv
+
+      # Cache the venv directory that contains dependencies
+      - restore_cache:
+          keys:
+            - cache-key-{{ .Branch }}-ID-20200425
+
+      - <<: *install_dep
+
+      - save_cache:
+          paths:
+            - ~/venv
+          key: cache-key-{{ .Branch }}-ID-20200425
+
+      - <<: *install_detectron2
+
+      - run:
+          name: isort
+          command: |
+            isort -c -sp .
+      - run:
+          name: black
+          command: |
+            black --check -l 100 .
+      - run:
+          name: flake8
+          command: |
+            flake8 .
+
+      - <<: *run_unittests
+
+  gpu_tests:
+    <<: *gpu
+
+    working_directory: ~/detectron2
+
+    steps:
+      - checkout
+      - <<: *install_nvidia_driver
+
+      - run:
+          name: Install nvidia-docker
+          working_directory: ~/
+          command: |
+            curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+            distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+            curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \
+            sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+            sudo apt-get update && sudo apt-get install -y nvidia-docker2
+            # reload the docker daemon configuration
+            sudo pkill -SIGHUP dockerd
+
+      - run:
+          name: Launch docker
+          working_directory: ~/detectron2/docker
+          command: |
+            nvidia-docker build -t detectron2:v0 -f Dockerfile-circleci .
+            nvidia-docker run -itd --name d2 detectron2:v0
+            docker exec -it d2 nvidia-smi
+
+      - run:
+          name: Build Detectron2
+          command: |
+            docker exec -it d2 pip install 'git+https://github.com/facebookresearch/fvcore'
+            docker cp ~/detectron2 d2:/detectron2
+            # This will build d2 for the target GPU arch only
+            docker exec -it d2 pip install -e /detectron2
+            docker exec -it d2 python3 -m detectron2.utils.collect_env
+            docker exec -it d2 python3 -c 'import torch; assert(torch.cuda.is_available())'
+
+      - run:
+          name: Run Unit Tests
+          command: |
+            docker exec -e CIRCLECI=true -it d2 python3 -m unittest discover -v -s /detectron2/tests
+
+workflows:
+  version: 2
+  regular_test:
+    jobs:
+      - cpu_tests
+      - gpu_tests
+
+  #nightly_test:
+    #jobs:
+      #- gpu_tests
+    #triggers:
+      #- schedule:
+          #cron: "0 0 * * *"
+          #filters:
+            #branches:
+              #only:
+                #- master
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.clang-format b/preprocess/humanparsing/mhp_extension/detectron2/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..a757d4fff0c2f065d7d51719b52aef35ec48d04e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.clang-format
@@ -0,0 +1,85 @@
+AccessModifierOffset: -1
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ForEachMacros:   [ FOR_EACH, FOR_EACH_ENUMERATE, FOR_EACH_KV, FOR_EACH_R, FOR_EACH_RANGE, ]
+IncludeCategories:
+  - Regex:           '^<.*\.h(pp)?>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IndentCaseLabels: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.flake8 b/preprocess/humanparsing/mhp_extension/detectron2/.flake8
new file mode 100644
index 0000000000000000000000000000000000000000..0cc61b77a7e7005b3499394c36288dc8f3bcad39
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.flake8
@@ -0,0 +1,9 @@
+# This is an example .flake8 config, used when developing *Black* itself.
+# Keep in sync with setup.cfg which is used for source packages.
+
+[flake8]
+ignore = W503, E203, E221, C901, C408, E741
+max-line-length = 100
+max-complexity = 18
+select = B,C,E,F,W,T4,B9
+exclude = build,__init__.py
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/CODE_OF_CONDUCT.md b/preprocess/humanparsing/mhp_extension/detectron2/.github/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..0f7ad8bfc173eac554f0b6ef7c684861e8014bbe
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/CODE_OF_CONDUCT.md
@@ -0,0 +1,5 @@
+# Code of Conduct
+
+Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
+Please read the [full text](https://code.fb.com/codeofconduct/)
+so that you can understand what actions will and will not be tolerated.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/CONTRIBUTING.md b/preprocess/humanparsing/mhp_extension/detectron2/.github/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..81936dfedb495dd5cd21da2bfcf9819b97ed1dff
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/CONTRIBUTING.md
@@ -0,0 +1,49 @@
+# Contributing to detectron2
+
+## Issues
+We use GitHub issues to track public bugs and questions.
+Please make sure to follow one of the
+[issue templates](https://github.com/facebookresearch/detectron2/issues/new/choose)
+when reporting any issues.
+
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## Pull Requests
+We actively welcome your pull requests.
+
+However, if you're adding any significant features (e.g. > 50 lines), please
+make sure to have a corresponding issue to discuss your motivation and proposals,
+before sending a PR. We do not always accept new features, and we take the following
+factors into consideration:
+
+1. Whether the same feature can be achieved without modifying detectron2.
+Detectron2 is designed so that you can implement many extensions from the outside, e.g.
+those in [projects](https://github.com/facebookresearch/detectron2/tree/master/projects).
+If some part is not as extensible, you can also bring up the issue to make it more extensible.
+2. Whether the feature is potentially useful to a large audience, or only to a small portion of users.
+3. Whether the proposed solution has a good design / interface.
+4. Whether the proposed solution adds extra mental/practical overhead to users who don't
+   need such feature.
+5. Whether the proposed solution breaks existing APIs.
+
+When sending a PR, please do:
+
+1. If a PR contains multiple orthogonal changes, split it to several PRs.
+2. If you've added code that should be tested, add tests.
+3. For PRs that need experiments (e.g. adding a new model or new methods),
+	 you don't need to update model zoo, but do provide experiment results in the description of the PR.
+4. If APIs are changed, update the documentation.
+5. Make sure your code lints with `./dev/linter.sh`.
+
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## License
+By contributing to detectron2, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/Detectron2-Logo-Horz.svg b/preprocess/humanparsing/mhp_extension/detectron2/.github/Detectron2-Logo-Horz.svg
new file mode 100644
index 0000000000000000000000000000000000000000..eb2d643ddd940cd8bdb5eaad093029969ff2364c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/Detectron2-Logo-Horz.svg
@@ -0,0 +1 @@
+<svg id="Layer_1" data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1930.09 354.96"><defs><style>.cls-1{fill:#aab4bc;}.cls-2{fill:#d2d6d7;}.cls-3{fill:#9da2ab;}.cls-4{fill:#e7eef1;}.cls-5{fill:#5173f1;}.cls-6{opacity:0.7;}.cls-7{fill:#797f89;}.cls-8{fill:#e3e7e9;}.cls-9{fill:#161622;}.cls-10{fill:#3f4652;}.cls-11{fill:#fff;}</style></defs><title>Detectron2-Logo-Horz</title><path class="cls-1" d="M191.24,31h71.34a4.87,4.87,0,0,1,4.87,4.87v5a0,0,0,0,1,0,0H186.38a0,0,0,0,1,0,0v-5A4.87,4.87,0,0,1,191.24,31Z"/><path class="cls-2" d="M412.92,100.67V263.61c0,.69,0,1.33,0,2a59.73,59.73,0,0,1-59.73,57.74H100.73A59.8,59.8,0,0,1,40.9,263.61V100.67c0-.69,0-1.33,0-2a59.33,59.33,0,0,1,8.79-29.21c.76-1.24,1.57-2.46,2.42-3.64A59.76,59.76,0,0,1,100.73,40.9H353.15a59.78,59.78,0,0,1,59.77,59.77Z"/><rect class="cls-3" x="198.81" y="262.89" width="55.95" height="41.28" rx="10.15"/><path class="cls-4" d="M244.61,260.72H209A12.33,12.33,0,0,0,196.64,273v21A12.33,12.33,0,0,0,209,306.33h35.65A12.32,12.32,0,0,0,256.92,294V273A12.32,12.32,0,0,0,244.61,260.72ZM209,265.05h35.65a8,8,0,0,1,8,8v1.45H201V273A8,8,0,0,1,209,265.05Zm43.63,13.76v9.43H201v-9.43Zm-8,23.19H209a8,8,0,0,1-8-8v-1.44h51.61V294A8,8,0,0,1,244.61,302Z"/><path class="cls-1" d="M382.21,177.18h71.34a4.87,4.87,0,0,1,4.87,4.87v5a0,0,0,0,1,0,0H377.35a0,0,0,0,1,0,0v-5A4.87,4.87,0,0,1,382.21,177.18Z" transform="translate(600.02 -235.74) rotate(90)"/><path class="cls-1" d="M.28,177.18H71.62a4.87,4.87,0,0,1,4.87,4.87v5a0,0,0,0,1,0,0H-4.59a0,0,0,0,1,0,0v-5A4.87,4.87,0,0,1,.28,177.18Z" transform="translate(-146.19 218.09) rotate(-90)"/><circle class="cls-1" cx="83.04" cy="283.53" r="6.28"/><circle class="cls-1" cx="370.79" cy="283.53" r="6.28"/><circle class="cls-1" cx="226.91" cy="66.06" r="6.28"/><circle class="cls-5" cx="368.44" cy="82.89" r="20.49"/><polygon class="cls-1" points="412.92 179.98 316.61 179.98 312.27 179.98 141.55 179.98 137.21 179.98 40.9 179.98 40.9 184.3 137.21 184.3 137.21 323.38 141.55 323.38 141.55 184.3 312.27 184.3 312.27 323.38 316.61 323.38 316.61 184.3 412.92 184.3 412.92 179.98"/><g class="cls-6"><path class="cls-7" d="M403.72,193a81.13,81.13,0,1,1-81.15-81.1A81.12,81.12,0,0,1,403.72,193Z"/></g><path class="cls-8" d="M313.71,104.06a76.74,76.74,0,1,0,76.74,76.74A76.75,76.75,0,0,0,313.71,104.06Zm0,132.48a55.74,55.74,0,1,1,55.73-55.74A55.8,55.8,0,0,1,313.71,236.54Z"/><path class="cls-9" d="M376.27,180.79a62.57,62.57,0,1,1-125.13,0,61,61,0,0,1,1.93-15.33,62.55,62.55,0,0,1,123.2,15.33Z"/><path class="cls-3" d="M313.71,121.19a59.6,59.6,0,1,1-59.6,59.6A57.93,57.93,0,0,1,256,166.18a59.72,59.72,0,0,1,57.76-45m0-3.65a63.36,63.36,0,0,0-61.3,47.75,61.81,61.81,0,0,0-1.95,15.5,63.25,63.25,0,1,0,63.25-63.25Z"/><g class="cls-6"><path class="cls-7" d="M228.66,193a81.12,81.12,0,1,1-81.14-81.1A81.11,81.11,0,0,1,228.66,193Z"/></g><path class="cls-8" d="M138.65,104.06A76.74,76.74,0,1,0,215.4,180.8,76.74,76.74,0,0,0,138.65,104.06Zm0,132.48a55.74,55.74,0,1,1,55.74-55.74A55.8,55.8,0,0,1,138.65,236.54Z"/><path class="cls-9" d="M201.22,180.79a62.57,62.57,0,1,1-125.13,0A61,61,0,0,1,78,165.46a62.55,62.55,0,0,1,123.2,15.33Z"/><path class="cls-3" d="M138.65,121.19a59.6,59.6,0,1,1-59.6,59.6,58.38,58.38,0,0,1,1.84-14.61,59.72,59.72,0,0,1,57.76-45m0-3.65a63.39,63.39,0,0,0-61.3,47.75,62.28,62.28,0,0,0-1.94,15.5,63.25,63.25,0,1,0,63.24-63.25Z"/><circle class="cls-10" cx="313.71" cy="180.79" r="29"/><circle class="cls-10" cx="138.65" cy="180.79" r="29"/><circle class="cls-11" cx="154.83" cy="156.49" r="12.7"/><circle class="cls-11" cx="329.89" cy="156.49" r="12.7"/><path class="cls-1" d="M312.27,40.91V81.77a100.32,100.32,0,0,0-72.71,33.61H214.3A100.51,100.51,0,0,0,142,81.82V40.9h-4.33V81.77A99.56,99.56,0,0,0,86.17,97.06l-34-31.27c-.85,1.18-1.66,2.4-2.42,3.64l36,33.1,0,0a95.88,95.88,0,0,1,126,16.46l.65.74h29.18l.65-.74a96,96,0,0,1,72.27-32.89h2.17V40.91Z"/><path class="cls-5" d="M1899.11,280.92H1758.56V251.65l81.53-77.75q19.44-18.35,19.44-39.32,0-14.55-9-23.29t-24.15-8.74q-16.59,0-25,9.6t-8.44,25.32l.87,9.6h-35.21a77.72,77.72,0,0,1-.58-10.19q0-30,18.77-48.45T1826.36,70q32,0,50.48,17.75t18.48,46q0,20.08-8,35.49t-27.22,32.29l-52.95,46.87h92Z"/><path class="cls-10" d="M557.9,280.92H487.77V74.32H557.9q52.38,0,81.62,28.37t29.24,74.93q0,46.56-29.24,74.93T557.9,280.92Zm54.85-51.36q18.76-18.76,18.77-51.94t-18.77-51.94q-18.76-18.77-56-18.77H523V248.33h33.76Q594,248.33,612.75,229.56Z"/><path class="cls-10" d="M826.87,215.45H711.93q2,18,13.1,28.66t29.1,10.62a40.72,40.72,0,0,0,21.53-5.82,32.61,32.61,0,0,0,13.68-15.71h34.91a70.46,70.46,0,0,1-26,37.1q-19.07,14.11-45,14.11-33.75,0-54.56-22.41t-20.8-56.74q0-33.45,21-56.15T753,126.41q33.18,0,53.69,22.26t20.51,56ZM753,154.63q-16.29,0-27.06,9.61t-13.38,25.6h80.31q-2.34-16-12.81-25.6T753,154.63Z"/><path class="cls-10" d="M915.19,250.08v30q-6.41,1.74-18,1.74-44.24,0-44.23-44.52V157.54h-23V129.9h23V90.62h34.63V129.9h28.22v27.64H887.55v76.24q0,17.76,16.88,17.75Z"/><path class="cls-10" d="M1075.48,215.45H960.54q2,18,13.09,28.66t29.1,10.62a40.72,40.72,0,0,0,21.53-5.82,32.55,32.55,0,0,0,13.68-15.71h34.92a70.48,70.48,0,0,1-26,37.1q-19.05,14.11-44.95,14.11-33.76,0-54.56-22.41t-20.8-56.74q0-33.45,20.94-56.15t54.13-22.7q33.16,0,53.68,22.26t20.52,56Zm-73.91-60.82q-16.3,0-27.06,9.61t-13.39,25.6h80.31q-2.33-16-12.8-25.6T1001.57,154.63Z"/><path class="cls-10" d="M1086.1,205.56q0-33.47,21.24-56.31t54.13-22.84q31.13,0,49.61,17.6t22,40.59h-35.5a36,36,0,0,0-13-20.08q-9.75-7.56-23.42-7.56-18.33,0-29.39,13.53t-11.06,35.07q0,21.52,11.06,34.91t29.39,13.39q13.66,0,23.42-7.57a35.88,35.88,0,0,0,13-20.08h35.5q-3.49,23-22,40.6t-49.61,17.6q-32.9,0-54.13-22.84T1086.1,205.56Z"/><path class="cls-10" d="M1322.58,250.08v30q-6.4,1.74-18,1.74-44.22,0-44.23-44.52V157.54h-23V129.9h23V90.62h34.63V129.9h28.23v27.64h-28.23v76.24q0,17.76,16.88,17.75Z"/><path class="cls-10" d="M1428.44,128.74V161a55.4,55.4,0,0,0-7.85-.59q-39,0-39,41.91v78.56H1347v-151h32v20.95q12.8-22.41,44.51-22.41Z"/><path class="cls-10" d="M1507.79,284.41q-34.92,0-56.6-23t-21.67-55.86q0-32.9,21.67-56t56.6-23.13q35.2,0,56.89,23.13t21.68,56q0,32.88-21.68,55.86T1507.79,284.41Zm-43.65-78.85q0,21.52,12.37,34.91t31.28,13.39q19.2,0,31.57-13.39t12.37-34.91q0-21.84-12.37-35.21T1507.79,157q-18.91,0-31.28,13.39T1464.14,205.56Z"/><path class="cls-10" d="M1631.22,129.9V150q5.25-9.9,17.32-16.74t29.24-6.83q26.78,0,41.47,16.29t14.69,43.36v94.86h-34.63v-90.5q0-16-7.42-25.17t-22.55-9.16q-16.58,0-26,9.89t-9.46,27.35v87.59h-34.62v-151Z"/></svg>
\ No newline at end of file
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE.md b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e8aaa2d3722e7e73a3d94b2b7dfc4f751d7a240
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,5 @@
+
+Please select an issue template from
+https://github.com/facebookresearch/detectron2/issues/new/choose .
+
+Otherwise your issue will be closed.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/bugs.md b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/bugs.md
new file mode 100644
index 0000000000000000000000000000000000000000..52d299886a457480d27c54a27734a704786a1d28
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/bugs.md
@@ -0,0 +1,36 @@
+---
+name: "🐛 Bugs"
+about: Report bugs in detectron2
+title: Please read & provide the following
+
+---
+
+## Instructions To Reproduce the 🐛 Bug:
+
+1. what changes you made (`git diff`) or what code you wrote
+```
+<put diff or code here>
+```
+2. what exact command you run:
+3. what you observed (including __full logs__):
+```
+<put logs here>
+```
+4. please simplify the steps as much as possible so they do not require additional resources to
+	 run, such as a private dataset.
+
+## Expected behavior:
+
+If there are no obvious error in "what you observed" provided above,
+please tell us the expected behavior.
+
+## Environment:
+
+Provide your environment information using the following command:
+```
+wget -nc -q https://github.com/facebookresearch/detectron2/raw/master/detectron2/utils/collect_env.py && python collect_env.py
+```
+
+If your issue looks like an installation issue / environment issue,
+please first try to solve it yourself with the instructions in
+https://detectron2.readthedocs.io/tutorials/install.html#common-installation-issues
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/config.yml b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c19e2490a71893c516b2bd54b887399493fadcd4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,9 @@
+# require an issue template to be chosen
+blank_issues_enabled: false
+
+# Unexpected behaviors & bugs are split to two templates.
+# When they are one template, users think "it's not a bug" and don't choose the template.
+#
+# But the file name is still "unexpected-problems-bugs.md" so that old references
+# to this issue template still works.
+# It's ok since this template should be a superset of "bugs.md" (unexpected behaviors is a superset of bugs)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/feature-request.md b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/feature-request.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd69a33478c85068cdd7b8b90161f97cc55c1621
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/feature-request.md
@@ -0,0 +1,31 @@
+---
+name: "\U0001F680Feature Request"
+about: Submit a proposal/request for a new detectron2 feature
+
+---
+
+## 🚀 Feature
+A clear and concise description of the feature proposal.
+
+
+## Motivation & Examples
+
+Tell us why the feature is useful.
+
+Describe what the feature would look like, if it is implemented.
+Best demonstrated using **code examples** in addition to words.
+
+## Note
+
+We only consider adding new features if they are relevant to many users.
+
+If you request implementation of research papers --
+we only consider papers that have enough significance and prevalance in the object detection field.
+
+We do not take requests for most projects in the `projects/` directory,
+because they are research code release that is mainly for other researchers to reproduce results.
+
+Instead of adding features inside detectron2,
+you can implement many features by [extending detectron2](https://detectron2.readthedocs.io/tutorials/extend.html).
+The [projects/](https://github.com/facebookresearch/detectron2/tree/master/projects/) directory contains many of such examples.
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/questions-help-support.md b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/questions-help-support.md
new file mode 100644
index 0000000000000000000000000000000000000000..081156136b709b1e0ec4d27404b9cb8fa9ba1d27
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/questions-help-support.md
@@ -0,0 +1,26 @@
+---
+name: "❓How to do something?"
+about: How to do something using detectron2? What does an API do?
+
+---
+
+## ❓ How to do something using detectron2
+
+Describe what you want to do, including:
+1. what inputs you will provide, if any:
+2. what outputs you are expecting:
+
+## ❓ What does an API do and how to use it?
+Please link to which API or documentation you're asking about from
+https://detectron2.readthedocs.io/
+
+
+NOTE:
+
+1. Only general answers are provided.
+   If you want to ask about "why X did not work", please use the
+   [Unexpected behaviors](https://github.com/facebookresearch/detectron2/issues/new/choose) issue template.
+
+2. About how to implement new models / new dataloader / new training logic, etc., check documentation first.
+
+3. We do not answer general machine learning / computer vision questions that are not specific to detectron2, such as how a model works, how to improve your training/make it converge, or what algorithm/methods can be used to achieve X.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/unexpected-problems-bugs.md b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/unexpected-problems-bugs.md
new file mode 100644
index 0000000000000000000000000000000000000000..bafee7a1a3897903d26e68001d3d3d2b7686015b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/ISSUE_TEMPLATE/unexpected-problems-bugs.md
@@ -0,0 +1,45 @@
+---
+name: "Unexpected behaviors"
+about: Run into unexpected behaviors when using detectron2
+title: Please read & provide the following
+
+---
+
+If you do not know the root cause of the problem, and wish someone to help you, please
+post according to this template:
+
+## Instructions To Reproduce the Issue:
+
+1. what changes you made (`git diff`) or what code you wrote
+```
+<put diff or code here>
+```
+2. what exact command you run:
+3. what you observed (including __full logs__):
+```
+<put logs here>
+```
+4. please simplify the steps as much as possible so they do not require additional resources to
+	 run, such as a private dataset.
+
+## Expected behavior:
+
+If there are no obvious error in "what you observed" provided above,
+please tell us the expected behavior.
+
+If you expect the model to converge / work better, note that we do not give suggestions
+on how to train a new model.
+Only in one of the two conditions we will help with it:
+(1) You're unable to reproduce the results in detectron2 model zoo.
+(2) It indicates a detectron2 bug.
+
+## Environment:
+
+Provide your environment information using the following command:
+```
+wget -nc -q https://github.com/facebookresearch/detectron2/raw/master/detectron2/utils/collect_env.py && python collect_env.py
+```
+
+If your issue looks like an installation issue / environment issue,
+please first try to solve it yourself with the instructions in
+https://detectron2.readthedocs.io/tutorials/install.html#common-installation-issues
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.github/pull_request_template.md b/preprocess/humanparsing/mhp_extension/detectron2/.github/pull_request_template.md
new file mode 100644
index 0000000000000000000000000000000000000000..4ff5ea51776ff27b3e794e366a92a455e2f06a01
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.github/pull_request_template.md
@@ -0,0 +1,9 @@
+Thanks for your contribution!
+
+If you're sending a large PR (e.g., >50 lines),
+please open an issue first about the feature / bug, and indicate how you want to contribute.
+
+Before submitting a PR, please run `dev/linter.sh` to lint the code.
+
+See https://detectron2.readthedocs.io/notes/contributing.html#pull-requests
+about how we handle PRs.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/.gitignore b/preprocess/humanparsing/mhp_extension/detectron2/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e85df4cf713e2c4a6fc02885f2b2ff3d0f104763
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/.gitignore
@@ -0,0 +1,46 @@
+# output dir
+output
+instant_test_output
+inference_test_output
+
+
+*.jpg
+*.png
+*.txt
+*.json
+*.diff
+
+# compilation and distribution
+__pycache__
+_ext
+*.pyc
+*.so
+detectron2.egg-info/
+build/
+dist/
+wheels/
+
+# pytorch/python/numpy formats
+*.pth
+*.pkl
+*.npy
+
+# ipython/jupyter notebooks
+*.ipynb
+**/.ipynb_checkpoints/
+
+# Editor temporaries
+*.swn
+*.swo
+*.swp
+*~
+
+# editor settings
+.idea
+.vscode
+
+# project dirs
+/detectron2/model_zoo/configs
+/datasets
+/projects/*/datasets
+/models
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/GETTING_STARTED.md b/preprocess/humanparsing/mhp_extension/detectron2/GETTING_STARTED.md
new file mode 100644
index 0000000000000000000000000000000000000000..acaf13f02c906b45ffc2f49ee5a0ce01d82b4786
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/GETTING_STARTED.md
@@ -0,0 +1,79 @@
+## Getting Started with Detectron2
+
+This document provides a brief intro of the usage of builtin command-line tools in detectron2.
+
+For a tutorial that involves actual coding with the API,
+see our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+which covers how to run inference with an
+existing model, and how to train a builtin model on a custom dataset.
+
+For more advanced tutorials, refer to our [documentation](https://detectron2.readthedocs.io/tutorials/extend.html).
+
+
+### Inference Demo with Pre-trained Models
+
+1. Pick a model and its config file from
+	[model zoo](MODEL_ZOO.md),
+	for example, `mask_rcnn_R_50_FPN_3x.yaml`.
+2. We provide `demo.py` that is able to run builtin standard models. Run it with:
+```
+cd demo/
+python demo.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
+  --input input1.jpg input2.jpg \
+  [--other-options]
+  --opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
+```
+The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation.
+This command will run the inference and show visualizations in an OpenCV window.
+
+For details of the command line arguments, see `demo.py -h` or look at its source code
+to understand its behavior. Some common arguments are:
+* To run __on your webcam__, replace `--input files` with `--webcam`.
+* To run __on a video__, replace `--input files` with `--video-input video.mp4`.
+* To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`.
+* To save outputs to a directory (for images) or a file (for webcam or video), use `--output`.
+
+
+### Training & Evaluation in Command Line
+
+We provide a script in "tools/{,plain_}train_net.py", that is made to train
+all the configs provided in detectron2.
+You may want to use it as a reference to write your own training script.
+
+To train a model with "train_net.py", first
+setup the corresponding datasets following
+[datasets/README.md](./datasets/README.md),
+then run:
+```
+cd tools/
+./train_net.py --num-gpus 8 \
+	--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
+```
+
+The configs are made for 8-GPU training.
+To train on 1 GPU, you may need to [change some parameters](https://arxiv.org/abs/1706.02677), e.g.:
+```
+./train_net.py \
+	--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
+	--num-gpus 1 SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
+```
+
+For most models, CPU training is not supported.
+
+To evaluate a model's performance, use
+```
+./train_net.py \
+	--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
+	--eval-only MODEL.WEIGHTS /path/to/checkpoint_file
+```
+For more options, see `./train_net.py -h`.
+
+### Use Detectron2 APIs in Your Code
+
+See our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+to learn how to use detectron2 APIs to:
+1. run inference with an existing model
+2. train a builtin model on a custom dataset
+
+See [detectron2/projects](https://github.com/facebookresearch/detectron2/tree/master/projects)
+for more ways to build your project on detectron2.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/INSTALL.md b/preprocess/humanparsing/mhp_extension/detectron2/INSTALL.md
new file mode 100644
index 0000000000000000000000000000000000000000..3985f8ae4f5ecde26b310b4ab01c49b922f742e9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/INSTALL.md
@@ -0,0 +1,184 @@
+## Installation
+
+Our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+has step-by-step instructions that install detectron2.
+The [Dockerfile](docker)
+also installs detectron2 with a few simple commands.
+
+### Requirements
+- Linux or macOS with Python ≥ 3.6
+- PyTorch ≥ 1.4
+- [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
+	You can install them together at [pytorch.org](https://pytorch.org) to make sure of this.
+- OpenCV, optional, needed by demo and visualization
+- pycocotools: `pip install cython; pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'`
+
+
+### Build Detectron2 from Source
+
+gcc & g++ ≥ 5 are required. [ninja](https://ninja-build.org/) is recommended for faster build.
+After having them, run:
+```
+python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
+# (add --user if you don't have permission)
+
+# Or, to install it from a local clone:
+git clone https://github.com/facebookresearch/detectron2.git
+python -m pip install -e detectron2
+
+# Or if you are on macOS
+# CC=clang CXX=clang++ python -m pip install -e .
+```
+
+To __rebuild__ detectron2 that's built from a local clone, use `rm -rf build/ **/*.so` to clean the
+old build first. You often need to rebuild detectron2 after reinstalling PyTorch.
+
+### Install Pre-Built Detectron2 (Linux only)
+```
+# for CUDA 10.1:
+python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/index.html
+```
+You can replace cu101 with "cu{100,92}" or "cpu".
+
+Note that:
+1. Such installation has to be used with certain version of official PyTorch release.
+   See [releases](https://github.com/facebookresearch/detectron2/releases) for requirements.
+   It will not work with a different version of PyTorch or a non-official build of PyTorch.
+2. Such installation is out-of-date w.r.t. master branch of detectron2. It may not be
+	 compatible with the master branch of a research project that uses detectron2 (e.g. those in
+	 [projects](projects) or [meshrcnn](https://github.com/facebookresearch/meshrcnn/)).
+
+### Common Installation Issues
+
+If you met issues using the pre-built detectron2, please uninstall it and try building it from source.
+
+Click each issue for its solutions:
+
+<details>
+<summary>
+Undefined torch/aten/caffe2 symbols, or segmentation fault immediately when running the library.
+</summary>
+<br/>
+
+This usually happens when detectron2 or torchvision is not
+compiled with the version of PyTorch you're running.
+
+Pre-built torchvision or detectron2 has to work with the corresponding official release of pytorch.
+If the error comes from a pre-built torchvision, uninstall torchvision and pytorch and reinstall them
+following [pytorch.org](http://pytorch.org). So the versions will match.
+
+If the error comes from a pre-built detectron2, check [release notes](https://github.com/facebookresearch/detectron2/releases)
+to see the corresponding pytorch version required for each pre-built detectron2.
+
+If the error comes from detectron2 or torchvision that you built manually from source,
+remove files you built (`build/`, `**/*.so`) and rebuild it so it can pick up the version of pytorch currently in your environment.
+
+If you cannot resolve this problem, please include the output of `gdb -ex "r" -ex "bt" -ex "quit" --args python -m detectron2.utils.collect_env`
+in your issue.
+</details>
+
+<details>
+<summary>
+Undefined C++ symbols (e.g. `GLIBCXX`) or C++ symbols not found.
+</summary>
+<br/>
+Usually it's because the library is compiled with a newer C++ compiler but run with an old C++ runtime.
+
+This often happens with old anaconda.
+Try `conda update libgcc`. Then rebuild detectron2.
+
+The fundamental solution is to run the code with proper C++ runtime.
+One way is to use `LD_PRELOAD=/path/to/libstdc++.so`.
+
+</details>
+
+<details>
+<summary>
+"Not compiled with GPU support" or "Detectron2 CUDA Compiler: not available".
+</summary>
+<br/>
+CUDA is not found when building detectron2.
+You should make sure
+
+```
+python -c 'import torch; from torch.utils.cpp_extension import CUDA_HOME; print(torch.cuda.is_available(), CUDA_HOME)'
+```
+
+print valid outputs at the time you build detectron2.
+
+Most models can run inference (but not training) without GPU support. To use CPUs, set `MODEL.DEVICE='cpu'` in the config.
+</details>
+
+<details>
+<summary>
+"invalid device function" or "no kernel image is available for execution".
+</summary>
+<br/>
+Two possibilities:
+
+* You build detectron2 with one version of CUDA but run it with a different version.
+
+  To check whether it is the case,
+  use `python -m detectron2.utils.collect_env` to find out inconsistent CUDA versions.
+	In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA"
+	to contain cuda libraries of the same version.
+
+	When they are inconsistent,
+	you need to either install a different build of PyTorch (or build by yourself)
+	to match your local CUDA installation, or install a different version of CUDA to match PyTorch.
+
+* Detectron2 or PyTorch/torchvision is not built for the correct GPU architecture (compute compatibility).
+
+	The GPU architecture for PyTorch/detectron2/torchvision is available in the "architecture flags" in
+	`python -m detectron2.utils.collect_env`.
+
+	The GPU architecture flags of detectron2/torchvision by default matches the GPU model detected
+	during compilation. This means the compiled code may not work on a different GPU model.
+	To overwrite the GPU architecture for detectron2/torchvision, use `TORCH_CUDA_ARCH_LIST` environment variable during compilation.
+
+	For example, `export TORCH_CUDA_ARCH_LIST=6.0,7.0` makes it compile for both P100s and V100s.
+	Visit [developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus) to find out
+	the correct compute compatibility number for your device.
+
+</details>
+
+<details>
+<summary>
+Undefined CUDA symbols; cannot open libcudart.so; other nvcc failures.
+</summary>
+<br/>
+The version of NVCC you use to build detectron2 or torchvision does
+not match the version of CUDA you are running with.
+This often happens when using anaconda's CUDA runtime.
+
+Use `python -m detectron2.utils.collect_env` to find out inconsistent CUDA versions.
+In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA"
+to contain cuda libraries of the same version.
+
+When they are inconsistent,
+you need to either install a different build of PyTorch (or build by yourself)
+to match your local CUDA installation, or install a different version of CUDA to match PyTorch.
+</details>
+
+
+<details>
+<summary>
+"ImportError: cannot import name '_C'".
+</summary>
+<br/>
+Please build and install detectron2 following the instructions above.
+
+If you are running code from detectron2's root directory, `cd` to a different one.
+Otherwise you may not import the code that you installed.
+</details>
+
+<details>
+<summary>
+ONNX conversion segfault after some "TraceWarning".
+</summary>
+<br/>
+The ONNX package is compiled with too old compiler.
+
+Please build and install ONNX from its source code using a compiler
+whose version is closer to what's used by PyTorch (available in `torch.__config__.show()`).
+</details>
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/LICENSE b/preprocess/humanparsing/mhp_extension/detectron2/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d4836895578c791dffd78d07d83a72a961e270a4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/LICENSE
@@ -0,0 +1,201 @@
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction,
+and distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by
+the copyright owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all
+other entities that control, are controlled by, or are under common
+control with that entity. For the purposes of this definition,
+"control" means (i) the power, direct or indirect, to cause the
+direction or management of such entity, whether by contract or
+otherwise, or (ii) ownership of fifty percent (50%) or more of the
+outstanding shares, or (iii) beneficial ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity
+exercising permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications,
+including but not limited to software source code, documentation
+source, and configuration files.
+
+"Object" form shall mean any form resulting from mechanical
+transformation or translation of a Source form, including but
+not limited to compiled object code, generated documentation,
+and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or
+Object form, made available under the License, as indicated by a
+copyright notice that is included in or attached to the work
+(an example is provided in the Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object
+form, that is based on (or derived from) the Work and for which the
+editorial revisions, annotations, elaborations, or other modifications
+represent, as a whole, an original work of authorship. For the purposes
+of this License, Derivative Works shall not include works that remain
+separable from, or merely link (or bind by name) to the interfaces of,
+the Work and Derivative Works thereof.
+
+"Contribution" shall mean any work of authorship, including
+the original version of the Work and any modifications or additions
+to that Work or Derivative Works thereof, that is intentionally
+submitted to Licensor for inclusion in the Work by the copyright owner
+or by an individual or Legal Entity authorized to submit on behalf of
+the copyright owner. For the purposes of this definition, "submitted"
+means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems,
+and issue tracking systems that are managed by, or on behalf of, the
+Licensor for the purpose of discussing and improving the Work, but
+excluding communication that is conspicuously marked or otherwise
+designated in writing by the copyright owner as "Not a Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity
+on behalf of whom a Contribution has been received by Licensor and
+subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the
+Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+(except as stated in this section) patent license to make, have made,
+use, offer to sell, sell, import, and otherwise transfer the Work,
+where such license applies only to those patent claims licensable
+by such Contributor that are necessarily infringed by their
+Contribution(s) alone or by combination of their Contribution(s)
+with the Work to which such Contribution(s) was submitted. If You
+institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work
+or a Contribution incorporated within the Work constitutes direct
+or contributory patent infringement, then any patent licenses
+granted to You under this License for that Work shall terminate
+as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+Work or Derivative Works thereof in any medium, with or without
+modifications, and in Source or Object form, provided that You
+meet the following conditions:
+
+(a) You must give any other recipients of the Work or
+Derivative Works a copy of this License; and
+
+(b) You must cause any modified files to carry prominent notices
+stating that You changed the files; and
+
+(c) You must retain, in the Source form of any Derivative Works
+that You distribute, all copyright, patent, trademark, and
+attribution notices from the Source form of the Work,
+excluding those notices that do not pertain to any part of
+the Derivative Works; and
+
+(d) If the Work includes a "NOTICE" text file as part of its
+distribution, then any Derivative Works that You distribute must
+include a readable copy of the attribution notices contained
+within such NOTICE file, excluding those notices that do not
+pertain to any part of the Derivative Works, in at least one
+of the following places: within a NOTICE text file distributed
+as part of the Derivative Works; within the Source form or
+documentation, if provided along with the Derivative Works; or,
+within a display generated by the Derivative Works, if and
+wherever such third-party notices normally appear. The contents
+of the NOTICE file are for informational purposes only and
+do not modify the License. You may add Your own attribution
+notices within Derivative Works that You distribute, alongside
+or as an addendum to the NOTICE text from the Work, provided
+that such additional attribution notices cannot be construed
+as modifying the License.
+
+You may add Your own copyright statement to Your modifications and
+may provide additional or different license terms and conditions
+for use, reproduction, or distribution of Your modifications, or
+for any such Derivative Works as a whole, provided Your use,
+reproduction, and distribution of the Work otherwise complies with
+the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+any Contribution intentionally submitted for inclusion in the Work
+by You to the Licensor shall be under the terms and conditions of
+this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify
+the terms of any separate license agreement you may have executed
+with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+names, trademarks, service marks, or product names of the Licensor,
+except as required for reasonable and customary use in describing the
+origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+agreed to in writing, Licensor provides the Work (and each
+Contributor provides its Contributions) on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+implied, including, without limitation, any warranties or conditions
+of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE. You are solely responsible for determining the
+appropriateness of using or redistributing the Work and assume any
+risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+whether in tort (including negligence), contract, or otherwise,
+unless required by applicable law (such as deliberate and grossly
+negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special,
+incidental, or consequential damages of any character arising as a
+result of this License or out of the use or inability to use the
+Work (including but not limited to damages for loss of goodwill,
+work stoppage, computer failure or malfunction, or any and all
+other commercial damages or losses), even if such Contributor
+has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+the Work or Derivative Works thereof, You may choose to offer,
+and charge a fee for, acceptance of support, warranty, indemnity,
+or other liability obligations and/or rights consistent with this
+License. However, in accepting such obligations, You may act only
+on Your own behalf and on Your sole responsibility, not on behalf
+of any other Contributor, and only if You agree to indemnify,
+defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason
+of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+To apply the Apache License to your work, attach the following
+boilerplate notice, with the fields enclosed by brackets "[]"
+replaced with your own identifying information. (Don't include
+the brackets!)  The text should be enclosed in the appropriate
+comment syntax for the file format. We also recommend that a
+file or class name and description of purpose be included on the
+same "printed page" as the copyright notice for easier
+identification within third-party archives.
+
+Copyright 2019 - present, Facebook, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/MODEL_ZOO.md b/preprocess/humanparsing/mhp_extension/detectron2/MODEL_ZOO.md
new file mode 100644
index 0000000000000000000000000000000000000000..07b81ffffa37d97b10f8d39f934b9f62bcb51cc1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/MODEL_ZOO.md
@@ -0,0 +1,903 @@
+# Detectron2 Model Zoo and Baselines
+
+## Introduction
+
+This file documents a large collection of baselines trained
+with detectron2 in Sep-Oct, 2019.
+All numbers were obtained on [Big Basin](https://engineering.fb.com/data-center-engineering/introducing-big-basin-our-next-generation-ai-hardware/)
+servers with 8 NVIDIA V100 GPUs & NVLink. The software in use were PyTorch 1.3, CUDA 9.2, cuDNN 7.4.2 or 7.6.3.
+You can access these models from code using [detectron2.model_zoo](https://detectron2.readthedocs.io/modules/model_zoo.html) APIs.
+
+In addition to these official baseline models, you can find more models in [projects/](projects/).
+
+#### How to Read the Tables
+* The "Name" column contains a link to the config file. Running `tools/train_net.py` with this config file
+	and 8 GPUs will reproduce the model.
+* Training speed is averaged across the entire training.
+	We keep updating the speed with latest version of detectron2/pytorch/etc.,
+	so they might be different from the `metrics` file.
+	Training speed for multi-machine jobs is not provided.
+* Inference speed is measured by `tools/train_net.py --eval-only`, or [inference_on_dataset()](https://detectron2.readthedocs.io/modules/evaluation.html#detectron2.evaluation.inference_on_dataset),
+  with batch size 1 in detectron2 directly.
+	Measuring it with your own code will likely introduce other overhead.
+  Actual deployment in production should in general be faster than the given inference
+  speed due to more optimizations.
+* The *model id* column is provided for ease of reference.
+  To check downloaded file integrity, any model on this page contains its md5 prefix in its file name.
+* Training curves and other statistics can be found in `metrics` for each model.
+
+#### Common Settings for COCO Models
+* All COCO models were trained on `train2017` and evaluated on `val2017`.
+* The default settings are __not directly comparable__ with Detectron's standard settings.
+  For example, our default training data augmentation uses scale jittering in addition to horizontal flipping.
+
+  To make fair comparisons with Detectron's settings, see
+  [Detectron1-Comparisons](configs/Detectron1-Comparisons/) for accuracy comparison,
+  and [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html)
+  for speed comparison.
+* For Faster/Mask R-CNN, we provide baselines based on __3 different backbone combinations__:
+  * __FPN__: Use a ResNet+FPN backbone with standard conv and FC heads for mask and box prediction,
+    respectively. It obtains the best
+    speed/accuracy tradeoff, but the other two are still useful for research.
+  * __C4__: Use a ResNet conv4 backbone with conv5 head. The original baseline in the Faster R-CNN paper.
+  * __DC5__ (Dilated-C5): Use a ResNet conv5 backbone with dilations in conv5, and standard conv and FC heads
+    for mask and box prediction, respectively.
+    This is used by the Deformable ConvNet paper.
+* Most models are trained with the 3x schedule (~37 COCO epochs).
+  Although 1x models are heavily under-trained, we provide some ResNet-50 models with the 1x (~12 COCO epochs)
+  training schedule for comparison when doing quick research iteration.
+
+#### ImageNet Pretrained Models
+
+We provide backbone models pretrained on ImageNet-1k dataset.
+These models have __different__ format from those provided in Detectron: we do not fuse BatchNorm into an affine layer.
+* [R-50.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-50.pkl): converted copy of [MSRA's original ResNet-50](https://github.com/KaimingHe/deep-residual-networks) model.
+* [R-101.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-101.pkl): converted copy of [MSRA's original ResNet-101](https://github.com/KaimingHe/deep-residual-networks) model.
+* [X-101-32x8d.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/FAIR/X-101-32x8d.pkl): ResNeXt-101-32x8d model trained with Caffe2 at FB.
+
+Pretrained models in Detectron's format can still be used. For example:
+* [X-152-32x8d-IN5k.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl):
+  ResNeXt-152-32x8d model trained on ImageNet-5k with Caffe2 at FB (see ResNeXt paper for details on ImageNet-5k).
+* [R-50-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47261647/R-50-GN.pkl):
+  ResNet-50 with Group Normalization.
+* [R-101-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47592356/R-101-GN.pkl):
+  ResNet-101 with Group Normalization.
+
+Torchvision's ResNet models can be used after converted by [this script](tools/convert-torchvision-to-d2.py).
+
+#### License
+
+All models available for download through this document are licensed under the
+[Creative Commons Attribution-ShareAlike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/).
+
+### COCO Object Detection Baselines
+
+#### Faster R-CNN:
+<!--
+(fb only) To update the table in vim:
+1. Remove the old table: d}
+2. Copy the below command to the place of the table
+3. :.!bash
+
+./gen_html_table.py --config 'COCO-Detection/faster*50*'{1x,3x}'*' 'COCO-Detection/faster*101*' --name R50-C4 R50-DC5 R50-FPN R50-C4 R50-DC5 R50-FPN R101-C4 R101-DC5 R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: faster_rcnn_R_50_C4_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml">R50-C4</a></td>
+<td align="center">1x</td>
+<td align="center">0.551</td>
+<td align="center">0.102</td>
+<td align="center">4.8</td>
+<td align="center">35.7</td>
+<td align="center">137257644</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_DC5_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml">R50-DC5</a></td>
+<td align="center">1x</td>
+<td align="center">0.380</td>
+<td align="center">0.068</td>
+<td align="center">5.0</td>
+<td align="center">37.3</td>
+<td align="center">137847829</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_1x/137847829/model_final_51d356.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_1x/137847829/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.210</td>
+<td align="center">0.038</td>
+<td align="center">3.0</td>
+<td align="center">37.9</td>
+<td align="center">137257794</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_1x/137257794/model_final_b275ba.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_1x/137257794/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_C4_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml">R50-C4</a></td>
+<td align="center">3x</td>
+<td align="center">0.543</td>
+<td align="center">0.104</td>
+<td align="center">4.8</td>
+<td align="center">38.4</td>
+<td align="center">137849393</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/model_final_f97cb7.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_DC5_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml">R50-DC5</a></td>
+<td align="center">3x</td>
+<td align="center">0.378</td>
+<td align="center">0.070</td>
+<td align="center">5.0</td>
+<td align="center">39.0</td>
+<td align="center">137849425</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_3x/137849425/model_final_68d202.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_3x/137849425/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml">R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.209</td>
+<td align="center">0.038</td>
+<td align="center">3.0</td>
+<td align="center">40.2</td>
+<td align="center">137849458</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_101_C4_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml">R101-C4</a></td>
+<td align="center">3x</td>
+<td align="center">0.619</td>
+<td align="center">0.139</td>
+<td align="center">5.9</td>
+<td align="center">41.1</td>
+<td align="center">138204752</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_101_DC5_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml">R101-DC5</a></td>
+<td align="center">3x</td>
+<td align="center">0.452</td>
+<td align="center">0.086</td>
+<td align="center">6.1</td>
+<td align="center">40.6</td>
+<td align="center">138204841</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_DC5_3x/138204841/model_final_3e0943.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_DC5_3x/138204841/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_101_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml">R101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.286</td>
+<td align="center">0.051</td>
+<td align="center">4.1</td>
+<td align="center">42.0</td>
+<td align="center">137851257</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/model_final_f6e8b1.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_X_101_32x8d_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml">X101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.638</td>
+<td align="center">0.098</td>
+<td align="center">6.7</td>
+<td align="center">43.0</td>
+<td align="center">139173657</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x/139173657/model_final_68b088.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x/139173657/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+#### RetinaNet:
+<!--
+./gen_html_table.py --config 'COCO-Detection/retina*50*' 'COCO-Detection/retina*101*' --name R50 R50 R101 --fields lr_sched train_speed inference_speed mem box_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: retinanet_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml">R50</a></td>
+<td align="center">1x</td>
+<td align="center">0.200</td>
+<td align="center">0.055</td>
+<td align="center">3.9</td>
+<td align="center">36.5</td>
+<td align="center">137593951</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_1x/137593951/model_final_b796dc.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_1x/137593951/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: retinanet_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml">R50</a></td>
+<td align="center">3x</td>
+<td align="center">0.201</td>
+<td align="center">0.055</td>
+<td align="center">3.9</td>
+<td align="center">37.9</td>
+<td align="center">137849486</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_3x/137849486/model_final_4cafe0.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_3x/137849486/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: retinanet_R_101_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml">R101</a></td>
+<td align="center">3x</td>
+<td align="center">0.280</td>
+<td align="center">0.068</td>
+<td align="center">5.1</td>
+<td align="center">39.9</td>
+<td align="center">138363263</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_101_FPN_3x/138363263/model_final_59f53c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_101_FPN_3x/138363263/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+#### RPN & Fast R-CNN:
+<!--
+./gen_html_table.py --config 'COCO-Detection/rpn*' 'COCO-Detection/fast_rcnn*' --name "RPN R50-C4" "RPN R50-FPN" "Fast R-CNN R50-FPN" --fields lr_sched train_speed inference_speed mem box_AP prop_AR
+-->
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">prop.<br/>AR</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: rpn_R_50_C4_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/rpn_R_50_C4_1x.yaml">RPN R50-C4</a></td>
+<td align="center">1x</td>
+<td align="center">0.130</td>
+<td align="center">0.034</td>
+<td align="center">1.5</td>
+<td align="center"></td>
+<td align="center">51.6</td>
+<td align="center">137258005</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_C4_1x/137258005/model_final_450694.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_C4_1x/137258005/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: rpn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/rpn_R_50_FPN_1x.yaml">RPN R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.186</td>
+<td align="center">0.032</td>
+<td align="center">2.7</td>
+<td align="center"></td>
+<td align="center">58.0</td>
+<td align="center">137258492</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_FPN_1x/137258492/model_final_02ce48.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_FPN_1x/137258492/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: fast_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml">Fast R-CNN R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.140</td>
+<td align="center">0.029</td>
+<td align="center">2.6</td>
+<td align="center">37.8</td>
+<td align="center"></td>
+<td align="center">137635226</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/model_final_e5f7ce.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+### COCO Instance Segmentation Baselines with Mask R-CNN
+<!--
+./gen_html_table.py --config 'COCO-InstanceSegmentation/mask*50*'{1x,3x}'*' 'COCO-InstanceSegmentation/mask*101*' --name R50-C4 R50-DC5 R50-FPN R50-C4 R50-DC5 R50-FPN R101-C4 R101-DC5 R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP mask_AP
+-->
+
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_C4_1x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml">R50-C4</a></td>
+<td align="center">1x</td>
+<td align="center">0.584</td>
+<td align="center">0.110</td>
+<td align="center">5.2</td>
+<td align="center">36.8</td>
+<td align="center">32.2</td>
+<td align="center">137259246</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/model_final_9243eb.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_DC5_1x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml">R50-DC5</a></td>
+<td align="center">1x</td>
+<td align="center">0.471</td>
+<td align="center">0.076</td>
+<td align="center">6.5</td>
+<td align="center">38.3</td>
+<td align="center">34.2</td>
+<td align="center">137260150</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x/137260150/model_final_4f86c3.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x/137260150/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.261</td>
+<td align="center">0.043</td>
+<td align="center">3.4</td>
+<td align="center">38.6</td>
+<td align="center">35.2</td>
+<td align="center">137260431</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_C4_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml">R50-C4</a></td>
+<td align="center">3x</td>
+<td align="center">0.575</td>
+<td align="center">0.111</td>
+<td align="center">5.2</td>
+<td align="center">39.8</td>
+<td align="center">34.4</td>
+<td align="center">137849525</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/model_final_4ce675.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_DC5_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml">R50-DC5</a></td>
+<td align="center">3x</td>
+<td align="center">0.470</td>
+<td align="center">0.076</td>
+<td align="center">6.5</td>
+<td align="center">40.0</td>
+<td align="center">35.9</td>
+<td align="center">137849551</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/model_final_84107b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml">R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.261</td>
+<td align="center">0.043</td>
+<td align="center">3.4</td>
+<td align="center">41.0</td>
+<td align="center">37.2</td>
+<td align="center">137849600</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_101_C4_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml">R101-C4</a></td>
+<td align="center">3x</td>
+<td align="center">0.652</td>
+<td align="center">0.145</td>
+<td align="center">6.3</td>
+<td align="center">42.6</td>
+<td align="center">36.7</td>
+<td align="center">138363239</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x/138363239/model_final_a2914c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x/138363239/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_101_DC5_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml">R101-DC5</a></td>
+<td align="center">3x</td>
+<td align="center">0.545</td>
+<td align="center">0.092</td>
+<td align="center">7.6</td>
+<td align="center">41.9</td>
+<td align="center">37.3</td>
+<td align="center">138363294</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x/138363294/model_final_0464b7.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x/138363294/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_101_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml">R101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.340</td>
+<td align="center">0.056</td>
+<td align="center">4.6</td>
+<td align="center">42.9</td>
+<td align="center">38.6</td>
+<td align="center">138205316</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x/138205316/model_final_a3ec72.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x/138205316/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_X_101_32x8d_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml">X101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.690</td>
+<td align="center">0.103</td>
+<td align="center">7.2</td>
+<td align="center">44.3</td>
+<td align="center">39.5</td>
+<td align="center">139653917</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x/139653917/model_final_2d9806.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x/139653917/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+### COCO Person Keypoint Detection Baselines with Keypoint R-CNN
+<!--
+./gen_html_table.py --config 'COCO-Keypoints/*50*' 'COCO-Keypoints/*101*'  --name R50-FPN R50-FPN R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP keypoint_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">kp.<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: keypoint_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.315</td>
+<td align="center">0.072</td>
+<td align="center">5.0</td>
+<td align="center">53.6</td>
+<td align="center">64.0</td>
+<td align="center">137261548</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x/137261548/model_final_04e291.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x/137261548/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: keypoint_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml">R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.316</td>
+<td align="center">0.066</td>
+<td align="center">5.0</td>
+<td align="center">55.4</td>
+<td align="center">65.5</td>
+<td align="center">137849621</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/model_final_a6e10b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: keypoint_rcnn_R_101_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml">R101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.390</td>
+<td align="center">0.076</td>
+<td align="center">6.1</td>
+<td align="center">56.4</td>
+<td align="center">66.1</td>
+<td align="center">138363331</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x/138363331/model_final_997cc7.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x/138363331/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: keypoint_rcnn_X_101_32x8d_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml">X101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.738</td>
+<td align="center">0.121</td>
+<td align="center">8.7</td>
+<td align="center">57.3</td>
+<td align="center">66.0</td>
+<td align="center">139686956</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x/139686956/model_final_5ad38f.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x/139686956/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+### COCO Panoptic Segmentation Baselines with Panoptic FPN
+<!--
+./gen_html_table.py --config 'COCO-PanopticSegmentation/*50*' 'COCO-PanopticSegmentation/*101*'  --name R50-FPN R50-FPN R101-FPN --fields lr_sched train_speed inference_speed mem box_AP mask_AP PQ
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">PQ</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: panoptic_fpn_R_50_1x -->
+ <tr><td align="left"><a href="configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.304</td>
+<td align="center">0.053</td>
+<td align="center">4.8</td>
+<td align="center">37.6</td>
+<td align="center">34.7</td>
+<td align="center">39.4</td>
+<td align="center">139514544</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x/139514544/model_final_dbfeb4.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x/139514544/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: panoptic_fpn_R_50_3x -->
+ <tr><td align="left"><a href="configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml">R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.302</td>
+<td align="center">0.053</td>
+<td align="center">4.8</td>
+<td align="center">40.0</td>
+<td align="center">36.5</td>
+<td align="center">41.5</td>
+<td align="center">139514569</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/model_final_c10459.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: panoptic_fpn_R_101_3x -->
+ <tr><td align="left"><a href="configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml">R101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.392</td>
+<td align="center">0.066</td>
+<td align="center">6.0</td>
+<td align="center">42.4</td>
+<td align="center">38.5</td>
+<td align="center">43.0</td>
+<td align="center">139514519</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x/139514519/model_final_cafdb1.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x/139514519/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+### LVIS Instance Segmentation Baselines with Mask R-CNN
+
+Mask R-CNN baselines on the [LVIS dataset](https://lvisdataset.org), v0.5.
+These baselines are described in Table 3(c) of the [LVIS paper](https://arxiv.org/abs/1908.03195).
+
+NOTE: the 1x schedule here has the same amount of __iterations__ as the COCO 1x baselines.
+They are roughly 24 epochs of LVISv0.5 data.
+The final results of these configs have large variance across different runs.
+
+<!--
+./gen_html_table.py --config 'LVIS-InstanceSegmentation/mask*50*' 'LVIS-InstanceSegmentation/mask*101*' --name R50-FPN R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP mask_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.292</td>
+<td align="center">0.107</td>
+<td align="center">7.1</td>
+<td align="center">23.6</td>
+<td align="center">24.4</td>
+<td align="center">144219072</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/144219072/model_final_571f7c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/144219072/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_101_FPN_1x -->
+ <tr><td align="left"><a href="configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml">R101-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.371</td>
+<td align="center">0.114</td>
+<td align="center">7.8</td>
+<td align="center">25.6</td>
+<td align="center">25.9</td>
+<td align="center">144219035</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x/144219035/model_final_824ab5.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x/144219035/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_X_101_32x8d_FPN_1x -->
+ <tr><td align="left"><a href="configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml">X101-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.712</td>
+<td align="center">0.151</td>
+<td align="center">10.2</td>
+<td align="center">26.7</td>
+<td align="center">27.1</td>
+<td align="center">144219108</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x/144219108/model_final_5e3439.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x/144219108/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+
+### Cityscapes & Pascal VOC Baselines
+
+Simple baselines for
+* Mask R-CNN on Cityscapes instance segmentation (initialized from COCO pre-training, then trained on Cityscapes fine annotations only)
+* Faster R-CNN on PASCAL VOC object detection (trained on VOC 2007 train+val + VOC 2012 train+val, tested on VOC 2007 using 11-point interpolated AP)
+
+<!--
+./gen_html_table.py --config 'Cityscapes/*' 'PascalVOC-Detection/*' --name "R50-FPN, Cityscapes" "R50-C4, VOC" --fields train_speed inference_speed mem box_AP box_AP50 mask_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">box<br/>AP50</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_FPN -->
+ <tr><td align="left"><a href="configs/Cityscapes/mask_rcnn_R_50_FPN.yaml">R50-FPN, Cityscapes</a></td>
+<td align="center">0.240</td>
+<td align="center">0.078</td>
+<td align="center">4.4</td>
+<td align="center"></td>
+<td align="center"></td>
+<td align="center">36.5</td>
+<td align="center">142423278</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Cityscapes/mask_rcnn_R_50_FPN/142423278/model_final_af9cf5.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Cityscapes/mask_rcnn_R_50_FPN/142423278/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_C4 -->
+ <tr><td align="left"><a href="configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml">R50-C4, VOC</a></td>
+<td align="center">0.537</td>
+<td align="center">0.081</td>
+<td align="center">4.8</td>
+<td align="center">51.9</td>
+<td align="center">80.3</td>
+<td align="center"></td>
+<td align="center">142202221</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PascalVOC-Detection/faster_rcnn_R_50_C4/142202221/model_final_b1acc2.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PascalVOC-Detection/faster_rcnn_R_50_C4/142202221/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+
+### Other Settings
+
+Ablations for Deformable Conv and Cascade R-CNN:
+
+<!--
+./gen_html_table.py --config 'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml' 'Misc/*R_50_FPN_1x_dconv*' 'Misc/cascade*1x.yaml' 'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml' 'Misc/*R_50_FPN_3x_dconv*' 'Misc/cascade*3x.yaml' --name "Baseline R50-FPN" "Deformable Conv" "Cascade R-CNN" "Baseline R50-FPN" "Deformable Conv" "Cascade R-CNN"  --fields lr_sched train_speed inference_speed mem box_AP mask_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml">Baseline R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.261</td>
+<td align="center">0.043</td>
+<td align="center">3.4</td>
+<td align="center">38.6</td>
+<td align="center">35.2</td>
+<td align="center">137260431</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_1x_dconv_c3-c5 -->
+ <tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml">Deformable Conv</a></td>
+<td align="center">1x</td>
+<td align="center">0.342</td>
+<td align="center">0.048</td>
+<td align="center">3.5</td>
+<td align="center">41.5</td>
+<td align="center">37.5</td>
+<td align="center">138602867</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5/138602867/model_final_65c703.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5/138602867/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: cascade_mask_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml">Cascade R-CNN</a></td>
+<td align="center">1x</td>
+<td align="center">0.317</td>
+<td align="center">0.052</td>
+<td align="center">4.0</td>
+<td align="center">42.1</td>
+<td align="center">36.4</td>
+<td align="center">138602847</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_1x/138602847/model_final_e9d89b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_1x/138602847/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml">Baseline R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.261</td>
+<td align="center">0.043</td>
+<td align="center">3.4</td>
+<td align="center">41.0</td>
+<td align="center">37.2</td>
+<td align="center">137849600</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x_dconv_c3-c5 -->
+ <tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml">Deformable Conv</a></td>
+<td align="center">3x</td>
+<td align="center">0.349</td>
+<td align="center">0.047</td>
+<td align="center">3.5</td>
+<td align="center">42.7</td>
+<td align="center">38.5</td>
+<td align="center">144998336</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5/144998336/model_final_821d0b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5/144998336/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: cascade_mask_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml">Cascade R-CNN</a></td>
+<td align="center">3x</td>
+<td align="center">0.328</td>
+<td align="center">0.053</td>
+<td align="center">4.0</td>
+<td align="center">44.3</td>
+<td align="center">38.5</td>
+<td align="center">144998488</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_3x/144998488/model_final_480dd8.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_3x/144998488/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+Ablations for normalization methods, and a few models trained from scratch following [Rethinking ImageNet Pre-training](https://arxiv.org/abs/1811.08883).
+(Note: The baseline uses `2fc` head while the others use [`4conv1fc` head](https://arxiv.org/abs/1803.08494))
+<!--
+./gen_html_table.py --config 'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml' 'Misc/mask*50_FPN_3x_gn.yaml' 'Misc/mask*50_FPN_3x_syncbn.yaml' 'Misc/scratch*' --name "Baseline R50-FPN" "GN" "SyncBN" "GN (from scratch)" "GN (from scratch)" "SyncBN (from scratch)" --fields lr_sched train_speed inference_speed mem box_AP mask_AP
+   -->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml">Baseline R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.261</td>
+<td align="center">0.043</td>
+<td align="center">3.4</td>
+<td align="center">41.0</td>
+<td align="center">37.2</td>
+<td align="center">137849600</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x_gn -->
+ <tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml">GN</a></td>
+<td align="center">3x</td>
+<td align="center">0.356</td>
+<td align="center">0.069</td>
+<td align="center">7.3</td>
+<td align="center">42.6</td>
+<td align="center">38.6</td>
+<td align="center">138602888</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_gn/138602888/model_final_dc5d9e.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_gn/138602888/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x_syncbn -->
+ <tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml">SyncBN</a></td>
+<td align="center">3x</td>
+<td align="center">0.371</td>
+<td align="center">0.053</td>
+<td align="center">5.5</td>
+<td align="center">41.9</td>
+<td align="center">37.8</td>
+<td align="center">169527823</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_syncbn/169527823/model_final_3b3c51.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_syncbn/169527823/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: scratch_mask_rcnn_R_50_FPN_3x_gn -->
+ <tr><td align="left"><a href="configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml">GN (from scratch)</a></td>
+<td align="center">3x</td>
+<td align="center">0.400</td>
+<td align="center">0.069</td>
+<td align="center">9.8</td>
+<td align="center">39.9</td>
+<td align="center">36.6</td>
+<td align="center">138602908</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn/138602908/model_final_01ca85.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn/138602908/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: scratch_mask_rcnn_R_50_FPN_9x_gn -->
+ <tr><td align="left"><a href="configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml">GN (from scratch)</a></td>
+<td align="center">9x</td>
+<td align="center">N/A</td>
+<td align="center">0.070</td>
+<td align="center">9.8</td>
+<td align="center">43.7</td>
+<td align="center">39.6</td>
+<td align="center">183808979</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn/183808979/model_final_da7b4c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn/183808979/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: scratch_mask_rcnn_R_50_FPN_9x_syncbn -->
+ <tr><td align="left"><a href="configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml">SyncBN (from scratch)</a></td>
+<td align="center">9x</td>
+<td align="center">N/A</td>
+<td align="center">0.055</td>
+<td align="center">7.2</td>
+<td align="center">43.6</td>
+<td align="center">39.3</td>
+<td align="center">184226666</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn/184226666/model_final_5ce33e.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn/184226666/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+A few very large models trained for a long time, for demo purposes. They are trained using multiple machines:
+
+<!--
+./gen_html_table.py --config 'Misc/panoptic_*dconv*' 'Misc/cascade_*152*' --name "Panoptic FPN R101" "Mask R-CNN X152" --fields inference_speed mem box_AP mask_AP PQ
+# manually add TTA results
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">PQ</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: panoptic_fpn_R_101_dconv_cascade_gn_3x -->
+ <tr><td align="left"><a href="configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml">Panoptic FPN R101</a></td>
+<td align="center">0.107</td>
+<td align="center">11.4</td>
+<td align="center">47.4</td>
+<td align="center">41.3</td>
+<td align="center">46.1</td>
+<td align="center">139797668</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x/139797668/model_final_be35db.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x/139797668/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv -->
+ <tr><td align="left"><a href="configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml">Mask R-CNN X152</a></td>
+<td align="center">0.242</td>
+<td align="center">15.1</td>
+<td align="center">50.2</td>
+<td align="center">44.0</td>
+<td align="center"></td>
+<td align="center">18131413</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv/18131413/model_0039999_e76410.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv/18131413/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: TTA cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv -->
+ <tr><td align="left">above + test-time aug.</td>
+<td align="center"></td>
+<td align="center"></td>
+<td align="center">51.9</td>
+<td align="center">45.9</td>
+<td align="center"></td>
+<td align="center"></td>
+<td align="center"></td>
+</tr>
+</tbody></table>
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/README.md b/preprocess/humanparsing/mhp_extension/detectron2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1fbb95b39ce9e9c0eab83079319a9298fca438b1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/README.md
@@ -0,0 +1,56 @@
+<img src=".github/Detectron2-Logo-Horz.svg" width="300" >
+
+Detectron2 is Facebook AI Research's next generation software system
+that implements state-of-the-art object detection algorithms.
+It is a ground-up rewrite of the previous version,
+[Detectron](https://github.com/facebookresearch/Detectron/),
+and it originates from [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/).
+
+<div align="center">
+  <img src="https://user-images.githubusercontent.com/1381301/66535560-d3422200-eace-11e9-9123-5535d469db19.png"/>
+</div>
+
+### What's New
+* It is powered by the [PyTorch](https://pytorch.org) deep learning framework.
+* Includes more features such as panoptic segmentation, densepose, Cascade R-CNN, rotated bounding boxes, etc.
+* Can be used as a library to support [different projects](projects/) on top of it.
+  We'll open source more research projects in this way.
+* It [trains much faster](https://detectron2.readthedocs.io/notes/benchmarks.html).
+
+See our [blog post](https://ai.facebook.com/blog/-detectron2-a-pytorch-based-modular-object-detection-library-/)
+to see more demos and learn about detectron2.
+
+## Installation
+
+See [INSTALL.md](INSTALL.md).
+
+## Quick Start
+
+See [GETTING_STARTED.md](GETTING_STARTED.md),
+or the [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5).
+
+Learn more at our [documentation](https://detectron2.readthedocs.org).
+And see [projects/](projects/) for some projects that are built on top of detectron2.
+
+## Model Zoo and Baselines
+
+We provide a large set of baseline results and trained models available for download in the [Detectron2 Model Zoo](MODEL_ZOO.md).
+
+
+## License
+
+Detectron2 is released under the [Apache 2.0 license](LICENSE).
+
+## Citing Detectron2
+
+If you use Detectron2 in your research or wish to refer to the baseline results published in the [Model Zoo](MODEL_ZOO.md), please use the following BibTeX entry.
+
+```BibTeX
+@misc{wu2019detectron2,
+  author =       {Yuxin Wu and Alexander Kirillov and Francisco Massa and
+                  Wan-Yen Lo and Ross Girshick},
+  title =        {Detectron2},
+  howpublished = {\url{https://github.com/facebookresearch/detectron2}},
+  year =         {2019}
+}
+```
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RCNN-C4.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RCNN-C4.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fbf34a0ea57a587e09997edd94c4012d69d0b6ad
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RCNN-C4.yaml
@@ -0,0 +1,18 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  RPN:
+    PRE_NMS_TOPK_TEST: 6000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "Res5ROIHeads"
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RCNN-DilatedC5.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RCNN-DilatedC5.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c0d6d16bdaf532f09e4976f0aa240a49e748da27
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RCNN-DilatedC5.yaml
@@ -0,0 +1,31 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  RESNETS:
+    OUT_FEATURES: ["res5"]
+    RES5_DILATION: 2
+  RPN:
+    IN_FEATURES: ["res5"]
+    PRE_NMS_TOPK_TEST: 6000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "StandardROIHeads"
+    IN_FEATURES: ["res5"]
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+  ROI_MASK_HEAD:
+    NAME: "MaskRCNNConvUpsampleHead"
+    NUM_CONV: 4
+    POOLER_RESOLUTION: 14
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RCNN-FPN.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RCNN-FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3e020f2e7b2f26765be317f907126a1556621abf
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RCNN-FPN.yaml
@@ -0,0 +1,42 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "StandardROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+  ROI_MASK_HEAD:
+    NAME: "MaskRCNNConvUpsampleHead"
+    NUM_CONV: 4
+    POOLER_RESOLUTION: 14
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RetinaNet.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RetinaNet.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..12ec9d2fc20cc0438f17bde2c5f6fbee9496c1b0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Base-RetinaNet.yaml
@@ -0,0 +1,24 @@
+MODEL:
+  META_ARCHITECTURE: "RetinaNet"
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"]
+  FPN:
+    IN_FEATURES: ["res3", "res4", "res5"]
+  RETINANET:
+    IOU_THRESHOLDS: [0.4, 0.5]
+    IOU_LABELS: [0, -1, 1]
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01  # Note that RetinaNet uses a different default learning rate
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..773ac10e87c626760d00d831bf664ce9ff073c49
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,17 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  LOAD_PROPOSALS: True
+  RESNETS:
+    DEPTH: 50
+  PROPOSAL_GENERATOR:
+    NAME: "PrecomputedProposals"
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", )
+  TEST: ("coco_2017_val",)
+  PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
+DATALOADER:
+  # proposals are part of the dataset_dicts, and take a lot of RAM
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..db142cd671c1841b4f64cf130bee7f7954ecdd28
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bceb6b343618d8cd9a6c414ff9eb86ab31cc230a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..57a098f53ee8c54ecfa354cc96efefd890dc1b72
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f96130105c3ba6ab393e0932870903875f5cb732
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bc51bce390a85ee3529ffdcebde05748e1646be0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0fe96f57febdac5790ea4cec168fa4b97ac4807a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..33fadeb87d1ef67ab2b55926b9a652ab4ac4a27d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3262019a1211b910d3b371569199ed1afaacf6a4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..41395182bf5c9dd8ab1241c4414068817298d554
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9c9b5ab77157baa581d90d9847c045c19ed6ffa3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  MASK_ON: False
+  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
+  PIXEL_STD: [57.375, 57.120, 58.395]
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4abb1b9a547957aa6afc0b29129e00f89cf98d59
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../Base-RetinaNet.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4a24ce3a9a108a8792e18c8aabfb7b712f0d3725
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml
@@ -0,0 +1,5 @@
+_BASE_: "../Base-RetinaNet.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3b5412d4a7aef1d6c3f7c1e34f94007de639b833
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../Base-RetinaNet.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/rpn_R_50_C4_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/rpn_R_50_C4_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e04821156b0376ba5215d5ce5b7010a36b43e6a1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/rpn_R_50_C4_1x.yaml
@@ -0,0 +1,10 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  META_ARCHITECTURE: "ProposalNetwork"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  RPN:
+    PRE_NMS_TOPK_TEST: 12000
+    POST_NMS_TOPK_TEST: 2000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dc9c95203b1c3c9cd9bb9876bb8d9a5dd9b31d9a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "ProposalNetwork"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  RPN:
+    POST_NMS_TOPK_TEST: 2000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1a94cc45a0f2aaa8c92e14871c553b736545e327
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..67b70cf4be8c19f5dc735b6f55a8690698f34b69
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1935a302d2d0fa7f69553b3fd50b5a7082c6c0d1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a9aeb4eac38026dbb867e799f9fd3a8d8eb3af80
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..38ed867d897dfec839cbcf11a2e2dc8abb92f07c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b13eefab2a049c48d94d5051c82ceb6dbde40579
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d401016358f967f6619d88b1c9bd5673a1cdeba8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d50fb866ca7811a87b42555c7213f88e00bf6df1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..be7d06b8e0f032ee7fcaabd7c122158518489fd2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d14c63f74383bfc308750f51d51344398b02a239
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  MASK_ON: True
+  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
+  PIXEL_STD: [57.375, 57.120, 58.395]
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4e03944a42d2e497da5ceca17c8fda797dac3f82
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml
@@ -0,0 +1,15 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  KEYPOINT_ON: True
+  ROI_HEADS:
+    NUM_CLASSES: 1
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 0.5  # Keypoint AP degrades (though box AP improves) when using plain L1 loss
+  RPN:
+    # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2.
+    # 1000 proposals per-image is found to hurt box AP.
+    # Therefore we increase it to 1500 per-image.
+    POST_NMS_TOPK_TRAIN: 1500
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_train",)
+  TEST: ("keypoints_coco_2017_val",)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9309535c57a1aa7d23297aac80a9bd78a6c79fcc
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7bf85cf745b53b3e7ab28fe94b7f4f9e7fe6e335
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,5 @@
+_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a07f243f650a497b9372501e3face75194cf0941
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d4bfa20a98c0a65c6bd60e93b07e8f4b7d92a867
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
+  PIXEL_STD: [57.375, 57.120, 58.395]
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..755c12018c5db8ca456d5e7fa8cbd18d90f97527
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "PanopticFPN"
+  MASK_ON: True
+  SEM_SEG_HEAD:
+    LOSS_WEIGHT: 0.5
+DATASETS:
+  TRAIN: ("coco_2017_train_panoptic_separated",)
+  TEST: ("coco_2017_val_panoptic_separated",)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0e01f6fb31e9b00b1857b7de3b5074184d1f4a21
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-Panoptic-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6afa2c1cc92495309ed1553a17359fe5d7d6566e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml
@@ -0,0 +1,5 @@
+_BASE_: "Base-Panoptic-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b956b3f673e78649184fe2c50e2700b3f1f14794
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-Panoptic-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1a7aaeb961581ed9492c4cfe5a69a1eb60495b3e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml
@@ -0,0 +1,27 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  # WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  # For better, more stable performance initialize from COCO
+  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"
+  MASK_ON: True
+  ROI_HEADS:
+    NUM_CLASSES: 8
+# This is similar to the setting used in Mask R-CNN paper, Appendix A
+# But there are some differences, e.g., we did not initialize the output
+# layer using the corresponding classes from COCO
+INPUT:
+  MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024)
+  MIN_SIZE_TRAIN_SAMPLING: "choice"
+  MIN_SIZE_TEST: 1024
+  MAX_SIZE_TRAIN: 2048
+  MAX_SIZE_TEST: 2048
+DATASETS:
+  TRAIN: ("cityscapes_fine_instance_seg_train",)
+  TEST: ("cityscapes_fine_instance_seg_val",)
+SOLVER:
+  BASE_LR: 0.01
+  STEPS: (18000,)
+  MAX_ITER: 24000
+  IMS_PER_BATCH: 8
+TEST:
+  EVAL_PERIOD: 8000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/README.md b/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a90ed9e433a00b8b9f43961d7a2696d5b9013127
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/README.md
@@ -0,0 +1,83 @@
+
+Detectron2 model zoo's experimental settings and a few implementation details are different from Detectron.
+
+The differences in implementation details are shared in
+[Compatibility with Other Libraries](../../docs/notes/compatibility.md).
+
+The differences in model zoo's experimental settings include:
+* Use scale augmentation during training. This improves AP with lower training cost.
+* Use L1 loss instead of smooth L1 loss for simplicity. This sometimes improves box AP but may
+  affect other AP.
+* Use `POOLER_SAMPLING_RATIO=0` instead of 2. This does not significantly affect AP.
+* Use `ROIAlignV2`. This does not significantly affect AP.
+
+In this directory, we provide a few configs that __do not__ have the above changes.
+They mimic Detectron's behavior as close as possible,
+and provide a fair comparison of accuracy and speed against Detectron.
+
+<!--
+./gen_html_table.py --config 'Detectron1-Comparisons/*.yaml' --name "Faster R-CNN" "Keypoint R-CNN" "Mask R-CNN" --fields lr_sched train_speed inference_speed mem box_AP mask_AP keypoint_AP --base-dir ../../../configs/Detectron1-Comparisons
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">kp.<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: faster_rcnn_R_50_FPN_noaug_1x -->
+ <tr><td align="left"><a href="faster_rcnn_R_50_FPN_noaug_1x.yaml">Faster R-CNN</a></td>
+<td align="center">1x</td>
+<td align="center">0.219</td>
+<td align="center">0.038</td>
+<td align="center">3.1</td>
+<td align="center">36.9</td>
+<td align="center"></td>
+<td align="center"></td>
+<td align="center">137781054</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/model_final_7ab50c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: keypoint_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="keypoint_rcnn_R_50_FPN_1x.yaml">Keypoint R-CNN</a></td>
+<td align="center">1x</td>
+<td align="center">0.313</td>
+<td align="center">0.071</td>
+<td align="center">5.0</td>
+<td align="center">53.1</td>
+<td align="center"></td>
+<td align="center">64.2</td>
+<td align="center">137781195</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/model_final_cce136.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_noaug_1x -->
+ <tr><td align="left"><a href="mask_rcnn_R_50_FPN_noaug_1x.yaml">Mask R-CNN</a></td>
+<td align="center">1x</td>
+<td align="center">0.273</td>
+<td align="center">0.043</td>
+<td align="center">3.4</td>
+<td align="center">37.8</td>
+<td align="center">34.9</td>
+<td align="center"></td>
+<td align="center">137781281</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/model_final_62ca52.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+## Comparisons:
+
+* Faster R-CNN: Detectron's AP is 36.7, similar to ours.
+* Keypoint R-CNN: Detectron's AP is box 53.6, keypoint 64.2. Fixing a Detectron's
+  [bug](https://github.com/facebookresearch/Detectron/issues/459) lead to a drop in box AP, and can be
+	compensated back by some parameter tuning.
+* Mask R-CNN: Detectron's AP is box 37.7, mask 33.9. We're 1 AP better in mask AP, due to more correct implementation.
+
+For speed comparison, see [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html).
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6ce77f137fa2c4e5254a62b58c18b8b76096f2aa
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml
@@ -0,0 +1,17 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  # Detectron1 uses smooth L1 loss with some magic beta values.
+  # The defaults are changed to L1 loss in Detectron2.
+  RPN:
+    SMOOTH_L1_BETA: 0.1111
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+INPUT:
+  # no scale augmentation
+  MIN_SIZE_TRAIN: (800, )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..aacf868ba5290c752031c130a2081af48afc0808
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,27 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  KEYPOINT_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 1
+  ROI_KEYPOINT_HEAD:
+    POOLER_RESOLUTION: 14
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  # Detectron1 uses smooth L1 loss with some magic beta values.
+  # The defaults are changed to L1 loss in Detectron2.
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  RPN:
+    SMOOTH_L1_BETA: 0.1111
+    # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2
+    # 1000 proposals per-image is found to hurt box AP.
+    # Therefore we increase it to 1500 per-image.
+    POST_NMS_TOPK_TRAIN: 1500
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_train",)
+  TEST: ("keypoints_coco_2017_val",)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4ea86a8d8e2cd3e51cbc7311b0d00710c07d01f6
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml
@@ -0,0 +1,20 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  # Detectron1 uses smooth L1 loss with some magic beta values.
+  # The defaults are changed to L1 loss in Detectron2.
+  RPN:
+    SMOOTH_L1_BETA: 0.1111
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  ROI_MASK_HEAD:
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+INPUT:
+  # no scale augmentation
+  MIN_SIZE_TRAIN: (800, )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f0c3a1bbc0a09e1384de522f30c443ba1e36fafa
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 101
+  ROI_HEADS:
+    NUM_CLASSES: 1230
+    SCORE_THRESH_TEST: 0.0001
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+DATASETS:
+  TRAIN: ("lvis_v0.5_train",)
+  TEST: ("lvis_v0.5_val",)
+TEST:
+  DETECTIONS_PER_IMAGE: 300  # LVIS allows up to 300
+DATALOADER:
+  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
+  REPEAT_THRESHOLD: 0.001
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..64b4caa4ef2b284782367ea702e1ae6653472630
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 1230
+    SCORE_THRESH_TEST: 0.0001
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+DATASETS:
+  TRAIN: ("lvis_v0.5_train",)
+  TEST: ("lvis_v0.5_val",)
+TEST:
+  DETECTIONS_PER_IMAGE: 300  # LVIS allows up to 300
+DATALOADER:
+  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
+  REPEAT_THRESHOLD: 0.001
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c8b822c6c006ba642f4caf9b55e7983f6797427a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml
@@ -0,0 +1,23 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
+  PIXEL_STD: [57.375, 57.120, 58.395]
+  MASK_ON: True
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 101
+  ROI_HEADS:
+    NUM_CLASSES: 1230
+    SCORE_THRESH_TEST: 0.0001
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+DATASETS:
+  TRAIN: ("lvis_v0.5_train",)
+  TEST: ("lvis_v0.5_val",)
+TEST:
+  DETECTIONS_PER_IMAGE: 300  # LVIS allows up to 300
+DATALOADER:
+  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
+  REPEAT_THRESHOLD: 0.001
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..abb33b618932e94b66239945ac892f4c84a6e8f8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NAME: CascadeROIHeads
+  ROI_BOX_HEAD:
+    CLS_AGNOSTIC_BBOX_REG: True
+  RPN:
+    POST_NMS_TOPK_TRAIN: 2000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e2201ad5c46ded91ccfa47b7698a521625c5e447
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml
@@ -0,0 +1,15 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NAME: CascadeROIHeads
+  ROI_BOX_HEAD:
+    CLS_AGNOSTIC_BBOX_REG: True
+  RPN:
+    POST_NMS_TOPK_TRAIN: 2000
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fc117f6b5e3e51558ec2f01b73c5365622e5ce25
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml
@@ -0,0 +1,36 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  MASK_ON: True
+  WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k"
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 152
+    DEFORM_ON_PER_STAGE: [False, True, True, True]
+  ROI_HEADS:
+    NAME: "CascadeROIHeads"
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_CONV: 4
+    NUM_FC: 1
+    NORM: "GN"
+    CLS_AGNOSTIC_BBOX_REG: True
+  ROI_MASK_HEAD:
+    NUM_CONV: 8
+    NORM: "GN"
+  RPN:
+    POST_NMS_TOPK_TRAIN: 2000
+SOLVER:
+  IMS_PER_BATCH: 128
+  STEPS: (35000, 45000)
+  MAX_ITER: 50000
+  BASE_LR: 0.16
+INPUT:
+  MIN_SIZE_TRAIN: (640, 864)
+  MIN_SIZE_TRAIN_SAMPLING: "range"
+  MAX_SIZE_TRAIN: 1440
+  CROP:
+    ENABLED: True
+TEST:
+  EVAL_PERIOD: 2500
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv_parsing.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv_parsing.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..544f58f620607ba6eb592593a2f85243c8670451
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv_parsing.yaml
@@ -0,0 +1,42 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  MASK_ON: True
+#   WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k"
+  WEIGHTS: "model_0039999_e76410.pkl"
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 152
+    DEFORM_ON_PER_STAGE: [False, True, True, True]
+  ROI_HEADS:
+    NAME: "CascadeROIHeads"
+    NUM_CLASSES: 1
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_CONV: 4
+    NUM_FC: 1
+    NORM: "GN"
+    CLS_AGNOSTIC_BBOX_REG: True
+  ROI_MASK_HEAD:
+    NUM_CONV: 8
+    NORM: "GN"
+  RPN:
+    POST_NMS_TOPK_TRAIN: 2000
+SOLVER:
+#   IMS_PER_BATCH: 128
+  IMS_PER_BATCH: 1
+  STEPS: (35000, 45000)
+  MAX_ITER: 50000
+  BASE_LR: 0.16
+INPUT:
+  MIN_SIZE_TRAIN: (640, 864)
+  MIN_SIZE_TRAIN_SAMPLING: "range"
+  MAX_SIZE_TRAIN: 1440
+  CROP:
+    ENABLED: True
+TEST:
+  EVAL_PERIOD: 2500
+DATASETS:
+  TRAIN: ("CIHP_train","VIP_trainval")
+  TEST: ("CIHP_val",)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/demo.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/demo.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bbf9685f5921c7aa1c967b4e7da88aaf061a72e2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/demo.yaml
@@ -0,0 +1,25 @@
+_BASE_: "cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml"
+MODEL:
+  MASK_ON: True
+  ROI_HEADS:
+    NMS_THRESH_TEST: 0.95
+    SCORE_THRESH_TEST: 0.5
+    NUM_CLASSES: 1
+SOLVER:
+  IMS_PER_BATCH: 1
+  STEPS: (30000, 45000)
+  MAX_ITER: 50000
+  BASE_LR: 0.02
+INPUT:
+  MIN_SIZE_TRAIN: (640, 864)
+  MIN_SIZE_TRAIN_SAMPLING: "range"
+  MAX_SIZE_TRAIN: 1440
+  CROP:
+    ENABLED: True
+TEST:
+  AUG:
+    ENABLED: True
+DATASETS:
+  TRAIN: ("demo_train",)
+  TEST: ("demo_val",)
+OUTPUT_DIR: "../../data/DemoDataset/detectron2_prediction"
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4c3b767ff473bbab7225cc8a4a92608543d78246
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml
@@ -0,0 +1,10 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_BOX_HEAD:
+    CLS_AGNOSTIC_BBOX_REG: True
+  ROI_MASK_HEAD:
+    CLS_AGNOSTIC_MASK: True
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..04ff988d073ef9169ee4ca2cbce0d6f030c15232
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+    DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
+    DEFORM_MODULATED: False
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..68c0ca58d7df97ca728c339da0ca9828fe6be318
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml
@@ -0,0 +1,11 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+    DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
+    DEFORM_MODULATED: False
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..74d274e5a529b5a8afe186940868f9d48c6112b3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml
@@ -0,0 +1,21 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-50-GN"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+    NORM: "GN"
+    STRIDE_IN_1X1: False
+  FPN:
+    NORM: "GN"
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_CONV: 4
+    NUM_FC: 1
+    NORM: "GN"
+  ROI_MASK_HEAD:
+    NORM: "GN"
+SOLVER:
+  # 3x schedule
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..11ebb076ba529f26c71a0d972e96ca4c2d6a830b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml
@@ -0,0 +1,24 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+    NORM: "SyncBN"
+    STRIDE_IN_1X1: True
+  FPN:
+    NORM: "SyncBN"
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_CONV: 4
+    NUM_FC: 1
+    NORM: "SyncBN"
+  ROI_MASK_HEAD:
+    NORM: "SyncBN"
+SOLVER:
+  # 3x schedule
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
+TEST:
+  PRECISE_BN:
+    ENABLED: True
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..34016cea3ca9d7fb69ef4fe01d6b47ee8690a13b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml
@@ -0,0 +1,26 @@
+# A large PanopticFPN for demo purposes.
+# Use GN on backbone to support semantic seg.
+# Use Cascade + Deform Conv to improve localization.
+_BASE_: "../COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml"
+MODEL:
+  WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-101-GN"
+  RESNETS:
+    DEPTH: 101
+    NORM: "GN"
+    DEFORM_ON_PER_STAGE: [False, True, True, True]
+    STRIDE_IN_1X1: False
+  FPN:
+    NORM: "GN"
+  ROI_HEADS:
+    NAME: CascadeROIHeads
+  ROI_BOX_HEAD:
+    CLS_AGNOSTIC_BBOX_REG: True
+  ROI_MASK_HEAD:
+    NORM: "GN"
+  RPN:
+    POST_NMS_TOPK_TRAIN: 2000
+SOLVER:
+  STEPS: (105000, 125000)
+  MAX_ITER: 135000
+  IMS_PER_BATCH: 32
+  BASE_LR: 0.04
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/parsing_finetune_cihp.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/parsing_finetune_cihp.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..766f46aa0cd3a80efb330052bdb695bebb5efb7d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/parsing_finetune_cihp.yaml
@@ -0,0 +1,24 @@
+_BASE_: "cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml"
+MODEL:
+  MASK_ON: True
+  WEIGHTS: "model_0039999_e76410.pkl"
+  ROI_HEADS:
+    NUM_CLASSES: 1
+SOLVER:
+  IMS_PER_BATCH: 16
+  STEPS: (140000, 180000)
+  MAX_ITER: 200000
+  BASE_LR: 0.02
+INPUT:
+  MIN_SIZE_TRAIN: (640, 864)
+  MIN_SIZE_TRAIN_SAMPLING: "range"
+  MAX_SIZE_TRAIN: 1440
+  CROP:
+    ENABLED: True
+TEST:
+  EVAL_PERIOD: 0
+DATASETS:
+  TRAIN: ("CIHP_train")
+  TEST: ("CIHP_val",)
+OUTPUT_DIR: "./finetune_output"
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/parsing_inference.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/parsing_inference.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d6a529b1eff2ddf553b1ba32f7b65172f03fae1f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/parsing_inference.yaml
@@ -0,0 +1,26 @@
+_BASE_: "cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml"
+MODEL:
+  MASK_ON: True
+  WEIGHTS: "./finetune_ouput/model_final.pth"
+  ROI_HEADS:
+    NMS_THRESH_TEST: 0.95
+    SCORE_THRESH_TEST: 0.5
+    NUM_CLASSES: 1
+SOLVER:
+  IMS_PER_BATCH: 1
+  STEPS: (30000, 45000)
+  MAX_ITER: 50000
+  BASE_LR: 0.02
+INPUT:
+  MIN_SIZE_TRAIN: (640, 864)
+  MIN_SIZE_TRAIN_SAMPLING: "range"
+  MAX_SIZE_TRAIN: 1440
+  CROP:
+    ENABLED: True
+TEST:
+  AUG:
+    ENABLED: True
+DATASETS:
+  TRAIN: ("CIHP_trainval",)
+  TEST: ("CIHP_test",)
+OUTPUT_DIR: "./inference_output"
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f3400288cde242fcf66eef7f63b5a9165ca663c5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml
@@ -0,0 +1,13 @@
+_BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml"
+MODEL:
+  # Train from random initialization.
+  WEIGHTS: ""
+  # It makes sense to divide by STD when training from scratch
+  # But it seems to make no difference on the results and C2's models didn't do this.
+  # So we keep things consistent with C2.
+  # PIXEL_STD: [57.375, 57.12, 58.395]
+  MASK_ON: True
+  BACKBONE:
+    FREEZE_AT: 0
+# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
+# to learn what you need for training from scratch.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d90c9ff0ef4573252ee165b4c958ec5f74178176
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml
@@ -0,0 +1,19 @@
+_BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml"
+MODEL:
+  PIXEL_STD: [57.375, 57.12, 58.395]
+  WEIGHTS: ""
+  MASK_ON: True
+  RESNETS:
+    STRIDE_IN_1X1: False
+  BACKBONE:
+    FREEZE_AT: 0
+SOLVER:
+  # 9x schedule
+  IMS_PER_BATCH: 64  # 4x the standard
+  STEPS: (187500, 197500)  # last 60/4==15k and last 20/4==5k
+  MAX_ITER: 202500   # 90k * 9 / 4
+  BASE_LR: 0.08
+TEST:
+  EVAL_PERIOD: 2500
+# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
+# to learn what you need for training from scratch.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..60d4e42330e396a1901437df8e17b262d5ad547a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml
@@ -0,0 +1,19 @@
+_BASE_: "mask_rcnn_R_50_FPN_3x_syncbn.yaml"
+MODEL:
+  PIXEL_STD: [57.375, 57.12, 58.395]
+  WEIGHTS: ""
+  MASK_ON: True
+  RESNETS:
+    STRIDE_IN_1X1: False
+  BACKBONE:
+    FREEZE_AT: 0
+SOLVER:
+  # 9x schedule
+  IMS_PER_BATCH: 64  # 4x the standard
+  STEPS: (187500, 197500)  # last 60/4==15k and last 20/4==5k
+  MAX_ITER: 202500   # 90k * 9 / 4
+  BASE_LR: 0.08
+TEST:
+  EVAL_PERIOD: 2500
+# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
+# to learn what you need for training from scratch.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/semantic_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/semantic_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ac256e1372770ab3d9ae522c962de0fd0dbceeb5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/Misc/semantic_R_50_FPN_1x.yaml
@@ -0,0 +1,11 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "SemanticSegmentor"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+DATASETS:
+  TRAIN: ("coco_2017_train_panoptic_stuffonly",)
+  TEST: ("coco_2017_val_panoptic_stuffonly",)
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ea2a6baaebd1a186db18f2904430ffb25901898e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml
@@ -0,0 +1,18 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 20
+INPUT:
+  MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
+  MIN_SIZE_TEST: 800
+DATASETS:
+  TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
+  TEST: ('voc_2007_test',)
+SOLVER:
+  STEPS: (12000, 16000)
+  MAX_ITER: 18000  # 17.4 epochs
+  WARMUP_ITERS: 100
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e554cab18a358a27b630c1ab0c2359666b0e1514
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml
@@ -0,0 +1,18 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 20
+INPUT:
+  MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
+  MIN_SIZE_TEST: 800
+DATASETS:
+  TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
+  TEST: ('voc_2007_test',)
+SOLVER:
+  STEPS: (12000, 16000)
+  MAX_ITER: 18000  # 17.4 epochs
+  WARMUP_ITERS: 100
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/my_Base-RCNN-FPN.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/my_Base-RCNN-FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d649eed7f333dfb07d7a096c6267dc0066e847c1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/my_Base-RCNN-FPN.yaml
@@ -0,0 +1,42 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "StandardROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+  ROI_MASK_HEAD:
+    NAME: "MaskRCNNConvUpsampleHead"
+    NUM_CONV: 4
+    POOLER_RESOLUTION: 14
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 2
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/README.md b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a278199b8557a1e2fb341fe6757786a6cecb82b3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/README.md
@@ -0,0 +1 @@
+These are quick configs for performance or accuracy regression tracking purposes.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fc5a4116cb096278823049c1f823e99f8e16e97e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://Misc/cascade_mask_rcnn_R_50_FPN_3x/144998488/model_final_480dd8.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 50.18, 0.02], ["segm", "AP",  43.87, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e41a0fe7ffe9c3531741df49e546aa45cfe4fdee
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,11 @@
+_BASE_: "../Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml"
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a2f37e5e2cc2a9e195e13703e9930e67e0f9a896
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/model_final_e5f7ce.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 45.70, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..52fc0ec03c8b87ab2be1dda97bec1e8c93e6bb5c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,15 @@
+_BASE_: "../COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
+  TEST: ("coco_2017_val_100",)
+  PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..14cf2aa82aec52ad44e28ead0665dad811d55457
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/model_final_a6e10b.pkl"
+DATASETS:
+  TEST: ("keypoints_coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 52.47, 0.02], ["keypoints", "AP", 67.36, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dc09034bdd3db9d3e0dc62a017a3883dbe79c649
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,14 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  KEYPOINT_ON: True
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_val_100",)
+  TEST: ("keypoints_coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4b92392f1c4457033ae4c87a521e339fe9e184ce
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml
@@ -0,0 +1,30 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  KEYPOINT_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 256
+    NUM_CLASSES: 1
+  ROI_KEYPOINT_HEAD:
+    POOLER_RESOLUTION: 14
+    POOLER_SAMPLING_RATIO: 2
+    NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: False
+    LOSS_WEIGHT: 4.0
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0  # Keypoint AP degrades when using plain L1 loss
+  RPN:
+    SMOOTH_L1_BETA: 0.2  # Keypoint AP degrades when using plain L1 loss
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_val",)
+  TEST: ("keypoints_coco_2017_val",)
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+SOLVER:
+  WARMUP_FACTOR: 0.33333333
+  WARMUP_ITERS: 100
+  STEPS: (5500, 5800)
+  MAX_ITER: 6000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 55.35, 1.0], ["keypoints", "AP", 76.91, 1.0]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9bd962878fea64035887c48981beeb8d41bfdbd0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml
@@ -0,0 +1,28 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  KEYPOINT_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 256
+    NUM_CLASSES: 1
+  ROI_KEYPOINT_HEAD:
+    POOLER_RESOLUTION: 14
+    POOLER_SAMPLING_RATIO: 2
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0  # Keypoint AP degrades when using plain L1 loss
+  RPN:
+    SMOOTH_L1_BETA: 0.2  # Keypoint AP degrades when using plain L1 loss
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_val",)
+  TEST: ("keypoints_coco_2017_val",)
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+SOLVER:
+  WARMUP_FACTOR: 0.33333333
+  WARMUP_ITERS: 100
+  STEPS: (5500, 5800)
+  MAX_ITER: 6000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 53.5, 1.0], ["keypoints", "AP", 72.4, 1.0]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_GCV_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_GCV_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ab6e69812b94ea7e071f29d9a6937d5c70805b5b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_GCV_instant_test.yaml
@@ -0,0 +1,18 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.001
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "value"
+    CLIP_VALUE: 1.0
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b2d5b7ff87e069f8c774a230bdfd47b8c12d18a3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/model_final_4ce675.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 47.37, 0.02], ["segm", "AP", 40.99, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6c4f1214efa520944fd941daec082ad45c164a23
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml
@@ -0,0 +1,14 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.001
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f68dd8f96c7896b5fc95d694a399f2ce417c1deb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml
@@ -0,0 +1,22 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 256
+  MASK_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_val",)
+  TEST: ("coco_2017_val",)
+INPUT:
+  MIN_SIZE_TRAIN: (600,)
+  MAX_SIZE_TRAIN: 1000
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1000
+SOLVER:
+  IMS_PER_BATCH: 8  # base uses 16
+  WARMUP_FACTOR: 0.33333
+  WARMUP_ITERS: 100
+  STEPS: (11000, 11600)
+  MAX_ITER: 12000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 41.88, 0.7], ["segm", "AP", 33.79, 0.5]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e3ce6cf922ae07fba5b5e01edbac19bf58a8e9dd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/model_final_84107b.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 47.44, 0.02], ["segm", "AP", 42.94, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e5454bfd95cc37749c50aec7866f32d9a80ca2b7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,10 @@
+_BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 47.34, 0.02], ["segm", "AP",  42.67, 0.02], ["bbox_TTA", "AP", 49.11, 0.02], ["segm_TTA", "AP", 45.04, 0.02]]
+  AUG:
+    ENABLED: True
+    MIN_SIZES: (700, 800)  # to save some time
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6dbfcde0bf837990634d419a6dda1e2909c3cd7f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,14 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ffca550461035967a565dca39bca039658a68eed
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml
@@ -0,0 +1,21 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 256
+  MASK_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_val",)
+  TEST: ("coco_2017_val",)
+INPUT:
+  MIN_SIZE_TRAIN: (600,)
+  MAX_SIZE_TRAIN: 1000
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1000
+SOLVER:
+  WARMUP_FACTOR: 0.3333333
+  WARMUP_ITERS: 100
+  STEPS: (5500, 5800)
+  MAX_ITER: 6000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 42.0, 1.6], ["segm", "AP", 35.4, 1.25]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..70874e3a92c9034d75cbbebb145b61084ba15e42
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/model_final_c10459.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100_panoptic_separated",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 46.47, 0.02], ["segm", "AP", 43.39, 0.02], ["sem_seg", "mIoU", 42.55, 0.02], ["panoptic_seg", "PQ", 38.99, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7cdee7bfcf6dc75dda52602a0d9177ad0a9cc6ed
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "PanopticFPN"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  SEM_SEG_HEAD:
+    LOSS_WEIGHT: 0.5
+DATASETS:
+  TRAIN: ("coco_2017_val_100_panoptic_separated",)
+  TEST: ("coco_2017_val_100_panoptic_separated",)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 1
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..05816316f851690e60ee54b852b6f49ede73c886
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml
@@ -0,0 +1,20 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "PanopticFPN"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  SEM_SEG_HEAD:
+    LOSS_WEIGHT: 0.5
+DATASETS:
+  TRAIN: ("coco_2017_val_panoptic_separated",)
+  TEST: ("coco_2017_val_panoptic_separated",)
+SOLVER:
+  BASE_LR: 0.01
+  WARMUP_FACTOR: 0.001
+  WARMUP_ITERS: 500
+  STEPS: (5500,)
+  MAX_ITER: 7000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 46.70, 1.1], ["segm", "AP", 38.73, 0.7], ["sem_seg", "mIoU", 64.73, 1.2], ["panoptic_seg", "PQ", 48.13, 0.8]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..36b998833bac04c830d5ab9f44d5773b0437ac0b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-Detection/retinanet_R_50_FPN_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-Detection/retinanet_R_50_FPN_3x/137849486/model_final_4cafe0.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 44.36, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8d95c1f614296716374686b22055a587ccd052b9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../COCO-Detection/retinanet_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c7c3f908a9e80e98b2d25b6d384a60acaba9d4f8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-Detection/rpn_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/model_final_02ce48.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["box_proposals", "AR@1000", 58.16, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..402d432477507dc36f04c4a9777cb80fe06b2809
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../COCO-Detection/rpn_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  STEPS: (30,)
+  MAX_ITER: 40
+  BASE_LR: 0.005
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bca74987d5218736983617883e0fe37f79d219b7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,10 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "SemanticSegmentor"
+  WEIGHTS: "detectron2://semantic_R_50_FPN_1x/111802073/model_final_c18079783c55a94968edc28b7101c5f0.pkl"
+  RESNETS:
+    DEPTH: 50
+DATASETS:
+  TEST: ("coco_2017_val_100_panoptic_stuffonly",)
+TEST:
+  EXPECTED_RESULTS: [["sem_seg", "mIoU", 39.53, 0.02], ["sem_seg", "mACC", 51.50, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..14ab606f219b462fe37fcc7d5fbdbe65cb5c2642
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml
@@ -0,0 +1,18 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "SemanticSegmentor"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+DATASETS:
+  TRAIN: ("coco_2017_val_100_panoptic_stuffonly",)
+  TEST: ("coco_2017_val_100_panoptic_stuffonly",)
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1f78d775889b11e9e76743de5ddb8139198edf61
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml
@@ -0,0 +1,20 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "SemanticSegmentor"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+DATASETS:
+  TRAIN: ("coco_2017_val_panoptic_stuffonly",)
+  TEST: ("coco_2017_val_panoptic_stuffonly",)
+SOLVER:
+  BASE_LR: 0.01
+  WARMUP_FACTOR: 0.001
+  WARMUP_ITERS: 300
+  STEPS: (5500,)
+  MAX_ITER: 7000
+TEST:
+  EXPECTED_RESULTS: [["sem_seg", "mIoU", 76.51, 1.0], ["sem_seg", "mACC", 83.25, 1.0]]
+INPUT:
+  # no scale augmentation
+  MIN_SIZE_TRAIN: (800, )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/demo/README.md b/preprocess/humanparsing/mhp_extension/detectron2/demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..caa755f6f0f472a04a419deec4a6acfdb949023b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/demo/README.md
@@ -0,0 +1,8 @@
+
+## Detectron2 Demo
+
+We provide a command line tool to run a simple demo of builtin models.
+The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md).
+
+See our [blog post](https://ai.facebook.com/blog/-detectron2-a-pytorch-based-modular-object-detection-library-)
+for a high-quality demo generated with this tool.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/demo/demo.py b/preprocess/humanparsing/mhp_extension/detectron2/demo/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fd8df8f539cfe4a4f003fb820f49ffad0f54f80
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/demo/demo.py
@@ -0,0 +1,161 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+import glob
+import multiprocessing as mp
+import os
+import time
+import cv2
+import tqdm
+
+from detectron2.config import get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.utils.logger import setup_logger
+
+from predictor import VisualizationDemo
+
+# constants
+WINDOW_NAME = "COCO detections"
+
+
+def setup_cfg(args):
+    # load config from file and command-line arguments
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    # Set score_threshold for builtin models
+    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold
+    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
+    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold
+    cfg.freeze()
+    return cfg
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(description="Detectron2 demo for builtin models")
+    parser.add_argument(
+        "--config-file",
+        default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
+        metavar="FILE",
+        help="path to config file",
+    )
+    parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
+    parser.add_argument("--video-input", help="Path to video file.")
+    parser.add_argument(
+        "--input",
+        nargs="+",
+        help="A list of space separated input images; "
+        "or a single glob pattern such as 'directory/*.jpg'",
+    )
+    parser.add_argument(
+        "--output",
+        help="A file or directory to save output visualizations. "
+        "If not given, will show output in an OpenCV window.",
+    )
+
+    parser.add_argument(
+        "--confidence-threshold",
+        type=float,
+        default=0.5,
+        help="Minimum score for instance predictions to be shown",
+    )
+    parser.add_argument(
+        "--opts",
+        help="Modify config options using the command-line 'KEY VALUE' pairs",
+        default=[],
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    mp.set_start_method("spawn", force=True)
+    args = get_parser().parse_args()
+    setup_logger(name="fvcore")
+    logger = setup_logger()
+    logger.info("Arguments: " + str(args))
+
+    cfg = setup_cfg(args)
+
+    demo = VisualizationDemo(cfg)
+
+    if args.input:
+        if len(args.input) == 1:
+            args.input = glob.glob(os.path.expanduser(args.input[0]))
+            assert args.input, "The input path(s) was not found"
+        for path in tqdm.tqdm(args.input, disable=not args.output):
+            # use PIL, to be consistent with evaluation
+            img = read_image(path, format="BGR")
+            start_time = time.time()
+            predictions, visualized_output = demo.run_on_image(img)
+            logger.info(
+                "{}: {} in {:.2f}s".format(
+                    path,
+                    "detected {} instances".format(len(predictions["instances"]))
+                    if "instances" in predictions
+                    else "finished",
+                    time.time() - start_time,
+                )
+            )
+
+            if args.output:
+                if os.path.isdir(args.output):
+                    assert os.path.isdir(args.output), args.output
+                    out_filename = os.path.join(args.output, os.path.basename(path))
+                else:
+                    assert len(args.input) == 1, "Please specify a directory with args.output"
+                    out_filename = args.output
+                visualized_output.save(out_filename)
+            else:
+                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
+                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
+                if cv2.waitKey(0) == 27:
+                    break  # esc to quit
+    elif args.webcam:
+        assert args.input is None, "Cannot have both --input and --webcam!"
+        assert args.output is None, "output not yet supported with --webcam!"
+        cam = cv2.VideoCapture(0)
+        for vis in tqdm.tqdm(demo.run_on_video(cam)):
+            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
+            cv2.imshow(WINDOW_NAME, vis)
+            if cv2.waitKey(1) == 27:
+                break  # esc to quit
+        cam.release()
+        cv2.destroyAllWindows()
+    elif args.video_input:
+        video = cv2.VideoCapture(args.video_input)
+        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames_per_second = video.get(cv2.CAP_PROP_FPS)
+        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+        basename = os.path.basename(args.video_input)
+
+        if args.output:
+            if os.path.isdir(args.output):
+                output_fname = os.path.join(args.output, basename)
+                output_fname = os.path.splitext(output_fname)[0] + ".mkv"
+            else:
+                output_fname = args.output
+            assert not os.path.isfile(output_fname), output_fname
+            output_file = cv2.VideoWriter(
+                filename=output_fname,
+                # some installation of opencv may not support x264 (due to its license),
+                # you can try other format (e.g. MPEG)
+                fourcc=cv2.VideoWriter_fourcc(*"x264"),
+                fps=float(frames_per_second),
+                frameSize=(width, height),
+                isColor=True,
+            )
+        assert os.path.isfile(args.video_input)
+        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
+            if args.output:
+                output_file.write(vis_frame)
+            else:
+                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
+                cv2.imshow(basename, vis_frame)
+                if cv2.waitKey(1) == 27:
+                    break  # esc to quit
+        video.release()
+        if args.output:
+            output_file.release()
+        else:
+            cv2.destroyAllWindows()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/demo/predictor.py b/preprocess/humanparsing/mhp_extension/detectron2/demo/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..689fa85436d928858e652df665f5e7460a1f3154
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/demo/predictor.py
@@ -0,0 +1,220 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import atexit
+import bisect
+import multiprocessing as mp
+from collections import deque
+import cv2
+import torch
+
+from detectron2.data import MetadataCatalog
+from detectron2.engine.defaults import DefaultPredictor
+from detectron2.utils.video_visualizer import VideoVisualizer
+from detectron2.utils.visualizer import ColorMode, Visualizer
+
+
+class VisualizationDemo(object):
+    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
+        """
+        Args:
+            cfg (CfgNode):
+            instance_mode (ColorMode):
+            parallel (bool): whether to run the model in different processes from visualization.
+                Useful since the visualization logic can be slow.
+        """
+        self.metadata = MetadataCatalog.get(
+            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
+        )
+        self.cpu_device = torch.device("cpu")
+        self.instance_mode = instance_mode
+
+        self.parallel = parallel
+        if parallel:
+            num_gpu = torch.cuda.device_count()
+            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
+        else:
+            self.predictor = DefaultPredictor(cfg)
+
+    def run_on_image(self, image):
+        """
+        Args:
+            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+                This is the format used by OpenCV.
+
+        Returns:
+            predictions (dict): the output of the model.
+            vis_output (VisImage): the visualized image output.
+        """
+        vis_output = None
+        predictions = self.predictor(image)
+        # Convert image from OpenCV BGR format to Matplotlib RGB format.
+        image = image[:, :, ::-1]
+        visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
+        if "panoptic_seg" in predictions:
+            panoptic_seg, segments_info = predictions["panoptic_seg"]
+            vis_output = visualizer.draw_panoptic_seg_predictions(
+                panoptic_seg.to(self.cpu_device), segments_info
+            )
+        else:
+            if "sem_seg" in predictions:
+                vis_output = visualizer.draw_sem_seg(
+                    predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
+                )
+            if "instances" in predictions:
+                instances = predictions["instances"].to(self.cpu_device)
+                vis_output = visualizer.draw_instance_predictions(predictions=instances)
+
+        return predictions, vis_output
+
+    def _frame_from_video(self, video):
+        while video.isOpened():
+            success, frame = video.read()
+            if success:
+                yield frame
+            else:
+                break
+
+    def run_on_video(self, video):
+        """
+        Visualizes predictions on frames of the input video.
+
+        Args:
+            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
+                either a webcam or a video file.
+
+        Yields:
+            ndarray: BGR visualizations of each video frame.
+        """
+        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
+
+        def process_predictions(frame, predictions):
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+            if "panoptic_seg" in predictions:
+                panoptic_seg, segments_info = predictions["panoptic_seg"]
+                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
+                    frame, panoptic_seg.to(self.cpu_device), segments_info
+                )
+            elif "instances" in predictions:
+                predictions = predictions["instances"].to(self.cpu_device)
+                vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
+            elif "sem_seg" in predictions:
+                vis_frame = video_visualizer.draw_sem_seg(
+                    frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
+                )
+
+            # Converts Matplotlib RGB format to OpenCV BGR format
+            vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
+            return vis_frame
+
+        frame_gen = self._frame_from_video(video)
+        if self.parallel:
+            buffer_size = self.predictor.default_buffer_size
+
+            frame_data = deque()
+
+            for cnt, frame in enumerate(frame_gen):
+                frame_data.append(frame)
+                self.predictor.put(frame)
+
+                if cnt >= buffer_size:
+                    frame = frame_data.popleft()
+                    predictions = self.predictor.get()
+                    yield process_predictions(frame, predictions)
+
+            while len(frame_data):
+                frame = frame_data.popleft()
+                predictions = self.predictor.get()
+                yield process_predictions(frame, predictions)
+        else:
+            for frame in frame_gen:
+                yield process_predictions(frame, self.predictor(frame))
+
+
+class AsyncPredictor:
+    """
+    A predictor that runs the model asynchronously, possibly on >1 GPUs.
+    Because rendering the visualization takes considerably amount of time,
+    this helps improve throughput when rendering videos.
+    """
+
+    class _StopToken:
+        pass
+
+    class _PredictWorker(mp.Process):
+        def __init__(self, cfg, task_queue, result_queue):
+            self.cfg = cfg
+            self.task_queue = task_queue
+            self.result_queue = result_queue
+            super().__init__()
+
+        def run(self):
+            predictor = DefaultPredictor(self.cfg)
+
+            while True:
+                task = self.task_queue.get()
+                if isinstance(task, AsyncPredictor._StopToken):
+                    break
+                idx, data = task
+                result = predictor(data)
+                self.result_queue.put((idx, result))
+
+    def __init__(self, cfg, num_gpus: int = 1):
+        """
+        Args:
+            cfg (CfgNode):
+            num_gpus (int): if 0, will run on CPU
+        """
+        num_workers = max(num_gpus, 1)
+        self.task_queue = mp.Queue(maxsize=num_workers * 3)
+        self.result_queue = mp.Queue(maxsize=num_workers * 3)
+        self.procs = []
+        for gpuid in range(max(num_gpus, 1)):
+            cfg = cfg.clone()
+            cfg.defrost()
+            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
+            self.procs.append(
+                AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
+            )
+
+        self.put_idx = 0
+        self.get_idx = 0
+        self.result_rank = []
+        self.result_data = []
+
+        for p in self.procs:
+            p.start()
+        atexit.register(self.shutdown)
+
+    def put(self, image):
+        self.put_idx += 1
+        self.task_queue.put((self.put_idx, image))
+
+    def get(self):
+        self.get_idx += 1  # the index needed for this request
+        if len(self.result_rank) and self.result_rank[0] == self.get_idx:
+            res = self.result_data[0]
+            del self.result_data[0], self.result_rank[0]
+            return res
+
+        while True:
+            # make sure the results are returned in the correct order
+            idx, res = self.result_queue.get()
+            if idx == self.get_idx:
+                return res
+            insert = bisect.bisect(self.result_rank, idx)
+            self.result_rank.insert(insert, idx)
+            self.result_data.insert(insert, res)
+
+    def __len__(self):
+        return self.put_idx - self.get_idx
+
+    def __call__(self, image):
+        self.put(image)
+        return self.get()
+
+    def shutdown(self):
+        for _ in self.procs:
+            self.task_queue.put(AsyncPredictor._StopToken())
+
+    @property
+    def default_buffer_size(self):
+        return len(self.procs) * 5
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..41816af2e8e538fa2ef4dc7b34f5667e0e823b90
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .utils.env import setup_environment
+
+setup_environment()
+
+
+# This line will be programatically read/write by setup.py.
+# Leave them at the bottom of this file and don't touch them.
+__version__ = "0.1.3"
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e17a9df03d886b379ffbb1c4ec41e03c5025410f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# File:
+
+
+from . import catalog as _UNUSED  # register the handler
+from .detection_checkpoint import DetectionCheckpointer
+from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer
+
+__all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/c2_model_loading.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/c2_model_loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..e27ba8463c744438d44f04f23fd4975525eba667
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/c2_model_loading.py
@@ -0,0 +1,313 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import re
+import torch
+from fvcore.common.checkpoint import (
+    get_missing_parameters_message,
+    get_unexpected_parameters_message,
+)
+
+
+def convert_basic_c2_names(original_keys):
+    """
+    Apply some basic name conversion to names in C2 weights.
+    It only deals with typical backbone models.
+
+    Args:
+        original_keys (list[str]):
+    Returns:
+        list[str]: The same number of strings matching those in original_keys.
+    """
+    layer_keys = copy.deepcopy(original_keys)
+    layer_keys = [
+        {"pred_b": "linear_b", "pred_w": "linear_w"}.get(k, k) for k in layer_keys
+    ]  # some hard-coded mappings
+
+    layer_keys = [k.replace("_", ".") for k in layer_keys]
+    layer_keys = [re.sub("\\.b$", ".bias", k) for k in layer_keys]
+    layer_keys = [re.sub("\\.w$", ".weight", k) for k in layer_keys]
+    # Uniform both bn and gn names to "norm"
+    layer_keys = [re.sub("bn\\.s$", "norm.weight", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.bias$", "norm.bias", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.rm", "norm.running_mean", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.running.mean$", "norm.running_mean", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.riv$", "norm.running_var", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.running.var$", "norm.running_var", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.gamma$", "norm.weight", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.beta$", "norm.bias", k) for k in layer_keys]
+    layer_keys = [re.sub("gn\\.s$", "norm.weight", k) for k in layer_keys]
+    layer_keys = [re.sub("gn\\.bias$", "norm.bias", k) for k in layer_keys]
+
+    # stem
+    layer_keys = [re.sub("^res\\.conv1\\.norm\\.", "conv1.norm.", k) for k in layer_keys]
+    # to avoid mis-matching with "conv1" in other components (e.g. detection head)
+    layer_keys = [re.sub("^conv1\\.", "stem.conv1.", k) for k in layer_keys]
+
+    # layer1-4 is used by torchvision, however we follow the C2 naming strategy (res2-5)
+    # layer_keys = [re.sub("^res2.", "layer1.", k) for k in layer_keys]
+    # layer_keys = [re.sub("^res3.", "layer2.", k) for k in layer_keys]
+    # layer_keys = [re.sub("^res4.", "layer3.", k) for k in layer_keys]
+    # layer_keys = [re.sub("^res5.", "layer4.", k) for k in layer_keys]
+
+    # blocks
+    layer_keys = [k.replace(".branch1.", ".shortcut.") for k in layer_keys]
+    layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys]
+    layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys]
+    layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys]
+
+    # DensePose substitutions
+    layer_keys = [re.sub("^body.conv.fcn", "body_conv_fcn", k) for k in layer_keys]
+    layer_keys = [k.replace("AnnIndex.lowres", "ann_index_lowres") for k in layer_keys]
+    layer_keys = [k.replace("Index.UV.lowres", "index_uv_lowres") for k in layer_keys]
+    layer_keys = [k.replace("U.lowres", "u_lowres") for k in layer_keys]
+    layer_keys = [k.replace("V.lowres", "v_lowres") for k in layer_keys]
+    return layer_keys
+
+
+def convert_c2_detectron_names(weights):
+    """
+    Map Caffe2 Detectron weight names to Detectron2 names.
+
+    Args:
+        weights (dict): name -> tensor
+
+    Returns:
+        dict: detectron2 names -> tensor
+        dict: detectron2 names -> C2 names
+    """
+    logger = logging.getLogger(__name__)
+    logger.info("Remapping C2 weights ......")
+    original_keys = sorted(weights.keys())
+    layer_keys = copy.deepcopy(original_keys)
+
+    layer_keys = convert_basic_c2_names(layer_keys)
+
+    # --------------------------------------------------------------------------
+    # RPN hidden representation conv
+    # --------------------------------------------------------------------------
+    # FPN case
+    # In the C2 model, the RPN hidden layer conv is defined for FPN level 2 and then
+    # shared for all other levels, hence the appearance of "fpn2"
+    layer_keys = [
+        k.replace("conv.rpn.fpn2", "proposal_generator.rpn_head.conv") for k in layer_keys
+    ]
+    # Non-FPN case
+    layer_keys = [k.replace("conv.rpn", "proposal_generator.rpn_head.conv") for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # RPN box transformation conv
+    # --------------------------------------------------------------------------
+    # FPN case (see note above about "fpn2")
+    layer_keys = [
+        k.replace("rpn.bbox.pred.fpn2", "proposal_generator.rpn_head.anchor_deltas")
+        for k in layer_keys
+    ]
+    layer_keys = [
+        k.replace("rpn.cls.logits.fpn2", "proposal_generator.rpn_head.objectness_logits")
+        for k in layer_keys
+    ]
+    # Non-FPN case
+    layer_keys = [
+        k.replace("rpn.bbox.pred", "proposal_generator.rpn_head.anchor_deltas") for k in layer_keys
+    ]
+    layer_keys = [
+        k.replace("rpn.cls.logits", "proposal_generator.rpn_head.objectness_logits")
+        for k in layer_keys
+    ]
+
+    # --------------------------------------------------------------------------
+    # Fast R-CNN box head
+    # --------------------------------------------------------------------------
+    layer_keys = [re.sub("^bbox\\.pred", "bbox_pred", k) for k in layer_keys]
+    layer_keys = [re.sub("^cls\\.score", "cls_score", k) for k in layer_keys]
+    layer_keys = [re.sub("^fc6\\.", "box_head.fc1.", k) for k in layer_keys]
+    layer_keys = [re.sub("^fc7\\.", "box_head.fc2.", k) for k in layer_keys]
+    # 4conv1fc head tensor names: head_conv1_w, head_conv1_gn_s
+    layer_keys = [re.sub("^head\\.conv", "box_head.conv", k) for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # FPN lateral and output convolutions
+    # --------------------------------------------------------------------------
+    def fpn_map(name):
+        """
+        Look for keys with the following patterns:
+        1) Starts with "fpn.inner."
+           Example: "fpn.inner.res2.2.sum.lateral.weight"
+           Meaning: These are lateral pathway convolutions
+        2) Starts with "fpn.res"
+           Example: "fpn.res2.2.sum.weight"
+           Meaning: These are FPN output convolutions
+        """
+        splits = name.split(".")
+        norm = ".norm" if "norm" in splits else ""
+        if name.startswith("fpn.inner."):
+            # splits example: ['fpn', 'inner', 'res2', '2', 'sum', 'lateral', 'weight']
+            stage = int(splits[2][len("res") :])
+            return "fpn_lateral{}{}.{}".format(stage, norm, splits[-1])
+        elif name.startswith("fpn.res"):
+            # splits example: ['fpn', 'res2', '2', 'sum', 'weight']
+            stage = int(splits[1][len("res") :])
+            return "fpn_output{}{}.{}".format(stage, norm, splits[-1])
+        return name
+
+    layer_keys = [fpn_map(k) for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # Mask R-CNN mask head
+    # --------------------------------------------------------------------------
+    # roi_heads.StandardROIHeads case
+    layer_keys = [k.replace(".[mask].fcn", "mask_head.mask_fcn") for k in layer_keys]
+    layer_keys = [re.sub("^\\.mask\\.fcn", "mask_head.mask_fcn", k) for k in layer_keys]
+    layer_keys = [k.replace("mask.fcn.logits", "mask_head.predictor") for k in layer_keys]
+    # roi_heads.Res5ROIHeads case
+    layer_keys = [k.replace("conv5.mask", "mask_head.deconv") for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # Keypoint R-CNN head
+    # --------------------------------------------------------------------------
+    # interestingly, the keypoint head convs have blob names that are simply "conv_fcnX"
+    layer_keys = [k.replace("conv.fcn", "roi_heads.keypoint_head.conv_fcn") for k in layer_keys]
+    layer_keys = [
+        k.replace("kps.score.lowres", "roi_heads.keypoint_head.score_lowres") for k in layer_keys
+    ]
+    layer_keys = [k.replace("kps.score.", "roi_heads.keypoint_head.score.") for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # Done with replacements
+    # --------------------------------------------------------------------------
+    assert len(set(layer_keys)) == len(layer_keys)
+    assert len(original_keys) == len(layer_keys)
+
+    new_weights = {}
+    new_keys_to_original_keys = {}
+    for orig, renamed in zip(original_keys, layer_keys):
+        new_keys_to_original_keys[renamed] = orig
+        if renamed.startswith("bbox_pred.") or renamed.startswith("mask_head.predictor."):
+            # remove the meaningless prediction weight for background class
+            new_start_idx = 4 if renamed.startswith("bbox_pred.") else 1
+            new_weights[renamed] = weights[orig][new_start_idx:]
+            logger.info(
+                "Remove prediction weight for background class in {}. The shape changes from "
+                "{} to {}.".format(
+                    renamed, tuple(weights[orig].shape), tuple(new_weights[renamed].shape)
+                )
+            )
+        elif renamed.startswith("cls_score."):
+            # move weights of bg class from original index 0 to last index
+            logger.info(
+                "Move classification weights for background class in {} from index 0 to "
+                "index {}.".format(renamed, weights[orig].shape[0] - 1)
+            )
+            new_weights[renamed] = torch.cat([weights[orig][1:], weights[orig][:1]])
+        else:
+            new_weights[renamed] = weights[orig]
+
+    return new_weights, new_keys_to_original_keys
+
+
+# Note the current matching is not symmetric.
+# it assumes model_state_dict will have longer names.
+def align_and_update_state_dicts(model_state_dict, ckpt_state_dict, c2_conversion=True):
+    """
+    Match names between the two state-dict, and update the values of model_state_dict in-place with
+    copies of the matched tensor in ckpt_state_dict.
+    If `c2_conversion==True`, `ckpt_state_dict` is assumed to be a Caffe2
+    model and will be renamed at first.
+
+    Strategy: suppose that the models that we will create will have prefixes appended
+    to each of its keys, for example due to an extra level of nesting that the original
+    pre-trained weights from ImageNet won't contain. For example, model.state_dict()
+    might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains
+    res2.conv1.weight. We thus want to match both parameters together.
+    For that, we look for each model weight, look among all loaded keys if there is one
+    that is a suffix of the current weight name, and use it if that's the case.
+    If multiple matches exist, take the one with longest size
+    of the corresponding name. For example, for the same model as before, the pretrained
+    weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case,
+    we want to match backbone[0].body.conv1.weight to conv1.weight, and
+    backbone[0].body.res2.conv1.weight to res2.conv1.weight.
+    """
+    model_keys = sorted(model_state_dict.keys())
+    if c2_conversion:
+        ckpt_state_dict, original_keys = convert_c2_detectron_names(ckpt_state_dict)
+        # original_keys: the name in the original dict (before renaming)
+    else:
+        original_keys = {x: x for x in ckpt_state_dict.keys()}
+    ckpt_keys = sorted(ckpt_state_dict.keys())
+
+    def match(a, b):
+        # Matched ckpt_key should be a complete (starts with '.') suffix.
+        # For example, roi_heads.mesh_head.whatever_conv1 does not match conv1,
+        # but matches whatever_conv1 or mesh_head.whatever_conv1.
+        return a == b or a.endswith("." + b)
+
+    # get a matrix of string matches, where each (i, j) entry correspond to the size of the
+    # ckpt_key string, if it matches
+    match_matrix = [len(j) if match(i, j) else 0 for i in model_keys for j in ckpt_keys]
+    match_matrix = torch.as_tensor(match_matrix).view(len(model_keys), len(ckpt_keys))
+    # use the matched one with longest size in case of multiple matches
+    max_match_size, idxs = match_matrix.max(1)
+    # remove indices that correspond to no-match
+    idxs[max_match_size == 0] = -1
+
+    # used for logging
+    max_len_model = max(len(key) for key in model_keys) if model_keys else 1
+    max_len_ckpt = max(len(key) for key in ckpt_keys) if ckpt_keys else 1
+    log_str_template = "{: <{}} loaded from {: <{}} of shape {}"
+    logger = logging.getLogger(__name__)
+    # matched_pairs (matched checkpoint key --> matched model key)
+    matched_keys = {}
+    for idx_model, idx_ckpt in enumerate(idxs.tolist()):
+        if idx_ckpt == -1:
+            continue
+        key_model = model_keys[idx_model]
+        key_ckpt = ckpt_keys[idx_ckpt]
+        value_ckpt = ckpt_state_dict[key_ckpt]
+        shape_in_model = model_state_dict[key_model].shape
+
+        if shape_in_model != value_ckpt.shape:
+            logger.warning(
+                "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format(
+                    key_ckpt, value_ckpt.shape, key_model, shape_in_model
+                )
+            )
+            logger.warning(
+                "{} will not be loaded. Please double check and see if this is desired.".format(
+                    key_ckpt
+                )
+            )
+            continue
+
+        model_state_dict[key_model] = value_ckpt.clone()
+        if key_ckpt in matched_keys:  # already added to matched_keys
+            logger.error(
+                "Ambiguity found for {} in checkpoint!"
+                "It matches at least two keys in the model ({} and {}).".format(
+                    key_ckpt, key_model, matched_keys[key_ckpt]
+                )
+            )
+            raise ValueError("Cannot match one checkpoint key to multiple keys in the model.")
+
+        matched_keys[key_ckpt] = key_model
+        logger.info(
+            log_str_template.format(
+                key_model,
+                max_len_model,
+                original_keys[key_ckpt],
+                max_len_ckpt,
+                tuple(shape_in_model),
+            )
+        )
+    matched_model_keys = matched_keys.values()
+    matched_ckpt_keys = matched_keys.keys()
+    # print warnings about unmatched keys on both side
+    unmatched_model_keys = [k for k in model_keys if k not in matched_model_keys]
+    if len(unmatched_model_keys):
+        logger.info(get_missing_parameters_message(unmatched_model_keys))
+
+    unmatched_ckpt_keys = [k for k in ckpt_keys if k not in matched_ckpt_keys]
+    if len(unmatched_ckpt_keys):
+        logger.info(
+            get_unexpected_parameters_message(original_keys[x] for x in unmatched_ckpt_keys)
+        )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/catalog.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..62f81f3c1531e2726400cba4c97b60d744670da5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/catalog.py
@@ -0,0 +1,134 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+from fvcore.common.file_io import PathHandler, PathManager
+
+
+class ModelCatalog(object):
+    """
+    Store mappings from names to third-party models.
+    """
+
+    S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron"
+
+    # MSRA models have STRIDE_IN_1X1=True. False otherwise.
+    # NOTE: all BN models here have fused BN into an affine layer.
+    # As a result, you should only load them to a model with "FrozenBN".
+    # Loading them to a model with regular BN or SyncBN is wrong.
+    # Even when loaded to FrozenBN, it is still different from affine by an epsilon,
+    # which should be negligible for training.
+    # NOTE: all models here uses PIXEL_STD=[1,1,1]
+    # NOTE: Most of the BN models here are no longer used. We use the
+    # re-converted pre-trained models under detectron2 model zoo instead.
+    C2_IMAGENET_MODELS = {
+        "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
+        "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
+        "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
+        "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
+        "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
+        "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
+        "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl",
+    }
+
+    C2_DETECTRON_PATH_FORMAT = (
+        "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl"  # noqa B950
+    )
+
+    C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival"
+    C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival"
+
+    # format: {model_name} -> part of the url
+    C2_DETECTRON_MODELS = {
+        "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW",  # noqa B950
+        "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I",  # noqa B950
+        "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7",  # noqa B950
+        "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ",  # noqa B950
+        "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB",  # noqa B950
+        "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC",  # noqa B950
+        "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT",  # noqa B950
+        "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI",  # noqa B950
+        "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q",  # noqa B950
+        "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao",  # noqa B950
+        "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L",  # noqa B950
+        "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179",  # noqa B950
+        "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2",  # noqa B950
+    }
+
+    @staticmethod
+    def get(name):
+        if name.startswith("Caffe2Detectron/COCO"):
+            return ModelCatalog._get_c2_detectron_baseline(name)
+        if name.startswith("ImageNetPretrained/"):
+            return ModelCatalog._get_c2_imagenet_pretrained(name)
+        raise RuntimeError("model not present in the catalog: {}".format(name))
+
+    @staticmethod
+    def _get_c2_imagenet_pretrained(name):
+        prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX
+        name = name[len("ImageNetPretrained/") :]
+        name = ModelCatalog.C2_IMAGENET_MODELS[name]
+        url = "/".join([prefix, name])
+        return url
+
+    @staticmethod
+    def _get_c2_detectron_baseline(name):
+        name = name[len("Caffe2Detectron/COCO/") :]
+        url = ModelCatalog.C2_DETECTRON_MODELS[name]
+        if "keypoint_rcnn" in name:
+            dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS
+        else:
+            dataset = ModelCatalog.C2_DATASET_COCO
+
+        if "35998355/rpn_R-50-C4_1x" in name:
+            # this one model is somehow different from others ..
+            type = "rpn"
+        else:
+            type = "generalized_rcnn"
+
+        # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`.
+        url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format(
+            prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset
+        )
+        return url
+
+
+class ModelCatalogHandler(PathHandler):
+    """
+    Resolve URL like catalog://.
+    """
+
+    PREFIX = "catalog://"
+
+    def _get_supported_prefixes(self):
+        return [self.PREFIX]
+
+    def _get_local_path(self, path):
+        logger = logging.getLogger(__name__)
+        catalog_path = ModelCatalog.get(path[len(self.PREFIX) :])
+        logger.info("Catalog entry {} points to {}".format(path, catalog_path))
+        return PathManager.get_local_path(catalog_path)
+
+    def _open(self, path, mode="r", **kwargs):
+        return PathManager.open(self._get_local_path(path), mode, **kwargs)
+
+
+class Detectron2Handler(PathHandler):
+    """
+    Resolve anything that's in Detectron2 model zoo.
+    """
+
+    PREFIX = "detectron2://"
+    S3_DETECTRON2_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
+
+    def _get_supported_prefixes(self):
+        return [self.PREFIX]
+
+    def _get_local_path(self, path):
+        name = path[len(self.PREFIX) :]
+        return PathManager.get_local_path(self.S3_DETECTRON2_PREFIX + name)
+
+    def _open(self, path, mode="r", **kwargs):
+        return PathManager.open(self._get_local_path(path), mode, **kwargs)
+
+
+PathManager.register_handler(ModelCatalogHandler())
+PathManager.register_handler(Detectron2Handler())
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/detection_checkpoint.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/detection_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..06e6739f7b2070cf3e2d34099188e5ea1f7cf622
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/checkpoint/detection_checkpoint.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import pickle
+from fvcore.common.checkpoint import Checkpointer
+from fvcore.common.file_io import PathManager
+
+import detectron2.utils.comm as comm
+
+from .c2_model_loading import align_and_update_state_dicts
+
+
+class DetectionCheckpointer(Checkpointer):
+    """
+    Same as :class:`Checkpointer`, but is able to handle models in detectron & detectron2
+    model zoo, and apply conversions for legacy models.
+    """
+
+    def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
+        is_main_process = comm.is_main_process()
+        super().__init__(
+            model,
+            save_dir,
+            save_to_disk=is_main_process if save_to_disk is None else save_to_disk,
+            **checkpointables,
+        )
+
+    def _load_file(self, filename):
+        if filename.endswith(".pkl"):
+            with PathManager.open(filename, "rb") as f:
+                data = pickle.load(f, encoding="latin1")
+            if "model" in data and "__author__" in data:
+                # file is in Detectron2 model zoo format
+                self.logger.info("Reading a file from '{}'".format(data["__author__"]))
+                return data
+            else:
+                # assume file is from Caffe2 / Detectron1 model zoo
+                if "blobs" in data:
+                    # Detection models have "blobs", but ImageNet models don't
+                    data = data["blobs"]
+                data = {k: v for k, v in data.items() if not k.endswith("_momentum")}
+                return {"model": data, "__author__": "Caffe2", "matching_heuristics": True}
+
+        loaded = super()._load_file(filename)  # load native pth checkpoint
+        if "model" not in loaded:
+            loaded = {"model": loaded}
+        return loaded
+
+    def _load_model(self, checkpoint):
+        if checkpoint.get("matching_heuristics", False):
+            self._convert_ndarray_to_tensor(checkpoint["model"])
+            # convert weights by name-matching heuristics
+            model_state_dict = self.model.state_dict()
+            align_and_update_state_dicts(
+                model_state_dict,
+                checkpoint["model"],
+                c2_conversion=checkpoint.get("__author__", None) == "Caffe2",
+            )
+            checkpoint["model"] = model_state_dict
+        # for non-caffe2 models, use standard ways to load it
+        incompatible = super()._load_model(checkpoint)
+        if incompatible is None:  # support older versions of fvcore
+            return None
+
+        model_buffers = dict(self.model.named_buffers(recurse=False))
+        for k in ["pixel_mean", "pixel_std"]:
+            # Ignore missing key message about pixel_mean/std.
+            # Though they may be missing in old checkpoints, they will be correctly
+            # initialized from config anyway.
+            if k in model_buffers:
+                try:
+                    incompatible.missing_keys.remove(k)
+                except ValueError:
+                    pass
+        return incompatible
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f996ecd74947c504f86e3e6854a45bd74ad32c1c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .compat import downgrade_config, upgrade_config
+from .config import CfgNode, get_cfg, global_cfg, set_global_cfg, configurable
+
+__all__ = [
+    "CfgNode",
+    "get_cfg",
+    "global_cfg",
+    "set_global_cfg",
+    "downgrade_config",
+    "upgrade_config",
+    "configurable",
+]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/compat.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..41fe3a00ca05885abf28106808fe7f8d862b5036
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/compat.py
@@ -0,0 +1,229 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Backward compatibility of configs.
+
+Instructions to bump version:
++ It's not needed to bump version if new keys are added.
+  It's only needed when backward-incompatible changes happen
+  (i.e., some existing keys disappear, or the meaning of a key changes)
++ To bump version, do the following:
+    1. Increment _C.VERSION in defaults.py
+    2. Add a converter in this file.
+
+      Each ConverterVX has a function "upgrade" which in-place upgrades config from X-1 to X,
+      and a function "downgrade" which in-place downgrades config from X to X-1
+
+      In each function, VERSION is left unchanged.
+
+      Each converter assumes that its input has the relevant keys
+      (i.e., the input is not a partial config).
+    3. Run the tests (test_config.py) to make sure the upgrade & downgrade
+       functions are consistent.
+"""
+
+import logging
+from typing import List, Optional, Tuple
+
+from .config import CfgNode as CN
+from .defaults import _C
+
+__all__ = ["upgrade_config", "downgrade_config"]
+
+
+def upgrade_config(cfg: CN, to_version: Optional[int] = None) -> CN:
+    """
+    Upgrade a config from its current version to a newer version.
+
+    Args:
+        cfg (CfgNode):
+        to_version (int): defaults to the latest version.
+    """
+    cfg = cfg.clone()
+    if to_version is None:
+        to_version = _C.VERSION
+
+    assert cfg.VERSION <= to_version, "Cannot upgrade from v{} to v{}!".format(
+        cfg.VERSION, to_version
+    )
+    for k in range(cfg.VERSION, to_version):
+        converter = globals()["ConverterV" + str(k + 1)]
+        converter.upgrade(cfg)
+        cfg.VERSION = k + 1
+    return cfg
+
+
+def downgrade_config(cfg: CN, to_version: int) -> CN:
+    """
+    Downgrade a config from its current version to an older version.
+
+    Args:
+        cfg (CfgNode):
+        to_version (int):
+
+    Note:
+        A general downgrade of arbitrary configs is not always possible due to the
+        different functionalities in different versions.
+        The purpose of downgrade is only to recover the defaults in old versions,
+        allowing it to load an old partial yaml config.
+        Therefore, the implementation only needs to fill in the default values
+        in the old version when a general downgrade is not possible.
+    """
+    cfg = cfg.clone()
+    assert cfg.VERSION >= to_version, "Cannot downgrade from v{} to v{}!".format(
+        cfg.VERSION, to_version
+    )
+    for k in range(cfg.VERSION, to_version, -1):
+        converter = globals()["ConverterV" + str(k)]
+        converter.downgrade(cfg)
+        cfg.VERSION = k - 1
+    return cfg
+
+
+def guess_version(cfg: CN, filename: str) -> int:
+    """
+    Guess the version of a partial config where the VERSION field is not specified.
+    Returns the version, or the latest if cannot make a guess.
+
+    This makes it easier for users to migrate.
+    """
+    logger = logging.getLogger(__name__)
+
+    def _has(name: str) -> bool:
+        cur = cfg
+        for n in name.split("."):
+            if n not in cur:
+                return False
+            cur = cur[n]
+        return True
+
+    # Most users' partial configs have "MODEL.WEIGHT", so guess on it
+    ret = None
+    if _has("MODEL.WEIGHT") or _has("TEST.AUG_ON"):
+        ret = 1
+
+    if ret is not None:
+        logger.warning("Config '{}' has no VERSION. Assuming it to be v{}.".format(filename, ret))
+    else:
+        ret = _C.VERSION
+        logger.warning(
+            "Config '{}' has no VERSION. Assuming it to be compatible with latest v{}.".format(
+                filename, ret
+            )
+        )
+    return ret
+
+
+def _rename(cfg: CN, old: str, new: str) -> None:
+    old_keys = old.split(".")
+    new_keys = new.split(".")
+
+    def _set(key_seq: List[str], val: str) -> None:
+        cur = cfg
+        for k in key_seq[:-1]:
+            if k not in cur:
+                cur[k] = CN()
+            cur = cur[k]
+        cur[key_seq[-1]] = val
+
+    def _get(key_seq: List[str]) -> CN:
+        cur = cfg
+        for k in key_seq:
+            cur = cur[k]
+        return cur
+
+    def _del(key_seq: List[str]) -> None:
+        cur = cfg
+        for k in key_seq[:-1]:
+            cur = cur[k]
+        del cur[key_seq[-1]]
+        if len(cur) == 0 and len(key_seq) > 1:
+            _del(key_seq[:-1])
+
+    _set(new_keys, _get(old_keys))
+    _del(old_keys)
+
+
+class _RenameConverter:
+    """
+    A converter that handles simple rename.
+    """
+
+    RENAME: List[Tuple[str, str]] = []  # list of tuples of (old name, new name)
+
+    @classmethod
+    def upgrade(cls, cfg: CN) -> None:
+        for old, new in cls.RENAME:
+            _rename(cfg, old, new)
+
+    @classmethod
+    def downgrade(cls, cfg: CN) -> None:
+        for old, new in cls.RENAME[::-1]:
+            _rename(cfg, new, old)
+
+
+class ConverterV1(_RenameConverter):
+    RENAME = [("MODEL.RPN_HEAD.NAME", "MODEL.RPN.HEAD_NAME")]
+
+
+class ConverterV2(_RenameConverter):
+    """
+    A large bulk of rename, before public release.
+    """
+
+    RENAME = [
+        ("MODEL.WEIGHT", "MODEL.WEIGHTS"),
+        ("MODEL.PANOPTIC_FPN.SEMANTIC_LOSS_SCALE", "MODEL.SEM_SEG_HEAD.LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.RPN_LOSS_SCALE", "MODEL.RPN.LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.INSTANCE_LOSS_SCALE", "MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.COMBINE_ON", "MODEL.PANOPTIC_FPN.COMBINE.ENABLED"),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_OVERLAP_THRESHOLD",
+            "MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH",
+        ),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_STUFF_AREA_LIMIT",
+            "MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT",
+        ),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_INSTANCES_CONFIDENCE_THRESHOLD",
+            "MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH",
+        ),
+        ("MODEL.ROI_HEADS.SCORE_THRESH", "MODEL.ROI_HEADS.SCORE_THRESH_TEST"),
+        ("MODEL.ROI_HEADS.NMS", "MODEL.ROI_HEADS.NMS_THRESH_TEST"),
+        ("MODEL.RETINANET.INFERENCE_SCORE_THRESHOLD", "MODEL.RETINANET.SCORE_THRESH_TEST"),
+        ("MODEL.RETINANET.INFERENCE_TOPK_CANDIDATES", "MODEL.RETINANET.TOPK_CANDIDATES_TEST"),
+        ("MODEL.RETINANET.INFERENCE_NMS_THRESHOLD", "MODEL.RETINANET.NMS_THRESH_TEST"),
+        ("TEST.DETECTIONS_PER_IMG", "TEST.DETECTIONS_PER_IMAGE"),
+        ("TEST.AUG_ON", "TEST.AUG.ENABLED"),
+        ("TEST.AUG_MIN_SIZES", "TEST.AUG.MIN_SIZES"),
+        ("TEST.AUG_MAX_SIZE", "TEST.AUG.MAX_SIZE"),
+        ("TEST.AUG_FLIP", "TEST.AUG.FLIP"),
+    ]
+
+    @classmethod
+    def upgrade(cls, cfg: CN) -> None:
+        super().upgrade(cfg)
+
+        if cfg.MODEL.META_ARCHITECTURE == "RetinaNet":
+            _rename(
+                cfg, "MODEL.RETINANET.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS"
+            )
+            _rename(cfg, "MODEL.RETINANET.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+            del cfg["MODEL"]["RPN"]["ANCHOR_SIZES"]
+            del cfg["MODEL"]["RPN"]["ANCHOR_ASPECT_RATIOS"]
+        else:
+            _rename(cfg, "MODEL.RPN.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS")
+            _rename(cfg, "MODEL.RPN.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+            del cfg["MODEL"]["RETINANET"]["ANCHOR_SIZES"]
+            del cfg["MODEL"]["RETINANET"]["ANCHOR_ASPECT_RATIOS"]
+        del cfg["MODEL"]["RETINANET"]["ANCHOR_STRIDES"]
+
+    @classmethod
+    def downgrade(cls, cfg: CN) -> None:
+        super().downgrade(cfg)
+
+        _rename(cfg, "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS", "MODEL.RPN.ANCHOR_ASPECT_RATIOS")
+        _rename(cfg, "MODEL.ANCHOR_GENERATOR.SIZES", "MODEL.RPN.ANCHOR_SIZES")
+        cfg.MODEL.RETINANET.ANCHOR_ASPECT_RATIOS = cfg.MODEL.RPN.ANCHOR_ASPECT_RATIOS
+        cfg.MODEL.RETINANET.ANCHOR_SIZES = cfg.MODEL.RPN.ANCHOR_SIZES
+        cfg.MODEL.RETINANET.ANCHOR_STRIDES = []  # this is not used anywhere in any version
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/config.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..14ad524f00e706ddba567a62f805481c2f185a8e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/config.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import functools
+import inspect
+import logging
+from fvcore.common.config import CfgNode as _CfgNode
+from fvcore.common.file_io import PathManager
+
+
+class CfgNode(_CfgNode):
+    """
+    The same as `fvcore.common.config.CfgNode`, but different in:
+
+    1. Use unsafe yaml loading by default.
+       Note that this may lead to arbitrary code execution: you must not
+       load a config file from untrusted sources before manually inspecting
+       the content of the file.
+    2. Support config versioning.
+       When attempting to merge an old config, it will convert the old config automatically.
+    """
+
+    # Note that the default value of allow_unsafe is changed to True
+    def merge_from_file(self, cfg_filename: str, allow_unsafe: bool = True) -> None:
+        assert PathManager.isfile(cfg_filename), f"Config file '{cfg_filename}' does not exist!"
+        loaded_cfg = _CfgNode.load_yaml_with_base(cfg_filename, allow_unsafe=allow_unsafe)
+        loaded_cfg = type(self)(loaded_cfg)
+
+        # defaults.py needs to import CfgNode
+        from .defaults import _C
+
+        latest_ver = _C.VERSION
+        assert (
+            latest_ver == self.VERSION
+        ), "CfgNode.merge_from_file is only allowed on a config object of latest version!"
+
+        logger = logging.getLogger(__name__)
+
+        loaded_ver = loaded_cfg.get("VERSION", None)
+        if loaded_ver is None:
+            from .compat import guess_version
+
+            loaded_ver = guess_version(loaded_cfg, cfg_filename)
+        assert loaded_ver <= self.VERSION, "Cannot merge a v{} config into a v{} config.".format(
+            loaded_ver, self.VERSION
+        )
+
+        if loaded_ver == self.VERSION:
+            self.merge_from_other_cfg(loaded_cfg)
+        else:
+            # compat.py needs to import CfgNode
+            from .compat import upgrade_config, downgrade_config
+
+            logger.warning(
+                "Loading an old v{} config file '{}' by automatically upgrading to v{}. "
+                "See docs/CHANGELOG.md for instructions to update your files.".format(
+                    loaded_ver, cfg_filename, self.VERSION
+                )
+            )
+            # To convert, first obtain a full config at an old version
+            old_self = downgrade_config(self, to_version=loaded_ver)
+            old_self.merge_from_other_cfg(loaded_cfg)
+            new_config = upgrade_config(old_self)
+            self.clear()
+            self.update(new_config)
+
+    def dump(self, *args, **kwargs):
+        """
+        Returns:
+            str: a yaml string representation of the config
+        """
+        # to make it show up in docs
+        return super().dump(*args, **kwargs)
+
+
+global_cfg = CfgNode()
+
+
+def get_cfg() -> CfgNode:
+    """
+    Get a copy of the default config.
+
+    Returns:
+        a detectron2 CfgNode instance.
+    """
+    from .defaults import _C
+
+    return _C.clone()
+
+
+def set_global_cfg(cfg: CfgNode) -> None:
+    """
+    Let the global config point to the given cfg.
+
+    Assume that the given "cfg" has the key "KEY", after calling
+    `set_global_cfg(cfg)`, the key can be accessed by:
+
+    .. code-block:: python
+
+        from detectron2.config import global_cfg
+        print(global_cfg.KEY)
+
+    By using a hacky global config, you can access these configs anywhere,
+    without having to pass the config object or the values deep into the code.
+    This is a hacky feature introduced for quick prototyping / research exploration.
+    """
+    global global_cfg
+    global_cfg.clear()
+    global_cfg.update(cfg)
+
+
+def configurable(init_func):
+    """
+    Decorate a class's __init__ method so that it can be called with a CfgNode
+    object using the class's from_config classmethod.
+
+    Examples:
+
+    .. code-block:: python
+
+        class A:
+            @configurable
+            def __init__(self, a, b=2, c=3):
+                pass
+
+            @classmethod
+            def from_config(cls, cfg):
+                # Returns kwargs to be passed to __init__
+                return {"a": cfg.A, "b": cfg.B}
+
+        a1 = A(a=1, b=2)  # regular construction
+        a2 = A(cfg)       # construct with a cfg
+        a3 = A(cfg, b=3, c=4)  # construct with extra overwrite
+    """
+    assert init_func.__name__ == "__init__", "@configurable should only be used for __init__!"
+    if init_func.__module__.startswith("detectron2."):
+        assert (
+            init_func.__doc__ is not None and "experimental" in init_func.__doc__
+        ), f"configurable {init_func} should be marked experimental"
+
+    @functools.wraps(init_func)
+    def wrapped(self, *args, **kwargs):
+        try:
+            from_config_func = type(self).from_config
+        except AttributeError:
+            raise AttributeError("Class with @configurable must have a 'from_config' classmethod.")
+        if not inspect.ismethod(from_config_func):
+            raise TypeError("Class with @configurable must have a 'from_config' classmethod.")
+
+        if _called_with_cfg(*args, **kwargs):
+            explicit_args = _get_args_from_config(from_config_func, *args, **kwargs)
+            init_func(self, **explicit_args)
+        else:
+            init_func(self, *args, **kwargs)
+
+    return wrapped
+
+
+def _get_args_from_config(from_config_func, *args, **kwargs):
+    """
+    Use `from_config` to obtain explicit arguments.
+
+    Returns:
+        dict: arguments to be used for cls.__init__
+    """
+    signature = inspect.signature(from_config_func)
+    if list(signature.parameters.keys())[0] != "cfg":
+        raise TypeError(
+            f"{from_config_func.__self__}.from_config must take 'cfg' as the first argument!"
+        )
+    support_var_arg = any(
+        param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD]
+        for param in signature.parameters.values()
+    )
+    if support_var_arg:  # forward all arguments to from_config, if from_config accepts them
+        ret = from_config_func(*args, **kwargs)
+    else:
+        # forward supported arguments to from_config
+        supported_arg_names = set(signature.parameters.keys())
+        extra_kwargs = {}
+        for name in list(kwargs.keys()):
+            if name not in supported_arg_names:
+                extra_kwargs[name] = kwargs.pop(name)
+        ret = from_config_func(*args, **kwargs)
+        # forward the other arguments to __init__
+        ret.update(extra_kwargs)
+    return ret
+
+
+def _called_with_cfg(*args, **kwargs):
+    """
+    Returns:
+        bool: whether the arguments contain CfgNode and should be considered
+            forwarded to from_config.
+    """
+    if len(args) and isinstance(args[0], _CfgNode):
+        return True
+    if isinstance(kwargs.pop("cfg", None), _CfgNode):
+        return True
+    # `from_config`'s first argument is forced to be "cfg".
+    # So the above check covers all cases.
+    return False
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/defaults.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9ad62f5f01606438082e012ba5a4a68381c3b3c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/config/defaults.py
@@ -0,0 +1,598 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .config import CfgNode as CN
+
+# -----------------------------------------------------------------------------
+# Convention about Training / Test specific parameters
+# -----------------------------------------------------------------------------
+# Whenever an argument can be either used for training or for testing, the
+# corresponding name will be post-fixed by a _TRAIN for a training parameter,
+# or _TEST for a test-specific parameter.
+# For example, the number of images during training will be
+# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
+# IMAGES_PER_BATCH_TEST
+
+# -----------------------------------------------------------------------------
+# Config definition
+# -----------------------------------------------------------------------------
+
+_C = CN()
+
+# The version number, to upgrade from old configs to new ones if any
+# changes happen. It's recommended to keep a VERSION in your config file.
+_C.VERSION = 2
+
+_C.MODEL = CN()
+_C.MODEL.LOAD_PROPOSALS = False
+_C.MODEL.MASK_ON = False
+_C.MODEL.KEYPOINT_ON = False
+_C.MODEL.DEVICE = "cuda"
+_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
+
+# Path (possibly with schema like catalog:// or detectron2://) to a checkpoint file
+# to be loaded to the model. You can find available models in the model zoo.
+_C.MODEL.WEIGHTS = ""
+
+# Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR).
+# To train on images of different number of channels, just set different mean & std.
+# Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
+_C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675]
+# When using pre-trained models in Detectron1 or any MSRA models,
+# std has been absorbed into its conv1 weights, so the std needs to be set 1.
+# Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
+_C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0]
+
+
+# -----------------------------------------------------------------------------
+# INPUT
+# -----------------------------------------------------------------------------
+_C.INPUT = CN()
+# Size of the smallest side of the image during training
+_C.INPUT.MIN_SIZE_TRAIN = (800,)
+# Sample size of smallest side by choice or random selection from range give by
+# INPUT.MIN_SIZE_TRAIN
+_C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice"
+# Maximum size of the side of the image during training
+_C.INPUT.MAX_SIZE_TRAIN = 1333
+# Size of the smallest side of the image during testing. Set to zero to disable resize in testing.
+_C.INPUT.MIN_SIZE_TEST = 800
+# Maximum size of the side of the image during testing
+_C.INPUT.MAX_SIZE_TEST = 1333
+
+# `True` if cropping is used for data augmentation during training
+_C.INPUT.CROP = CN({"ENABLED": False})
+# Cropping type:
+# - "relative" crop (H * CROP.SIZE[0], W * CROP.SIZE[1]) part of an input of size (H, W)
+# - "relative_range" uniformly sample relative crop size from between [CROP.SIZE[0], [CROP.SIZE[1]].
+#   and  [1, 1] and use it as in "relative" scenario.
+# - "absolute" crop part of an input with absolute size: (CROP.SIZE[0], CROP.SIZE[1]).
+_C.INPUT.CROP.TYPE = "relative_range"
+# Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of
+# pixels if CROP.TYPE is "absolute"
+_C.INPUT.CROP.SIZE = [0.9, 0.9]
+
+
+# Whether the model needs RGB, YUV, HSV etc.
+# Should be one of the modes defined here, as we use PIL to read the image:
+# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
+# with BGR being the one exception. One can set image format to BGR, we will
+# internally use RGB for conversion and flip the channels over
+_C.INPUT.FORMAT = "BGR"
+# The ground truth mask format that the model will use.
+# Mask R-CNN supports either "polygon" or "bitmask" as ground truth.
+_C.INPUT.MASK_FORMAT = "polygon"  # alternative: "bitmask"
+
+
+# -----------------------------------------------------------------------------
+# Dataset
+# -----------------------------------------------------------------------------
+_C.DATASETS = CN()
+# List of the dataset names for training. Must be registered in DatasetCatalog
+_C.DATASETS.TRAIN = ()
+# List of the pre-computed proposal files for training, which must be consistent
+# with data listed in DATASETS.TRAIN.
+_C.DATASETS.PROPOSAL_FILES_TRAIN = ()
+# Number of top scoring precomputed proposals to keep for training
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000
+# List of the dataset names for testing. Must be registered in DatasetCatalog
+_C.DATASETS.TEST = ()
+# List of the pre-computed proposal files for test, which must be consistent
+# with data listed in DATASETS.TEST.
+_C.DATASETS.PROPOSAL_FILES_TEST = ()
+# Number of top scoring precomputed proposals to keep for test
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000
+
+# -----------------------------------------------------------------------------
+# DataLoader
+# -----------------------------------------------------------------------------
+_C.DATALOADER = CN()
+# Number of data loading threads
+_C.DATALOADER.NUM_WORKERS = 4
+# If True, each batch should contain only images for which the aspect ratio
+# is compatible. This groups portrait images together, and landscape images
+# are not batched with portrait images.
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True
+# Options: TrainingSampler, RepeatFactorTrainingSampler
+_C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler"
+# Repeat threshold for RepeatFactorTrainingSampler
+_C.DATALOADER.REPEAT_THRESHOLD = 0.0
+# if True, the dataloader will filter out images that have no associated
+# annotations at train time.
+_C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True
+
+# ---------------------------------------------------------------------------- #
+# Backbone options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.BACKBONE = CN()
+
+_C.MODEL.BACKBONE.NAME = "build_resnet_backbone"
+# Freeze the first several stages so they are not trained.
+# There are 5 stages in ResNet. The first is a convolution, and the following
+# stages are each group of residual blocks.
+_C.MODEL.BACKBONE.FREEZE_AT = 2
+
+
+# ---------------------------------------------------------------------------- #
+# FPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FPN = CN()
+# Names of the input feature maps to be used by FPN
+# They must have contiguous power of 2 strides
+# e.g., ["res2", "res3", "res4", "res5"]
+_C.MODEL.FPN.IN_FEATURES = []
+_C.MODEL.FPN.OUT_CHANNELS = 256
+
+# Options: "" (no norm), "GN"
+_C.MODEL.FPN.NORM = ""
+
+# Types for fusing the FPN top-down and lateral features. Can be either "sum" or "avg"
+_C.MODEL.FPN.FUSE_TYPE = "sum"
+
+
+# ---------------------------------------------------------------------------- #
+# Proposal generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.PROPOSAL_GENERATOR = CN()
+# Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals"
+_C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
+# Proposal height and width both need to be greater than MIN_SIZE
+# (a the scale used during training or inference)
+_C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0
+
+
+# ---------------------------------------------------------------------------- #
+# Anchor generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ANCHOR_GENERATOR = CN()
+# The generator can be any name in the ANCHOR_GENERATOR registry
+_C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
+# Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input.
+# Format: list[list[float]]. SIZES[i] specifies the list of sizes
+# to use for IN_FEATURES[i]; len(SIZES) == len(IN_FEATURES) must be true,
+# or len(SIZES) == 1 is true and size list SIZES[0] is used for all
+# IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]]
+# Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect
+# ratios are generated by an anchor generator.
+# Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W)
+# to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true,
+# or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used
+# for all IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]]
+# Anchor angles.
+# list[list[float]], the angle in degrees, for each input feature map.
+# ANGLES[i] specifies the list of angles for IN_FEATURES[i].
+_C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]]
+# Relative offset between the center of the first anchor and the top-left corner of the image
+# Value has to be in [0, 1). Recommend to use 0.5, which means half stride.
+# The value is not expected to affect model accuracy.
+_C.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0
+
+# ---------------------------------------------------------------------------- #
+# RPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RPN = CN()
+_C.MODEL.RPN.HEAD_NAME = "StandardRPNHead"  # used by RPN_HEAD_REGISTRY
+
+# Names of the input feature maps to be used by RPN
+# e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN
+_C.MODEL.RPN.IN_FEATURES = ["res4"]
+# Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels
+# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
+_C.MODEL.RPN.BOUNDARY_THRESH = -1
+# IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD]
+# Minimum overlap required between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
+# ==> positive RPN example: 1)
+# Maximum overlap allowed between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
+# ==> negative RPN example: 0)
+# Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD)
+# are ignored (-1)
+_C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7]
+_C.MODEL.RPN.IOU_LABELS = [0, -1, 1]
+# Total number of RPN examples per image
+_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
+# Target fraction of foreground (positive) examples per RPN minibatch
+_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
+# Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets
+_C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.RPN.SMOOTH_L1_BETA = 0.0
+_C.MODEL.RPN.LOSS_WEIGHT = 1.0
+# Number of top scoring RPN proposals to keep before applying NMS
+# When FPN is used, this is *per FPN level* (not total)
+_C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000
+_C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000
+# Number of top scoring RPN proposals to keep after applying NMS
+# When FPN is used, this limit is applied per level and then again to the union
+# of proposals from all levels
+# NOTE: When FPN is used, the meaning of this config is different from Detectron1.
+# It means per-batch topk in Detectron1, but per-image topk here.
+# See "modeling/rpn/rpn_outputs.py" for details.
+_C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000
+_C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000
+# NMS threshold used on RPN proposals
+_C.MODEL.RPN.NMS_THRESH = 0.7
+
+# ---------------------------------------------------------------------------- #
+# ROI HEADS options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_HEADS = CN()
+_C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads"
+# Number of foreground classes
+_C.MODEL.ROI_HEADS.NUM_CLASSES = 80
+# Names of the input feature maps to be used by ROI heads
+# Currently all heads (box, mask, ...) use the same input feature map list
+# e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN
+_C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"]
+# IOU overlap ratios [IOU_THRESHOLD]
+# Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD)
+# Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD)
+_C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5]
+_C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1]
+# RoI minibatch size *per image* (number of regions of interest [ROIs])
+# Total number of RoIs per training minibatch =
+#   ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH
+# E.g., a common configuration is: 512 * 16 = 8192
+_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
+# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
+_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
+
+# Only used on test mode
+
+# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
+# balance obtaining high recall with not having too many low precision
+# detections that will slow down inference post processing steps (like NMS)
+# A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down
+# inference.
+_C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05
+# Overlap threshold used for non-maximum suppression (suppress boxes with
+# IoU >= this threshold)
+_C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5
+# If True, augment proposals with ground-truth boxes before sampling proposals to
+# train ROI heads.
+_C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True
+
+# ---------------------------------------------------------------------------- #
+# Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_HEAD = CN()
+# C4 don't use head name option
+# Options for non-C4 models: FastRCNNConvFCHead,
+_C.MODEL.ROI_BOX_HEAD.NAME = ""
+# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
+# These are empirically chosen to approximately lead to unit variance targets
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0
+_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+_C.MODEL.ROI_BOX_HEAD.NUM_FC = 0
+# Hidden layer dimension for FC layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024
+_C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0
+# Channel dimension for Conv layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_BOX_HEAD.NORM = ""
+# Whether to use class agnostic for bbox regression
+_C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False
+# If true, RoI heads use bounding boxes predicted by the box head rather than proposal boxes.
+_C.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES = False
+
+# ---------------------------------------------------------------------------- #
+# Cascaded Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_CASCADE_HEAD = CN()
+# The number of cascade stages is implicitly defined by the length of the following two configs.
+_C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = (
+    (10.0, 10.0, 5.0, 5.0),
+    (20.0, 20.0, 10.0, 10.0),
+    (30.0, 30.0, 15.0, 15.0),
+)
+_C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7)
+
+
+# ---------------------------------------------------------------------------- #
+# Mask Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_MASK_HEAD = CN()
+_C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead"
+_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0  # The number of convs in the mask head
+_C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_MASK_HEAD.NORM = ""
+# Whether to use class agnostic for mask prediction
+_C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+
+# ---------------------------------------------------------------------------- #
+# Keypoint Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_KEYPOINT_HEAD = CN()
+_C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead"
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8))
+_C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17  # 17 is the number of keypoints in COCO.
+
+# Images with too few (or no) keypoints are excluded from training.
+_C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1
+# Normalize by the total number of visible keypoints in the minibatch if True.
+# Otherwise, normalize by the total number of keypoints that could ever exist
+# in the minibatch.
+# The keypoint softmax loss is only calculated on visible keypoints.
+# Since the number of visible keypoints can vary significantly between
+# minibatches, this has the effect of up-weighting the importance of
+# minibatches with few visible keypoints. (Imagine the extreme case of
+# only one visible keypoint versus N: in the case of N, each one
+# contributes 1/N to the gradient compared to the single keypoint
+# determining the gradient direction). Instead, we can normalize the
+# loss by the total number of keypoints, if it were the case that all
+# keypoints were visible in a full minibatch. (Returning to the example,
+# this means that the one visible keypoint contributes as much as each
+# of the N keypoints.)
+_C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True
+# Multi-task loss weight to use for keypoints
+# Recommended values:
+#   - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True
+#   - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False
+_C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+# ---------------------------------------------------------------------------- #
+# Semantic Segmentation Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.SEM_SEG_HEAD = CN()
+_C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead"
+_C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"]
+# Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for
+# the correposnding pixel.
+_C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255
+# Number of classes in the semantic segmentation head
+_C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54
+# Number of channels in the 3x3 convs inside semantic-FPN heads.
+_C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128
+# Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride.
+_C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4
+# Normalization method for the convolution layers. Options: "" (no norm), "GN".
+_C.MODEL.SEM_SEG_HEAD.NORM = "GN"
+_C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0
+
+_C.MODEL.PANOPTIC_FPN = CN()
+# Scaling of all losses from instance detection / segmentation head.
+_C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0
+
+# options when combining instance & semantic segmentation outputs
+_C.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True})
+_C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5
+_C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096
+_C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5
+
+
+# ---------------------------------------------------------------------------- #
+# RetinaNet Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RETINANET = CN()
+
+# This is the number of foreground classes.
+_C.MODEL.RETINANET.NUM_CLASSES = 80
+
+_C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
+
+# Convolutions to use in the cls and bbox tower
+# NOTE: this doesn't include the last conv for logits
+_C.MODEL.RETINANET.NUM_CONVS = 4
+
+# IoU overlap ratio [bg, fg] for labeling anchors.
+# Anchors with < bg are labeled negative (0)
+# Anchors  with >= bg and < fg are ignored (-1)
+# Anchors with >= fg are labeled positive (1)
+_C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5]
+_C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1]
+
+# Prior prob for rare case (i.e. foreground) at the beginning of training.
+# This is used to set the bias for the logits layer of the classifier subnet.
+# This improves training stability in the case of heavy class imbalance.
+_C.MODEL.RETINANET.PRIOR_PROB = 0.01
+
+# Inference cls score threshold, only anchors with score > INFERENCE_TH are
+# considered for inference (to improve speed)
+_C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
+_C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
+_C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
+
+# Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets
+_C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+
+# Loss parameters
+_C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
+_C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
+_C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1
+
+
+# ---------------------------------------------------------------------------- #
+# ResNe[X]t options (ResNets = {ResNet, ResNeXt}
+# Note that parts of a resnet may be used for both the backbone and the head
+# These options apply to both
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RESNETS = CN()
+
+_C.MODEL.RESNETS.DEPTH = 50
+_C.MODEL.RESNETS.OUT_FEATURES = ["res4"]  # res4 for C4 backbone, res2..5 for FPN backbone
+
+# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
+_C.MODEL.RESNETS.NUM_GROUPS = 1
+
+# Options: FrozenBN, GN, "SyncBN", "BN"
+_C.MODEL.RESNETS.NORM = "FrozenBN"
+
+# Baseline width of each group.
+# Scaling this parameters will scale the width of all bottleneck layers.
+_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
+
+# Place the stride 2 conv on the 1x1 filter
+# Use True only for the original MSRA ResNet; use False for C2 and Torch models
+_C.MODEL.RESNETS.STRIDE_IN_1X1 = True
+
+# Apply dilation in stage "res5"
+_C.MODEL.RESNETS.RES5_DILATION = 1
+
+# Output width of res2. Scaling this parameters will scale the width of all 1x1 convs in ResNet
+# For R18 and R34, this needs to be set to 64
+_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
+_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
+
+# Apply Deformable Convolution in stages
+# Specify if apply deform_conv on Res2, Res3, Res4, Res5
+_C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False]
+# Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168);
+# Use False for DeformableV1.
+_C.MODEL.RESNETS.DEFORM_MODULATED = False
+# Number of groups in deformable conv.
+_C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1
+
+
+# ---------------------------------------------------------------------------- #
+# Solver
+# ---------------------------------------------------------------------------- #
+_C.SOLVER = CN()
+
+# See detectron2/solver/build.py for LR scheduler options
+_C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"
+
+_C.SOLVER.MAX_ITER = 40000
+
+_C.SOLVER.BASE_LR = 0.001
+
+_C.SOLVER.MOMENTUM = 0.9
+
+_C.SOLVER.NESTEROV = False
+
+_C.SOLVER.WEIGHT_DECAY = 0.0001
+# The weight decay that's applied to parameters of normalization layers
+# (typically the affine transformation)
+_C.SOLVER.WEIGHT_DECAY_NORM = 0.0
+
+_C.SOLVER.GAMMA = 0.1
+# The iteration number to decrease learning rate by GAMMA.
+_C.SOLVER.STEPS = (30000,)
+
+_C.SOLVER.WARMUP_FACTOR = 1.0 / 1000
+_C.SOLVER.WARMUP_ITERS = 1000
+_C.SOLVER.WARMUP_METHOD = "linear"
+
+# Save a checkpoint after every this number of iterations
+_C.SOLVER.CHECKPOINT_PERIOD = 5000
+
+# Number of images per batch across all machines.
+# If we have 16 GPUs and IMS_PER_BATCH = 32,
+# each GPU will see 2 images per batch.
+_C.SOLVER.IMS_PER_BATCH = 16
+
+# Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for
+# biases. This is not useful (at least for recent models). You should avoid
+# changing these and they exist only to reproduce Detectron v1 training if
+# desired.
+_C.SOLVER.BIAS_LR_FACTOR = 1.0
+_C.SOLVER.WEIGHT_DECAY_BIAS = _C.SOLVER.WEIGHT_DECAY
+
+# Gradient clipping
+_C.SOLVER.CLIP_GRADIENTS = CN({"ENABLED": False})
+# Type of gradient clipping, currently 2 values are supported:
+# - "value": the absolute values of elements of each gradients are clipped
+# - "norm": the norm of the gradient for each parameter is clipped thus
+#   affecting all elements in the parameter
+_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value"
+# Maximum absolute value used for clipping gradients
+_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
+# Floating point number p for L-p norm to be used with the "norm"
+# gradient clipping type; for L-inf, please specify .inf
+_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
+
+# ---------------------------------------------------------------------------- #
+# Specific test options
+# ---------------------------------------------------------------------------- #
+_C.TEST = CN()
+# For end-to-end tests to verify the expected accuracy.
+# Each item is [task, metric, value, tolerance]
+# e.g.: [['bbox', 'AP', 38.5, 0.2]]
+_C.TEST.EXPECTED_RESULTS = []
+# The period (in terms of steps) to evaluate the model during training.
+# Set to 0 to disable.
+_C.TEST.EVAL_PERIOD = 0
+# The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval
+# When empty it will use the defaults in COCO.
+# Otherwise it should have the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
+_C.TEST.KEYPOINT_OKS_SIGMAS = []
+# Maximum number of detections to return per image during inference (100 is
+# based on the limit established for the COCO dataset).
+_C.TEST.DETECTIONS_PER_IMAGE = 100
+
+_C.TEST.AUG = CN({"ENABLED": False})
+_C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
+_C.TEST.AUG.MAX_SIZE = 4000
+_C.TEST.AUG.FLIP = True
+
+_C.TEST.PRECISE_BN = CN({"ENABLED": False})
+_C.TEST.PRECISE_BN.NUM_ITER = 200
+
+# ---------------------------------------------------------------------------- #
+# Misc options
+# ---------------------------------------------------------------------------- #
+# Directory where output files are written
+_C.OUTPUT_DIR = "./output"
+# Set seed to negative to fully randomize everything.
+# Set seed to positive to use a fixed seed. Note that a fixed seed increases
+# reproducibility but does not guarantee fully deterministic behavior.
+# Disabling all parallelism further increases reproducibility.
+_C.SEED = -1
+# Benchmark different cudnn algorithms.
+# If input images have very different sizes, this option will have large overhead
+# for about 10k iterations. It usually hurts total time, but can benefit for certain models.
+# If input images have the same or similar sizes, benchmark is often helpful.
+_C.CUDNN_BENCHMARK = False
+# The period (in terms of steps) for minibatch visualization at train time.
+# Set to 0 to disable.
+_C.VIS_PERIOD = 0
+
+# global config is for quick hack purposes.
+# You can set them in command line or config files,
+# and access it with:
+#
+# from detectron2.config import global_cfg
+# print(global_cfg.HACK)
+#
+# Do not commit any configs into it.
+_C.GLOBAL = CN()
+_C.GLOBAL.HACK = 1.0
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8f72e0f45d6d683771f0d815dfd0e3d0db52b9d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from . import transforms  # isort:skip
+
+from .build import (
+    build_detection_test_loader,
+    build_detection_train_loader,
+    get_detection_dataset_dicts,
+    load_proposals_into_dataset,
+    print_instances_class_histogram,
+)
+from .catalog import DatasetCatalog, MetadataCatalog
+from .common import DatasetFromList, MapDataset
+from .dataset_mapper import DatasetMapper
+
+# ensure the builtin data are registered
+from . import datasets, samplers  # isort:skip
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/build.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb7e85789d75daf4ee206449ce0d3254e948db16
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/build.py
@@ -0,0 +1,397 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import bisect
+import copy
+import itertools
+import logging
+import numpy as np
+import operator
+import pickle
+import torch.utils.data
+from fvcore.common.file_io import PathManager
+from tabulate import tabulate
+from termcolor import colored
+
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import get_world_size
+from detectron2.utils.env import seed_all_rng
+from detectron2.utils.logger import log_first_n
+
+from . import samplers
+from .catalog import DatasetCatalog, MetadataCatalog
+from .common import AspectRatioGroupedDataset, DatasetFromList, MapDataset
+from .dataset_mapper import DatasetMapper
+from .detection_utils import check_metadata_consistency
+
+"""
+This file contains the default logic to build a dataloader for training or testing.
+"""
+
+__all__ = [
+    "build_detection_train_loader",
+    "build_detection_test_loader",
+    "get_detection_dataset_dicts",
+    "load_proposals_into_dataset",
+    "print_instances_class_histogram",
+]
+
+
+def filter_images_with_only_crowd_annotations(dataset_dicts):
+    """
+    Filter out images with none annotations or only crowd annotations
+    (i.e., images without non-crowd annotations).
+    A common training-time preprocessing on COCO dataset.
+
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+
+    Returns:
+        list[dict]: the same format, but filtered.
+    """
+    num_before = len(dataset_dicts)
+
+    def valid(anns):
+        for ann in anns:
+            if ann.get("iscrowd", 0) == 0:
+                return True
+        return False
+
+    dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])]
+    num_after = len(dataset_dicts)
+    logger = logging.getLogger(__name__)
+    logger.info(
+        "Removed {} images with no usable annotations. {} images left.".format(
+            num_before - num_after, num_after
+        )
+    )
+    return dataset_dicts
+
+
+def filter_images_with_few_keypoints(dataset_dicts, min_keypoints_per_image):
+    """
+    Filter out images with too few number of keypoints.
+
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+
+    Returns:
+        list[dict]: the same format as dataset_dicts, but filtered.
+    """
+    num_before = len(dataset_dicts)
+
+    def visible_keypoints_in_image(dic):
+        # Each keypoints field has the format [x1, y1, v1, ...], where v is visibility
+        annotations = dic["annotations"]
+        return sum(
+            (np.array(ann["keypoints"][2::3]) > 0).sum()
+            for ann in annotations
+            if "keypoints" in ann
+        )
+
+    dataset_dicts = [
+        x for x in dataset_dicts if visible_keypoints_in_image(x) >= min_keypoints_per_image
+    ]
+    num_after = len(dataset_dicts)
+    logger = logging.getLogger(__name__)
+    logger.info(
+        "Removed {} images with fewer than {} keypoints.".format(
+            num_before - num_after, min_keypoints_per_image
+        )
+    )
+    return dataset_dicts
+
+
+def load_proposals_into_dataset(dataset_dicts, proposal_file):
+    """
+    Load precomputed object proposals into the dataset.
+
+    The proposal file should be a pickled dict with the following keys:
+
+    - "ids": list[int] or list[str], the image ids
+    - "boxes": list[np.ndarray], each is an Nx4 array of boxes corresponding to the image id
+    - "objectness_logits": list[np.ndarray], each is an N sized array of objectness scores
+      corresponding to the boxes.
+    - "bbox_mode": the BoxMode of the boxes array. Defaults to ``BoxMode.XYXY_ABS``.
+
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+        proposal_file (str): file path of pre-computed proposals, in pkl format.
+
+    Returns:
+        list[dict]: the same format as dataset_dicts, but added proposal field.
+    """
+    logger = logging.getLogger(__name__)
+    logger.info("Loading proposals from: {}".format(proposal_file))
+
+    with PathManager.open(proposal_file, "rb") as f:
+        proposals = pickle.load(f, encoding="latin1")
+
+    # Rename the key names in D1 proposal files
+    rename_keys = {"indexes": "ids", "scores": "objectness_logits"}
+    for key in rename_keys:
+        if key in proposals:
+            proposals[rename_keys[key]] = proposals.pop(key)
+
+    # Fetch the indexes of all proposals that are in the dataset
+    # Convert image_id to str since they could be int.
+    img_ids = set({str(record["image_id"]) for record in dataset_dicts})
+    id_to_index = {str(id): i for i, id in enumerate(proposals["ids"]) if str(id) in img_ids}
+
+    # Assuming default bbox_mode of precomputed proposals are 'XYXY_ABS'
+    bbox_mode = BoxMode(proposals["bbox_mode"]) if "bbox_mode" in proposals else BoxMode.XYXY_ABS
+
+    for record in dataset_dicts:
+        # Get the index of the proposal
+        i = id_to_index[str(record["image_id"])]
+
+        boxes = proposals["boxes"][i]
+        objectness_logits = proposals["objectness_logits"][i]
+        # Sort the proposals in descending order of the scores
+        inds = objectness_logits.argsort()[::-1]
+        record["proposal_boxes"] = boxes[inds]
+        record["proposal_objectness_logits"] = objectness_logits[inds]
+        record["proposal_bbox_mode"] = bbox_mode
+
+    return dataset_dicts
+
+
+def _quantize(x, bin_edges):
+    bin_edges = copy.copy(bin_edges)
+    bin_edges = sorted(bin_edges)
+    quantized = list(map(lambda y: bisect.bisect_right(bin_edges, y), x))
+    return quantized
+
+
+def print_instances_class_histogram(dataset_dicts, class_names):
+    """
+    Args:
+        dataset_dicts (list[dict]): list of dataset dicts.
+        class_names (list[str]): list of class names (zero-indexed).
+    """
+    num_classes = len(class_names)
+    hist_bins = np.arange(num_classes + 1)
+    histogram = np.zeros((num_classes,), dtype=np.int)
+    for entry in dataset_dicts:
+        annos = entry["annotations"]
+        classes = [x["category_id"] for x in annos if not x.get("iscrowd", 0)]
+        histogram += np.histogram(classes, bins=hist_bins)[0]
+
+    N_COLS = min(6, len(class_names) * 2)
+
+    def short_name(x):
+        # make long class names shorter. useful for lvis
+        if len(x) > 13:
+            return x[:11] + ".."
+        return x
+
+    data = list(
+        itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)])
+    )
+    total_num_instances = sum(data[1::2])
+    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
+    if num_classes > 1:
+        data.extend(["total", total_num_instances])
+    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
+    table = tabulate(
+        data,
+        headers=["category", "#instances"] * (N_COLS // 2),
+        tablefmt="pipe",
+        numalign="left",
+        stralign="center",
+    )
+    log_first_n(
+        logging.INFO,
+        "Distribution of instances among all {} categories:\n".format(num_classes)
+        + colored(table, "cyan"),
+        key="message",
+    )
+
+
+def get_detection_dataset_dicts(
+    dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None
+):
+    """
+    Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.
+
+    Args:
+        dataset_names (list[str]): a list of dataset names
+        filter_empty (bool): whether to filter out images without instance annotations
+        min_keypoints (int): filter out images with fewer keypoints than
+            `min_keypoints`. Set to 0 to do nothing.
+        proposal_files (list[str]): if given, a list of object proposal files
+            that match each dataset in `dataset_names`.
+    """
+    assert len(dataset_names)
+    dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names]
+    for dataset_name, dicts in zip(dataset_names, dataset_dicts):
+        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
+
+    if proposal_files is not None:
+        assert len(dataset_names) == len(proposal_files)
+        # load precomputed proposals from proposal files
+        dataset_dicts = [
+            load_proposals_into_dataset(dataset_i_dicts, proposal_file)
+            for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
+        ]
+
+    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
+
+    has_instances = "annotations" in dataset_dicts[0]
+    # Keep images without instance-level GT if the dataset has semantic labels.
+    if filter_empty and has_instances and "sem_seg_file_name" not in dataset_dicts[0]:
+        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
+
+    if min_keypoints > 0 and has_instances:
+        dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)
+
+    if has_instances:
+        try:
+            class_names = MetadataCatalog.get(dataset_names[0]).thing_classes
+            check_metadata_consistency("thing_classes", dataset_names)
+            print_instances_class_histogram(dataset_dicts, class_names)
+        except AttributeError:  # class names are not available for this dataset
+            pass
+    return dataset_dicts
+
+
+def build_detection_train_loader(cfg, mapper=None):
+    """
+    A data loader is created by the following steps:
+
+    1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
+    2. Coordinate a random shuffle order shared among all processes (all GPUs)
+    3. Each process spawn another few workers to process the dicts. Each worker will:
+       * Map each metadata dict into another format to be consumed by the model.
+       * Batch them by simply putting dicts into a list.
+
+    The batched ``list[mapped_dict]`` is what this dataloader will yield.
+
+    Args:
+        cfg (CfgNode): the config
+        mapper (callable): a callable which takes a sample (dict) from dataset and
+            returns the format to be consumed by the model.
+            By default it will be `DatasetMapper(cfg, True)`.
+
+    Returns:
+        an infinite iterator of training data
+    """
+    num_workers = get_world_size()
+    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
+    assert (
+        images_per_batch % num_workers == 0
+    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
+        images_per_batch, num_workers
+    )
+    assert (
+        images_per_batch >= num_workers
+    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
+        images_per_batch, num_workers
+    )
+    images_per_worker = images_per_batch // num_workers
+
+    dataset_dicts = get_detection_dataset_dicts(
+        cfg.DATASETS.TRAIN,
+        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
+        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
+        if cfg.MODEL.KEYPOINT_ON
+        else 0,
+        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
+    )
+    dataset = DatasetFromList(dataset_dicts, copy=False)
+
+    if mapper is None:
+        mapper = DatasetMapper(cfg, True)
+    dataset = MapDataset(dataset, mapper)
+
+    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
+    logger = logging.getLogger(__name__)
+    logger.info("Using training sampler {}".format(sampler_name))
+    if sampler_name == "TrainingSampler":
+        sampler = samplers.TrainingSampler(len(dataset))
+    elif sampler_name == "RepeatFactorTrainingSampler":
+        sampler = samplers.RepeatFactorTrainingSampler(
+            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
+        )
+    else:
+        raise ValueError("Unknown training sampler: {}".format(sampler_name))
+
+    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
+        data_loader = torch.utils.data.DataLoader(
+            dataset,
+            sampler=sampler,
+            num_workers=cfg.DATALOADER.NUM_WORKERS,
+            batch_sampler=None,
+            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
+            worker_init_fn=worker_init_reset_seed,
+        )  # yield individual mapped dict
+        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
+    else:
+        batch_sampler = torch.utils.data.sampler.BatchSampler(
+            sampler, images_per_worker, drop_last=True
+        )
+        # drop_last so the batch always have the same size
+        data_loader = torch.utils.data.DataLoader(
+            dataset,
+            num_workers=cfg.DATALOADER.NUM_WORKERS,
+            batch_sampler=batch_sampler,
+            collate_fn=trivial_batch_collator,
+            worker_init_fn=worker_init_reset_seed,
+        )
+
+    return data_loader
+
+
+def build_detection_test_loader(cfg, dataset_name, mapper=None):
+    """
+    Similar to `build_detection_train_loader`.
+    But this function uses the given `dataset_name` argument (instead of the names in cfg),
+    and uses batch size 1.
+
+    Args:
+        cfg: a detectron2 CfgNode
+        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
+        mapper (callable): a callable which takes a sample (dict) from dataset
+           and returns the format to be consumed by the model.
+           By default it will be `DatasetMapper(cfg, False)`.
+
+    Returns:
+        DataLoader: a torch DataLoader, that loads the given detection
+        dataset, with test-time transformation and batching.
+    """
+    dataset_dicts = get_detection_dataset_dicts(
+        [dataset_name],
+        filter_empty=False,
+        proposal_files=[
+            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)]
+        ]
+        if cfg.MODEL.LOAD_PROPOSALS
+        else None,
+    )
+
+    dataset = DatasetFromList(dataset_dicts)
+    if mapper is None:
+        mapper = DatasetMapper(cfg, False)
+    dataset = MapDataset(dataset, mapper)
+
+    sampler = samplers.InferenceSampler(len(dataset))
+    # Always use 1 image per worker during inference since this is the
+    # standard when reporting inference time in papers.
+    batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False)
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        num_workers=cfg.DATALOADER.NUM_WORKERS,
+        batch_sampler=batch_sampler,
+        collate_fn=trivial_batch_collator,
+    )
+    return data_loader
+
+
+def trivial_batch_collator(batch):
+    """
+    A batch collator that does nothing.
+    """
+    return batch
+
+
+def worker_init_reset_seed(worker_id):
+    seed_all_rng(np.random.randint(2 ** 31) + worker_id)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/catalog.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f18c8705363fdcc79182f0abd0b28d6b2dde8b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/catalog.py
@@ -0,0 +1,221 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import types
+from typing import List
+
+from detectron2.utils.logger import log_first_n
+
+__all__ = ["DatasetCatalog", "MetadataCatalog"]
+
+
+class DatasetCatalog(object):
+    """
+    A catalog that stores information about the data and how to obtain them.
+
+    It contains a mapping from strings
+    (which are names that identify a dataset, e.g. "coco_2014_train")
+    to a function which parses the dataset and returns the samples in the
+    format of `list[dict]`.
+
+    The returned dicts should be in Detectron2 Dataset format (See DATASETS.md for details)
+    if used with the data loader functionalities in `data/build.py,data/detection_transform.py`.
+
+    The purpose of having this catalog is to make it easy to choose
+    different data, by just using the strings in the config.
+    """
+
+    _REGISTERED = {}
+
+    @staticmethod
+    def register(name, func):
+        """
+        Args:
+            name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+            func (callable): a callable which takes no arguments and returns a list of dicts.
+        """
+        assert callable(func), "You must register a function with `DatasetCatalog.register`!"
+        assert name not in DatasetCatalog._REGISTERED, "Dataset '{}' is already registered!".format(
+            name
+        )
+        DatasetCatalog._REGISTERED[name] = func
+
+    @staticmethod
+    def get(name):
+        """
+        Call the registered function and return its results.
+
+        Args:
+            name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+
+        Returns:
+            list[dict]: dataset annotations.0
+        """
+        try:
+            f = DatasetCatalog._REGISTERED[name]
+        except KeyError:
+            raise KeyError(
+                "Dataset '{}' is not registered! Available data are: {}".format(
+                    name, ", ".join(DatasetCatalog._REGISTERED.keys())
+                )
+            )
+        return f()
+
+    @staticmethod
+    def list() -> List[str]:
+        """
+        List all registered data.
+
+        Returns:
+            list[str]
+        """
+        return list(DatasetCatalog._REGISTERED.keys())
+
+    @staticmethod
+    def clear():
+        """
+        Remove all registered dataset.
+        """
+        DatasetCatalog._REGISTERED.clear()
+
+
+class Metadata(types.SimpleNamespace):
+    """
+    A class that supports simple attribute setter/getter.
+    It is intended for storing metadata of a dataset and make it accessible globally.
+
+    Examples:
+
+    .. code-block:: python
+
+        # somewhere when you load the data:
+        MetadataCatalog.get("mydataset").thing_classes = ["person", "dog"]
+
+        # somewhere when you print statistics or visualize:
+        classes = MetadataCatalog.get("mydataset").thing_classes
+    """
+
+    # the name of the dataset
+    # set default to N/A so that `self.name` in the errors will not trigger getattr again
+    name: str = "N/A"
+
+    _RENAMED = {
+        "class_names": "thing_classes",
+        "dataset_id_to_contiguous_id": "thing_dataset_id_to_contiguous_id",
+        "stuff_class_names": "stuff_classes",
+    }
+
+    def __getattr__(self, key):
+        if key in self._RENAMED:
+            log_first_n(
+                logging.WARNING,
+                "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+                n=10,
+            )
+            return getattr(self, self._RENAMED[key])
+
+        raise AttributeError(
+            "Attribute '{}' does not exist in the metadata of '{}'. Available keys are {}.".format(
+                key, self.name, str(self.__dict__.keys())
+            )
+        )
+
+    def __setattr__(self, key, val):
+        if key in self._RENAMED:
+            log_first_n(
+                logging.WARNING,
+                "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+                n=10,
+            )
+            setattr(self, self._RENAMED[key], val)
+
+        # Ensure that metadata of the same name stays consistent
+        try:
+            oldval = getattr(self, key)
+            assert oldval == val, (
+                "Attribute '{}' in the metadata of '{}' cannot be set "
+                "to a different value!\n{} != {}".format(key, self.name, oldval, val)
+            )
+        except AttributeError:
+            super().__setattr__(key, val)
+
+    def as_dict(self):
+        """
+        Returns all the metadata as a dict.
+        Note that modifications to the returned dict will not reflect on the Metadata object.
+        """
+        return copy.copy(self.__dict__)
+
+    def set(self, **kwargs):
+        """
+        Set multiple metadata with kwargs.
+        """
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        return self
+
+    def get(self, key, default=None):
+        """
+        Access an attribute and return its value if exists.
+        Otherwise return default.
+        """
+        try:
+            return getattr(self, key)
+        except AttributeError:
+            return default
+
+
+class MetadataCatalog:
+    """
+    MetadataCatalog provides access to "Metadata" of a given dataset.
+
+    The metadata associated with a certain name is a singleton: once created,
+    the metadata will stay alive and will be returned by future calls to `get(name)`.
+
+    It's like global variables, so don't abuse it.
+    It's meant for storing knowledge that's constant and shared across the execution
+    of the program, e.g.: the class names in COCO.
+    """
+
+    _NAME_TO_META = {}
+
+    @staticmethod
+    def get(name):
+        """
+        Args:
+            name (str): name of a dataset (e.g. coco_2014_train).
+
+        Returns:
+            Metadata: The :class:`Metadata` instance associated with this name,
+            or create an empty one if none is available.
+        """
+        assert len(name)
+        if name in MetadataCatalog._NAME_TO_META:
+            ret = MetadataCatalog._NAME_TO_META[name]
+            # TODO this is for the BC breaking change in D15247032.
+            # Remove this in the future.
+            if hasattr(ret, "dataset_name"):
+                logger = logging.getLogger()
+                logger.warning(
+                    """
+The 'dataset_name' key in metadata is no longer used for
+sharing metadata among splits after D15247032! Add
+metadata to each split (now called dataset) separately!
+                    """
+                )
+                parent_meta = MetadataCatalog.get(ret.dataset_name).as_dict()
+                ret.set(**parent_meta)
+            return ret
+        else:
+            m = MetadataCatalog._NAME_TO_META[name] = Metadata(name=name)
+            return m
+
+    @staticmethod
+    def list():
+        """
+        List all registered metadata.
+
+        Returns:
+            list[str]: keys (names of data) of all registered metadata
+        """
+        return list(MetadataCatalog._NAME_TO_META.keys())
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/common.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..a42c8b21b86338a3f034d01c3484dd32b1b845a9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/common.py
@@ -0,0 +1,149 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import numpy as np
+import pickle
+import random
+import torch.utils.data as data
+
+from detectron2.utils.serialize import PicklableWrapper
+
+__all__ = ["MapDataset", "DatasetFromList", "AspectRatioGroupedDataset"]
+
+
+class MapDataset(data.Dataset):
+    """
+    Map a function over the elements in a dataset.
+
+    Args:
+        dataset: a dataset where map function is applied.
+        map_func: a callable which maps the element in dataset. map_func is
+            responsible for error handling, when error happens, it needs to
+            return None so the MapDataset will randomly use other
+            elements from the dataset.
+    """
+
+    def __init__(self, dataset, map_func):
+        self._dataset = dataset
+        self._map_func = PicklableWrapper(map_func)  # wrap so that a lambda will work
+
+        self._rng = random.Random(42)
+        self._fallback_candidates = set(range(len(dataset)))
+
+    def __len__(self):
+        return len(self._dataset)
+
+    def __getitem__(self, idx):
+        retry_count = 0
+        cur_idx = int(idx)
+
+        while True:
+            data = self._map_func(self._dataset[cur_idx])
+            if data is not None:
+                self._fallback_candidates.add(cur_idx)
+                return data
+
+            # _map_func fails for this idx, use a random new index from the pool
+            retry_count += 1
+            self._fallback_candidates.discard(cur_idx)
+            cur_idx = self._rng.sample(self._fallback_candidates, k=1)[0]
+
+            if retry_count >= 3:
+                logger = logging.getLogger(__name__)
+                logger.warning(
+                    "Failed to apply `_map_func` for idx: {}, retry count: {}".format(
+                        idx, retry_count
+                    )
+                )
+
+
+class DatasetFromList(data.Dataset):
+    """
+    Wrap a list to a torch Dataset. It produces elements of the list as data.
+    """
+
+    def __init__(self, lst: list, copy: bool = True, serialize: bool = True):
+        """
+        Args:
+            lst (list): a list which contains elements to produce.
+            copy (bool): whether to deepcopy the element when producing it,
+                so that the result can be modified in place without affecting the
+                source in the list.
+            serialize (bool): whether to hold memory using serialized objects, when
+                enabled, data loader workers can use shared RAM from master
+                process instead of making a copy.
+        """
+        self._lst = lst
+        self._copy = copy
+        self._serialize = serialize
+
+        def _serialize(data):
+            buffer = pickle.dumps(data, protocol=-1)
+            return np.frombuffer(buffer, dtype=np.uint8)
+
+        if self._serialize:
+            logger = logging.getLogger(__name__)
+            logger.info(
+                "Serializing {} elements to byte tensors and concatenating them all ...".format(
+                    len(self._lst)
+                )
+            )
+            self._lst = [_serialize(x) for x in self._lst]
+            self._addr = np.asarray([len(x) for x in self._lst], dtype=np.int64)
+            self._addr = np.cumsum(self._addr)
+            self._lst = np.concatenate(self._lst)
+            logger.info("Serialized dataset takes {:.2f} MiB".format(len(self._lst) / 1024 ** 2))
+
+    def __len__(self):
+        if self._serialize:
+            return len(self._addr)
+        else:
+            return len(self._lst)
+
+    def __getitem__(self, idx):
+        if self._serialize:
+            start_addr = 0 if idx == 0 else self._addr[idx - 1].item()
+            end_addr = self._addr[idx].item()
+            bytes = memoryview(self._lst[start_addr:end_addr])
+            return pickle.loads(bytes)
+        elif self._copy:
+            return copy.deepcopy(self._lst[idx])
+        else:
+            return self._lst[idx]
+
+
+class AspectRatioGroupedDataset(data.IterableDataset):
+    """
+    Batch data that have similar aspect ratio together.
+    In this implementation, images whose aspect ratio < (or >) 1 will
+    be batched together.
+    This improves training speed because the images then need less padding
+    to form a batch.
+
+    It assumes the underlying dataset produces dicts with "width" and "height" keys.
+    It will then produce a list of original dicts with length = batch_size,
+    all with similar aspect ratios.
+    """
+
+    def __init__(self, dataset, batch_size):
+        """
+        Args:
+            dataset: an iterable. Each element must be a dict with keys
+                "width" and "height", which will be used to batch data.
+            batch_size (int):
+        """
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self._buckets = [[] for _ in range(2)]
+        # Hard-coded two aspect ratio groups: w > h and w < h.
+        # Can add support for more aspect ratio groups, but doesn't seem useful
+
+    def __iter__(self):
+        for d in self.dataset:
+            w, h = d["width"], d["height"]
+            bucket_id = 0 if w > h else 1
+            bucket = self._buckets[bucket_id]
+            bucket.append(d)
+            if len(bucket) == self.batch_size:
+                yield bucket[:]
+                del bucket[:]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/dataset_mapper.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/dataset_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..db73b378a6c2938a3beb700010a13172e6cc549f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/dataset_mapper.py
@@ -0,0 +1,149 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import numpy as np
+import torch
+from fvcore.common.file_io import PathManager
+from PIL import Image
+
+from . import detection_utils as utils
+from . import transforms as T
+
+"""
+This file contains the default mapping that's applied to "dataset dicts".
+"""
+
+__all__ = ["DatasetMapper"]
+
+
+class DatasetMapper:
+    """
+    A callable which takes a dataset dict in Detectron2 Dataset format,
+    and map it into a format used by the model.
+
+    This is the default callable to be used to map your dataset dict into training data.
+    You may need to follow it to implement your own one for customized logic,
+    such as a different way to read or transform images.
+    See :doc:`/tutorials/data_loading` for details.
+
+    The callable currently does the following:
+
+    1. Read the image from "file_name"
+    2. Applies cropping/geometric transforms to the image and annotations
+    3. Prepare data and annotations to Tensor and :class:`Instances`
+    """
+
+    def __init__(self, cfg, is_train=True):
+        if cfg.INPUT.CROP.ENABLED and is_train:
+            self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)
+            logging.getLogger(__name__).info("CropGen used in training: " + str(self.crop_gen))
+        else:
+            self.crop_gen = None
+
+        self.tfm_gens = utils.build_transform_gen(cfg, is_train)
+
+        # fmt: off
+        self.img_format     = cfg.INPUT.FORMAT
+        self.mask_on        = cfg.MODEL.MASK_ON
+        self.mask_format    = cfg.INPUT.MASK_FORMAT
+        self.keypoint_on    = cfg.MODEL.KEYPOINT_ON
+        self.load_proposals = cfg.MODEL.LOAD_PROPOSALS
+        # fmt: on
+        if self.keypoint_on and is_train:
+            # Flip only makes sense in training
+            self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
+        else:
+            self.keypoint_hflip_indices = None
+
+        if self.load_proposals:
+            self.min_box_side_len = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE
+            self.proposal_topk = (
+                cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
+                if is_train
+                else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
+            )
+        self.is_train = is_train
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        # USER: Write your own image loading if it's not from a file
+        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
+        utils.check_image_size(dataset_dict, image)
+
+        if "annotations" not in dataset_dict:
+            image, transforms = T.apply_transform_gens(
+                ([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image
+            )
+        else:
+            # Crop around an instance if there are instances in the image.
+            # USER: Remove if you don't use cropping
+            if self.crop_gen:
+                crop_tfm = utils.gen_crop_transform_with_instance(
+                    self.crop_gen.get_crop_size(image.shape[:2]),
+                    image.shape[:2],
+                    np.random.choice(dataset_dict["annotations"]),
+                )
+                image = crop_tfm.apply_image(image)
+            image, transforms = T.apply_transform_gens(self.tfm_gens, image)
+            if self.crop_gen:
+                transforms = crop_tfm + transforms
+
+        image_shape = image.shape[:2]  # h, w
+
+        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
+        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
+        # Therefore it's important to use torch.Tensor.
+        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
+
+        # USER: Remove if you don't use pre-computed proposals.
+        if self.load_proposals:
+            utils.transform_proposals(
+                dataset_dict, image_shape, transforms, self.min_box_side_len, self.proposal_topk
+            )
+
+        if not self.is_train:
+            # USER: Modify this if you want to keep them for some reason.
+            dataset_dict.pop("annotations", None)
+            dataset_dict.pop("sem_seg_file_name", None)
+            return dataset_dict
+
+        if "annotations" in dataset_dict:
+            # USER: Modify this if you want to keep them for some reason.
+            for anno in dataset_dict["annotations"]:
+                if not self.mask_on:
+                    anno.pop("segmentation", None)
+                if not self.keypoint_on:
+                    anno.pop("keypoints", None)
+
+            # USER: Implement additional transformations if you have other types of data
+            annos = [
+                utils.transform_instance_annotations(
+                    obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+                )
+                for obj in dataset_dict.pop("annotations")
+                if obj.get("iscrowd", 0) == 0
+            ]
+            instances = utils.annotations_to_instances(
+                annos, image_shape, mask_format=self.mask_format
+            )
+            # Create a tight bounding box from masks, useful when image is cropped
+            if self.crop_gen and instances.has("gt_masks"):
+                instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
+            dataset_dict["instances"] = utils.filter_empty_instances(instances)
+
+        # USER: Remove if you don't do semantic/panoptic segmentation.
+        if "sem_seg_file_name" in dataset_dict:
+            with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f:
+                sem_seg_gt = Image.open(f)
+                sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8")
+            sem_seg_gt = transforms.apply_segmentation(sem_seg_gt)
+            sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
+            dataset_dict["sem_seg"] = sem_seg_gt
+        return dataset_dict
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/README.md b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9fb3e4f7afec17137c95c78be6ef06d520ec8032
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/README.md
@@ -0,0 +1,9 @@
+
+
+### Common Datasets
+
+The dataset implemented here do not need to load the data into the final format.
+It should provide the minimal data structure needed to use the dataset, so it can be very efficient.
+
+For example, for an image dataset, just provide the file names and labels, but don't read the images.
+Let the downstream decide how to read.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c3f556bd201890fcca901d26efb5f9d8c3304f5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .cityscapes import load_cityscapes_instances
+from .coco import load_coco_json, load_sem_seg
+from .lvis import load_lvis_json, register_lvis_instances, get_lvis_instances_meta
+from .register_coco import register_coco_instances, register_coco_panoptic_separated
+from . import builtin  # ensure the builtin data are registered
+
+
+__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/builtin.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..21ac2228c56d59b38c9288fd720aab5fdc63ac0b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/builtin.py
@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+"""
+This file registers pre-defined data at hard-coded paths, and their metadata.
+
+We hard-code metadata for common data. This will enable:
+1. Consistency check when loading the data
+2. Use models on these standard data directly and run demos,
+   without having to download the dataset annotations
+
+We hard-code some paths to the dataset that's assumed to
+exist in "./data/".
+
+Users SHOULD NOT use this file to create new dataset / metadata for new dataset.
+To add new dataset, refer to the tutorial "docs/DATASETS.md".
+"""
+
+import os
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+
+from .builtin_meta import _get_builtin_metadata
+from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic
+from .lvis import get_lvis_instances_meta, register_lvis_instances
+from .pascal_voc import register_pascal_voc
+from .register_coco import register_coco_instances, register_coco_panoptic_separated
+
+# ==== Predefined data and splits for COCO ==========
+
+_PREDEFINED_SPLITS_COCO = {}
+_PREDEFINED_SPLITS_COCO["coco"] = {
+    "coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"),
+    "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"),
+    "coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"),
+    "coco_2014_minival_100": ("coco/val2014", "coco/annotations/instances_minival2014_100.json"),
+    "coco_2014_valminusminival": (
+        "coco/val2014",
+        "coco/annotations/instances_valminusminival2014.json",
+    ),
+    "coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"),
+    "coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"),
+    "coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"),
+    "coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"),
+    "coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"),
+}
+
+_PREDEFINED_SPLITS_COCO["coco_person"] = {
+    "keypoints_coco_2014_train": (
+        "coco/train2014",
+        "coco/annotations/person_keypoints_train2014.json",
+    ),
+    "keypoints_coco_2014_val": ("coco/val2014", "coco/annotations/person_keypoints_val2014.json"),
+    "keypoints_coco_2014_minival": (
+        "coco/val2014",
+        "coco/annotations/person_keypoints_minival2014.json",
+    ),
+    "keypoints_coco_2014_valminusminival": (
+        "coco/val2014",
+        "coco/annotations/person_keypoints_valminusminival2014.json",
+    ),
+    "keypoints_coco_2014_minival_100": (
+        "coco/val2014",
+        "coco/annotations/person_keypoints_minival2014_100.json",
+    ),
+    "keypoints_coco_2017_train": (
+        "coco/train2017",
+        "coco/annotations/person_keypoints_train2017.json",
+    ),
+    "keypoints_coco_2017_val": ("coco/val2017", "coco/annotations/person_keypoints_val2017.json"),
+    "keypoints_coco_2017_val_100": (
+        "coco/val2017",
+        "coco/annotations/person_keypoints_val2017_100.json",
+    ),
+}
+
+
+_PREDEFINED_SPLITS_COCO_PANOPTIC = {
+    "coco_2017_train_panoptic": (
+        # This is the original panoptic annotation directory
+        "coco/panoptic_train2017",
+        "coco/annotations/panoptic_train2017.json",
+        # This directory contains semantic annotations that are
+        # converted from panoptic annotations.
+        # It is used by PanopticFPN.
+        # You can use the script at detectron2/data/prepare_panoptic_fpn.py
+        # to create these directories.
+        "coco/panoptic_stuff_train2017",
+    ),
+    "coco_2017_val_panoptic": (
+        "coco/panoptic_val2017",
+        "coco/annotations/panoptic_val2017.json",
+        "coco/panoptic_stuff_val2017",
+    ),
+    "coco_2017_val_100_panoptic": (
+        "coco/panoptic_val2017_100",
+        "coco/annotations/panoptic_val2017_100.json",
+        "coco/panoptic_stuff_val2017_100",
+    ),
+}
+
+
+def register_all_coco(root):
+    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items():
+        for key, (image_root, json_file) in splits_per_dataset.items():
+            # Assume pre-defined data live in `./data`.
+            register_coco_instances(
+                key,
+                _get_builtin_metadata(dataset_name),
+                os.path.join(root, json_file) if "://" not in json_file else json_file,
+                os.path.join(root, image_root),
+            )
+
+    for (
+        prefix,
+        (panoptic_root, panoptic_json, semantic_root),
+    ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
+        prefix_instances = prefix[: -len("_panoptic")]
+        instances_meta = MetadataCatalog.get(prefix_instances)
+        image_root, instances_json = instances_meta.image_root, instances_meta.json_file
+        register_coco_panoptic_separated(
+            prefix,
+            _get_builtin_metadata("coco_panoptic_separated"),
+            image_root,
+            os.path.join(root, panoptic_root),
+            os.path.join(root, panoptic_json),
+            os.path.join(root, semantic_root),
+            instances_json,
+        )
+
+
+# ==== Predefined data and splits for LVIS ==========
+
+
+_PREDEFINED_SPLITS_LVIS = {
+    "lvis_v0.5": {
+        "lvis_v0.5_train": ("coco/train2017", "lvis/lvis_v0.5_train.json"),
+        "lvis_v0.5_val": ("coco/val2017", "lvis/lvis_v0.5_val.json"),
+        "lvis_v0.5_val_rand_100": ("coco/val2017", "lvis/lvis_v0.5_val_rand_100.json"),
+        "lvis_v0.5_test": ("coco/test2017", "lvis/lvis_v0.5_image_info_test.json"),
+    },
+    "lvis_v0.5_cocofied": {
+        "lvis_v0.5_train_cocofied": ("coco/train2017", "lvis/lvis_v0.5_train_cocofied.json"),
+        "lvis_v0.5_val_cocofied": ("coco/val2017", "lvis/lvis_v0.5_val_cocofied.json"),
+    },
+}
+
+
+def register_all_lvis(root):
+    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items():
+        for key, (image_root, json_file) in splits_per_dataset.items():
+            # Assume pre-defined data live in `./data`.
+            register_lvis_instances(
+                key,
+                get_lvis_instances_meta(dataset_name),
+                os.path.join(root, json_file) if "://" not in json_file else json_file,
+                os.path.join(root, image_root),
+            )
+
+
+# ==== Predefined splits for raw cityscapes images ===========
+
+
+_RAW_CITYSCAPES_SPLITS = {
+    "cityscapes_fine_{task}_train": ("cityscapes/leftImg8bit/train", "cityscapes/gtFine/train"),
+    "cityscapes_fine_{task}_val": ("cityscapes/leftImg8bit/val", "cityscapes/gtFine/val"),
+    "cityscapes_fine_{task}_test": ("cityscapes/leftImg8bit/test", "cityscapes/gtFine/test"),
+}
+
+
+def register_all_cityscapes(root):
+    for key, (image_dir, gt_dir) in _RAW_CITYSCAPES_SPLITS.items():
+        meta = _get_builtin_metadata("cityscapes")
+        image_dir = os.path.join(root, image_dir)
+        gt_dir = os.path.join(root, gt_dir)
+
+        inst_key = key.format(task="instance_seg")
+        DatasetCatalog.register(
+            inst_key,
+            lambda x=image_dir, y=gt_dir: load_cityscapes_instances(
+                x, y, from_json=True, to_polygons=True
+            ),
+        )
+        MetadataCatalog.get(inst_key).set(
+            image_dir=image_dir, gt_dir=gt_dir, evaluator_type="cityscapes_instance", **meta
+        )
+
+        sem_key = key.format(task="sem_seg")
+        DatasetCatalog.register(
+            sem_key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y)
+        )
+        MetadataCatalog.get(sem_key).set(
+            image_dir=image_dir, gt_dir=gt_dir, evaluator_type="cityscapes_sem_seg", **meta
+        )
+
+
+# ==== Predefined splits for PASCAL VOC ===========
+def register_all_pascal_voc(root):
+    SPLITS = [
+        ("voc_2007_trainval", "VOC2007", "trainval"),
+        ("voc_2007_train", "VOC2007", "train"),
+        ("voc_2007_val", "VOC2007", "val"),
+        ("voc_2007_test", "VOC2007", "test"),
+        ("voc_2012_trainval", "VOC2012", "trainval"),
+        ("voc_2012_train", "VOC2012", "train"),
+        ("voc_2012_val", "VOC2012", "val"),
+    ]
+    for name, dirname, split in SPLITS:
+        year = 2007 if "2007" in name else 2012
+        register_pascal_voc(name, os.path.join(root, dirname), split, year)
+        MetadataCatalog.get(name).evaluator_type = "pascal_voc"
+
+
+# Register them all under "./data"
+_root = os.getenv("DETECTRON2_DATASETS", "data")
+register_all_coco(_root)
+register_all_lvis(_root)
+register_all_cityscapes(_root)
+register_all_pascal_voc(_root)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/builtin_meta.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/builtin_meta.py
new file mode 100644
index 0000000000000000000000000000000000000000..74c79863a9d1ef5df9b5ce64f97d6be8e4e37d59
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/builtin_meta.py
@@ -0,0 +1,267 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+# All coco categories, together with their nice-looking visualization colors
+# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json
+COCO_CATEGORIES = [
+    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
+    {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
+    {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
+    {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
+    {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
+    {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
+    {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
+    {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
+    {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
+    {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
+    {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
+    {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
+    {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
+    {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
+    {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
+    {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
+    {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
+    {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
+    {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
+    {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
+    {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
+    {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
+    {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
+    {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
+    {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
+    {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
+    {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
+    {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
+    {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
+    {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
+    {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
+    {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
+    {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
+    {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
+    {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
+    {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
+    {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
+    {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
+    {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
+    {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
+    {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
+    {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
+    {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
+    {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
+    {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
+    {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
+    {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
+    {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
+    {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
+    {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
+    {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
+    {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
+    {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
+    {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
+    {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
+    {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
+    {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
+    {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
+    {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
+    {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
+    {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
+    {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
+    {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
+    {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
+    {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
+    {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
+    {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
+    {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
+    {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
+    {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
+    {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
+    {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
+    {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
+    {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
+    {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
+    {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
+    {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
+    {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
+    {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
+    {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
+    {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"},
+    {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"},
+    {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"},
+    {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"},
+    {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"},
+    {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"},
+    {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"},
+    {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"},
+    {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"},
+    {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"},
+    {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"},
+    {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"},
+    {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"},
+    {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"},
+    {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"},
+    {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"},
+    {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"},
+    {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"},
+    {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"},
+    {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"},
+    {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"},
+    {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"},
+    {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"},
+    {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"},
+    {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"},
+    {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"},
+    {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"},
+    {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"},
+    {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"},
+    {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"},
+    {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"},
+    {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"},
+    {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"},
+    {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"},
+    {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"},
+    {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"},
+    {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"},
+    {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"},
+    {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"},
+    {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"},
+    {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"},
+    {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"},
+    {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"},
+    {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"},
+    {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"},
+    {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"},
+    {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"},
+    {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"},
+    {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"},
+    {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"},
+    {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"},
+    {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"},
+    {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},
+]
+
+# fmt: off
+COCO_PERSON_KEYPOINT_NAMES = (
+    "nose",
+    "left_eye", "right_eye",
+    "left_ear", "right_ear",
+    "left_shoulder", "right_shoulder",
+    "left_elbow", "right_elbow",
+    "left_wrist", "right_wrist",
+    "left_hip", "right_hip",
+    "left_knee", "right_knee",
+    "left_ankle", "right_ankle",
+)
+# fmt: on
+
+# Pairs of keypoints that should be exchanged under horizontal flipping
+COCO_PERSON_KEYPOINT_FLIP_MAP = (
+    ("left_eye", "right_eye"),
+    ("left_ear", "right_ear"),
+    ("left_shoulder", "right_shoulder"),
+    ("left_elbow", "right_elbow"),
+    ("left_wrist", "right_wrist"),
+    ("left_hip", "right_hip"),
+    ("left_knee", "right_knee"),
+    ("left_ankle", "right_ankle"),
+)
+
+# rules for pairs of keypoints to draw a line between, and the line color to use.
+KEYPOINT_CONNECTION_RULES = [
+    # face
+    ("left_ear", "left_eye", (102, 204, 255)),
+    ("right_ear", "right_eye", (51, 153, 255)),
+    ("left_eye", "nose", (102, 0, 204)),
+    ("nose", "right_eye", (51, 102, 255)),
+    # upper-body
+    ("left_shoulder", "right_shoulder", (255, 128, 0)),
+    ("left_shoulder", "left_elbow", (153, 255, 204)),
+    ("right_shoulder", "right_elbow", (128, 229, 255)),
+    ("left_elbow", "left_wrist", (153, 255, 153)),
+    ("right_elbow", "right_wrist", (102, 255, 224)),
+    # lower-body
+    ("left_hip", "right_hip", (255, 102, 0)),
+    ("left_hip", "left_knee", (255, 255, 77)),
+    ("right_hip", "right_knee", (153, 255, 204)),
+    ("left_knee", "left_ankle", (191, 255, 128)),
+    ("right_knee", "right_ankle", (255, 195, 77)),
+]
+
+
+def _get_coco_instances_meta():
+    thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+    thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+    assert len(thing_ids) == 80, len(thing_ids)
+    # Mapping from the incontiguous COCO category id to an id in [0, 79]
+    thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
+    thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
+    ret = {
+        "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
+        "thing_classes": thing_classes,
+        "thing_colors": thing_colors,
+    }
+    return ret
+
+
+def _get_coco_panoptic_separated_meta():
+    """
+    Returns metadata for "separated" version of the panoptic segmentation dataset.
+    """
+    stuff_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 0]
+    assert len(stuff_ids) == 53, len(stuff_ids)
+
+    # For semantic segmentation, this mapping maps from contiguous stuff id
+    # (in [0, 53], used in models) to ids in the dataset (used for processing results)
+    # The id 0 is mapped to an extra category "thing".
+    stuff_dataset_id_to_contiguous_id = {k: i + 1 for i, k in enumerate(stuff_ids)}
+    # When converting COCO panoptic annotations to semantic annotations
+    # We label the "thing" category to 0
+    stuff_dataset_id_to_contiguous_id[0] = 0
+
+    # 54 names for COCO stuff categories (including "things")
+    stuff_classes = ["things"] + [
+        k["name"].replace("-other", "").replace("-merged", "")
+        for k in COCO_CATEGORIES
+        if k["isthing"] == 0
+    ]
+
+    # NOTE: I randomly picked a color for things
+    stuff_colors = [[82, 18, 128]] + [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 0]
+    ret = {
+        "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
+        "stuff_classes": stuff_classes,
+        "stuff_colors": stuff_colors,
+    }
+    ret.update(_get_coco_instances_meta())
+    return ret
+
+
+def _get_builtin_metadata(dataset_name):
+    if dataset_name == "coco":
+        return _get_coco_instances_meta()
+    if dataset_name == "coco_panoptic_separated":
+        return _get_coco_panoptic_separated_meta()
+    elif dataset_name == "coco_person":
+        return {
+            "thing_classes": ["person"],
+            "keypoint_names": COCO_PERSON_KEYPOINT_NAMES,
+            "keypoint_flip_map": COCO_PERSON_KEYPOINT_FLIP_MAP,
+            "keypoint_connection_rules": KEYPOINT_CONNECTION_RULES,
+        }
+    elif dataset_name == "cityscapes":
+        # fmt: off
+        CITYSCAPES_THING_CLASSES = [
+            "person", "rider", "car", "truck",
+            "bus", "train", "motorcycle", "bicycle",
+        ]
+        CITYSCAPES_STUFF_CLASSES = [
+            "road", "sidewalk", "building", "wall", "fence", "pole", "traffic light",
+            "traffic sign", "vegetation", "terrain", "sky", "person", "rider", "car",
+            "truck", "bus", "train", "motorcycle", "bicycle", "license plate",
+        ]
+        # fmt: on
+        return {
+            "thing_classes": CITYSCAPES_THING_CLASSES,
+            "stuff_classes": CITYSCAPES_STUFF_CLASSES,
+        }
+    raise KeyError("No built-in metadata for dataset {}".format(dataset_name))
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/cityscapes.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..062a555b959582eca525087ffc9859d298e926b8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/cityscapes.py
@@ -0,0 +1,329 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import functools
+import json
+import logging
+import multiprocessing as mp
+import numpy as np
+import os
+from itertools import chain
+import pycocotools.mask as mask_util
+from fvcore.common.file_io import PathManager
+from PIL import Image
+
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import get_world_size
+from detectron2.utils.logger import setup_logger
+
+try:
+    import cv2  # noqa
+except ImportError:
+    # OpenCV is an optional dependency at the moment
+    pass
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_cityscapes_files(image_dir, gt_dir):
+    files = []
+    # scan through the directory
+    cities = PathManager.ls(image_dir)
+    logger.info(f"{len(cities)} cities found in '{image_dir}'.")
+    for city in cities:
+        city_img_dir = os.path.join(image_dir, city)
+        city_gt_dir = os.path.join(gt_dir, city)
+        for basename in PathManager.ls(city_img_dir):
+            image_file = os.path.join(city_img_dir, basename)
+
+            suffix = "leftImg8bit.png"
+            assert basename.endswith(suffix)
+            basename = basename[: -len(suffix)]
+
+            instance_file = os.path.join(city_gt_dir, basename + "gtFine_instanceIds.png")
+            label_file = os.path.join(city_gt_dir, basename + "gtFine_labelIds.png")
+            json_file = os.path.join(city_gt_dir, basename + "gtFine_polygons.json")
+
+            files.append((image_file, instance_file, label_file, json_file))
+    assert len(files), "No images found in {}".format(image_dir)
+    for f in files[0]:
+        assert PathManager.isfile(f), f
+    return files
+
+
+def load_cityscapes_instances(image_dir, gt_dir, from_json=True, to_polygons=True):
+    """
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+        gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
+        from_json (bool): whether to read annotations from the raw json file or the png files.
+        to_polygons (bool): whether to represent the segmentation as polygons
+            (COCO's format) instead of masks (cityscapes's format).
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/data.html>`_ )
+    """
+    if from_json:
+        assert to_polygons, (
+            "Cityscapes's json annotations are in polygon format. "
+            "Converting to mask format is not supported now."
+        )
+    files = get_cityscapes_files(image_dir, gt_dir)
+
+    logger.info("Preprocessing cityscapes annotations ...")
+    # This is still not fast: all workers will execute duplicate works and will
+    # take up to 10m on a 8GPU server.
+    pool = mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4))
+
+    ret = pool.map(
+        functools.partial(cityscapes_files_to_dict, from_json=from_json, to_polygons=to_polygons),
+        files,
+    )
+    logger.info("Loaded {} images from {}".format(len(ret), image_dir))
+
+    # Map cityscape ids to contiguous ids
+    from cityscapesscripts.helpers.labels import labels
+
+    labels = [l for l in labels if l.hasInstances and not l.ignoreInEval]
+    dataset_id_to_contiguous_id = {l.id: idx for idx, l in enumerate(labels)}
+    for dict_per_image in ret:
+        for anno in dict_per_image["annotations"]:
+            anno["category_id"] = dataset_id_to_contiguous_id[anno["category_id"]]
+    return ret
+
+
+def load_cityscapes_semantic(image_dir, gt_dir):
+    """
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+        gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
+
+    Returns:
+        list[dict]: a list of dict, each has "file_name" and
+            "sem_seg_file_name".
+    """
+    ret = []
+    # gt_dir is small and contain many small files. make sense to fetch to local first
+    gt_dir = PathManager.get_local_path(gt_dir)
+    for image_file, _, label_file, json_file in get_cityscapes_files(image_dir, gt_dir):
+        label_file = label_file.replace("labelIds", "labelTrainIds")
+
+        with PathManager.open(json_file, "r") as f:
+            jsonobj = json.load(f)
+        ret.append(
+            {
+                "file_name": image_file,
+                "sem_seg_file_name": label_file,
+                "height": jsonobj["imgHeight"],
+                "width": jsonobj["imgWidth"],
+            }
+        )
+    assert len(ret), f"No images found in {image_dir}!"
+    assert PathManager.isfile(
+        ret[0]["sem_seg_file_name"]
+    ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py"  # noqa
+    return ret
+
+
+def cityscapes_files_to_dict(files, from_json, to_polygons):
+    """
+    Parse cityscapes annotation files to a instance segmentation dataset dict.
+
+    Args:
+        files (tuple): consists of (image_file, instance_id_file, label_id_file, json_file)
+        from_json (bool): whether to read annotations from the raw json file or the png files.
+        to_polygons (bool): whether to represent the segmentation as polygons
+            (COCO's format) instead of masks (cityscapes's format).
+
+    Returns:
+        A dict in Detectron2 Dataset format.
+    """
+    from cityscapesscripts.helpers.labels import id2label, name2label
+
+    image_file, instance_id_file, _, json_file = files
+
+    annos = []
+
+    if from_json:
+        from shapely.geometry import MultiPolygon, Polygon
+
+        with PathManager.open(json_file, "r") as f:
+            jsonobj = json.load(f)
+        ret = {
+            "file_name": image_file,
+            "image_id": os.path.basename(image_file),
+            "height": jsonobj["imgHeight"],
+            "width": jsonobj["imgWidth"],
+        }
+
+        # `polygons_union` contains the union of all valid polygons.
+        polygons_union = Polygon()
+
+        # CityscapesScripts draw the polygons in sequential order
+        # and each polygon *overwrites* existing ones. See
+        # (https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/preparation/json2instanceImg.py) # noqa
+        # We use reverse order, and each polygon *avoids* early ones.
+        # This will resolve the ploygon overlaps in the same way as CityscapesScripts.
+        for obj in jsonobj["objects"][::-1]:
+            if "deleted" in obj:  # cityscapes data format specific
+                continue
+            label_name = obj["label"]
+
+            try:
+                label = name2label[label_name]
+            except KeyError:
+                if label_name.endswith("group"):  # crowd area
+                    label = name2label[label_name[: -len("group")]]
+                else:
+                    raise
+            if label.id < 0:  # cityscapes data format
+                continue
+
+            # Cityscapes's raw annotations uses integer coordinates
+            # Therefore +0.5 here
+            poly_coord = np.asarray(obj["polygon"], dtype="f4") + 0.5
+            # CityscapesScript uses PIL.ImageDraw.polygon to rasterize
+            # polygons for evaluation. This function operates in integer space
+            # and draws each pixel whose center falls into the polygon.
+            # Therefore it draws a polygon which is 0.5 "fatter" in expectation.
+            # We therefore dilate the input polygon by 0.5 as our input.
+            poly = Polygon(poly_coord).buffer(0.5, resolution=4)
+
+            if not label.hasInstances or label.ignoreInEval:
+                # even if we won't store the polygon it still contributes to overlaps resolution
+                polygons_union = polygons_union.union(poly)
+                continue
+
+            # Take non-overlapping part of the polygon
+            poly_wo_overlaps = poly.difference(polygons_union)
+            if poly_wo_overlaps.is_empty:
+                continue
+            polygons_union = polygons_union.union(poly)
+
+            anno = {}
+            anno["iscrowd"] = label_name.endswith("group")
+            anno["category_id"] = label.id
+
+            if isinstance(poly_wo_overlaps, Polygon):
+                poly_list = [poly_wo_overlaps]
+            elif isinstance(poly_wo_overlaps, MultiPolygon):
+                poly_list = poly_wo_overlaps.geoms
+            else:
+                raise NotImplementedError("Unknown geometric structure {}".format(poly_wo_overlaps))
+
+            poly_coord = []
+            for poly_el in poly_list:
+                # COCO API can work only with exterior boundaries now, hence we store only them.
+                # TODO: store both exterior and interior boundaries once other parts of the
+                # codebase support holes in polygons.
+                poly_coord.append(list(chain(*poly_el.exterior.coords)))
+            anno["segmentation"] = poly_coord
+            (xmin, ymin, xmax, ymax) = poly_wo_overlaps.bounds
+
+            anno["bbox"] = (xmin, ymin, xmax, ymax)
+            anno["bbox_mode"] = BoxMode.XYXY_ABS
+
+            annos.append(anno)
+    else:
+        # See also the official annotation parsing scripts at
+        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/instances2dict.py  # noqa
+        with PathManager.open(instance_id_file, "rb") as f:
+            inst_image = np.asarray(Image.open(f), order="F")
+        # ids < 24 are stuff labels (filtering them first is about 5% faster)
+        flattened_ids = np.unique(inst_image[inst_image >= 24])
+
+        ret = {
+            "file_name": image_file,
+            "image_id": os.path.basename(image_file),
+            "height": inst_image.shape[0],
+            "width": inst_image.shape[1],
+        }
+
+        for instance_id in flattened_ids:
+            # For non-crowd annotations, instance_id // 1000 is the label_id
+            # Crowd annotations have <1000 instance ids
+            label_id = instance_id // 1000 if instance_id >= 1000 else instance_id
+            label = id2label[label_id]
+            if not label.hasInstances or label.ignoreInEval:
+                continue
+
+            anno = {}
+            anno["iscrowd"] = instance_id < 1000
+            anno["category_id"] = label.id
+
+            mask = np.asarray(inst_image == instance_id, dtype=np.uint8, order="F")
+
+            inds = np.nonzero(mask)
+            ymin, ymax = inds[0].min(), inds[0].max()
+            xmin, xmax = inds[1].min(), inds[1].max()
+            anno["bbox"] = (xmin, ymin, xmax, ymax)
+            if xmax <= xmin or ymax <= ymin:
+                continue
+            anno["bbox_mode"] = BoxMode.XYXY_ABS
+            if to_polygons:
+                # This conversion comes from D4809743 and D5171122,
+                # when Mask-RCNN was first developed.
+                contours = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[
+                    -2
+                ]
+                polygons = [c.reshape(-1).tolist() for c in contours if len(c) >= 3]
+                # opencv's can produce invalid polygons
+                if len(polygons) == 0:
+                    continue
+                anno["segmentation"] = polygons
+            else:
+                anno["segmentation"] = mask_util.encode(mask[:, :, None])[0]
+            annos.append(anno)
+    ret["annotations"] = annos
+    return ret
+
+
+if __name__ == "__main__":
+    """
+    Test the cityscapes dataset loader.
+
+    Usage:
+        python -m detectron2.data.data.cityscapes \
+            cityscapes/leftImg8bit/train cityscapes/gtFine/train
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("image_dir")
+    parser.add_argument("gt_dir")
+    parser.add_argument("--type", choices=["instance", "semantic"], default="instance")
+    args = parser.parse_args()
+    from detectron2.data.catalog import Metadata
+    from detectron2.utils.visualizer import Visualizer
+    from cityscapesscripts.helpers.labels import labels
+
+    logger = setup_logger(name=__name__)
+
+    dirname = "cityscapes-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+
+    if args.type == "instance":
+        dicts = load_cityscapes_instances(
+            args.image_dir, args.gt_dir, from_json=True, to_polygons=True
+        )
+        logger.info("Done loading {} samples.".format(len(dicts)))
+
+        thing_classes = [k.name for k in labels if k.hasInstances and not k.ignoreInEval]
+        meta = Metadata().set(thing_classes=thing_classes)
+
+    else:
+        dicts = load_cityscapes_semantic(args.image_dir, args.gt_dir)
+        logger.info("Done loading {} samples.".format(len(dicts)))
+
+        stuff_names = [k.name for k in labels if k.trainId != 255]
+        stuff_colors = [k.color for k in labels if k.trainId != 255]
+        meta = Metadata().set(stuff_names=stuff_names, stuff_colors=stuff_colors)
+
+    for d in dicts:
+        img = np.array(Image.open(PathManager.open(d["file_name"], "rb")))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        # cv2.imshow("a", vis.get_image()[:, :, ::-1])
+        # cv2.waitKey()
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/coco.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6f099e778e34cf89d267e13424d4f69240b7878
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/coco.py
@@ -0,0 +1,466 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import contextlib
+import datetime
+import io
+import json
+import logging
+import numpy as np
+import os
+import pycocotools.mask as mask_util
+from fvcore.common.file_io import PathManager, file_lock
+from fvcore.common.timer import Timer
+from PIL import Image
+
+from detectron2.structures import Boxes, BoxMode, PolygonMasks
+
+from .. import DatasetCatalog, MetadataCatalog
+
+"""
+This file contains functions to parse COCO-format annotations into dicts in "Detectron2 format".
+"""
+
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["load_coco_json", "load_sem_seg", "convert_to_coco_json"]
+
+
+def load_coco_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
+    """
+    Load a json file with COCO's instances annotation format.
+    Currently supports instance detection, instance segmentation,
+    and person keypoints annotations.
+
+    Args:
+        json_file (str): full path to the json file in COCO instances annotation format.
+        image_root (str or path-like): the directory where the images in this json file exists.
+        dataset_name (str): the name of the dataset (e.g., coco_2017_train).
+            If provided, this function will also put "thing_classes" into
+            the metadata associated with this dataset.
+        extra_annotation_keys (list[str]): list of per-annotation keys that should also be
+            loaded into the dataset dict (besides "iscrowd", "bbox", "keypoints",
+            "category_id", "segmentation"). The values for these keys will be returned as-is.
+            For example, the densepose annotations are loaded in this way.
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard dataset dicts format. (See
+        `Using Custom Datasets </tutorials/data.html>`_ )
+
+    Notes:
+        1. This function does not read the image files.
+           The results do not have the "image" field.
+    """
+    from pycocotools.coco import COCO
+
+    timer = Timer()
+    json_file = PathManager.get_local_path(json_file)
+    with contextlib.redirect_stdout(io.StringIO()):
+        coco_api = COCO(json_file)
+    if timer.seconds() > 1:
+        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+
+    id_map = None
+    if dataset_name is not None:
+        meta = MetadataCatalog.get(dataset_name)
+        cat_ids = sorted(coco_api.getCatIds())
+        cats = coco_api.loadCats(cat_ids)
+        # The categories in a custom json file may not be sorted.
+        thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
+        meta.thing_classes = thing_classes
+
+        # In COCO, certain category ids are artificially removed,
+        # and by convention they are always ignored.
+        # We deal with COCO's id issue and translate
+        # the category ids to contiguous ids in [0, 80).
+
+        # It works by looking at the "categories" field in the json, therefore
+        # if users' own json also have incontiguous ids, we'll
+        # apply this mapping as well but print a warning.
+        if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)):
+            if "coco" not in dataset_name:
+                logger.warning(
+                    """
+Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
+"""
+                )
+        id_map = {v: i for i, v in enumerate(cat_ids)}
+        meta.thing_dataset_id_to_contiguous_id = id_map
+
+    # sort indices for reproducible results
+    img_ids = sorted(coco_api.imgs.keys())
+    # imgs is a list of dicts, each looks something like:
+    # {'license': 4,
+    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+    #  'file_name': 'COCO_val2014_000000001268.jpg',
+    #  'height': 427,
+    #  'width': 640,
+    #  'date_captured': '2013-11-17 05:57:24',
+    #  'id': 1268}
+    imgs = coco_api.loadImgs(img_ids)
+    # anns is a list[list[dict]], where each dict is an annotation
+    # record for an object. The inner list enumerates the objects in an image
+    # and the outer list enumerates over images. Example of anns[0]:
+    # [{'segmentation': [[192.81,
+    #     247.09,
+    #     ...
+    #     219.03,
+    #     249.06]],
+    #   'area': 1035.749,
+    #   'iscrowd': 0,
+    #   'image_id': 1268,
+    #   'bbox': [192.81, 224.8, 74.73, 33.43],
+    #   'category_id': 16,
+    #   'id': 42986},
+    #  ...]
+    anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
+
+    if "minival" not in json_file:
+        # The popular valminusminival & minival annotations for COCO2014 contain this bug.
+        # However the ratio of buggy annotations there is tiny and does not affect accuracy.
+        # Therefore we explicitly white-list them.
+        ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+        assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
+            json_file
+        )
+
+    imgs_anns = list(zip(imgs, anns))
+
+    logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file))
+
+    dataset_dicts = []
+
+    ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + (extra_annotation_keys or [])
+
+    num_instances_without_valid_segmentation = 0
+
+    for (img_dict, anno_dict_list) in imgs_anns:
+        record = {}
+        record["file_name"] = os.path.join(image_root, img_dict["file_name"])
+        record["height"] = img_dict["height"]
+        record["width"] = img_dict["width"]
+        image_id = record["image_id"] = img_dict["id"]
+
+        objs = []
+        for anno in anno_dict_list:
+            # Check that the image_id in this annotation is the same as
+            # the image_id we're looking at.
+            # This fails only when the data parsing logic or the annotation file is buggy.
+
+            # The original COCO valminusminival2014 & minival2014 annotation files
+            # actually contains bugs that, together with certain ways of using COCO API,
+            # can trigger this assertion.
+            assert anno["image_id"] == image_id
+
+            assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.'
+
+            obj = {key: anno[key] for key in ann_keys if key in anno}
+
+            segm = anno.get("segmentation", None)
+            if segm:  # either list[list[float]] or dict(RLE)
+                if not isinstance(segm, dict):
+                    # filter out invalid polygons (< 3 points)
+                    segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+                    if len(segm) == 0:
+                        num_instances_without_valid_segmentation += 1
+                        continue  # ignore this instance
+                obj["segmentation"] = segm
+
+            keypts = anno.get("keypoints", None)
+            if keypts:  # list[int]
+                for idx, v in enumerate(keypts):
+                    if idx % 3 != 2:
+                        # COCO's segmentation coordinates are floating points in [0, H or W],
+                        # but keypoint coordinates are integers in [0, H-1 or W-1]
+                        # Therefore we assume the coordinates are "pixel indices" and
+                        # add 0.5 to convert to floating point coordinates.
+                        keypts[idx] = v + 0.5
+                obj["keypoints"] = keypts
+
+            obj["bbox_mode"] = BoxMode.XYWH_ABS
+            if id_map:
+                obj["category_id"] = id_map[obj["category_id"]]
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+
+    if num_instances_without_valid_segmentation > 0:
+        logger.warning(
+            "Filtered out {} instances without valid segmentation. "
+            "There might be issues in your dataset generation process.".format(
+                num_instances_without_valid_segmentation
+            )
+        )
+    return dataset_dicts
+
+
+def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
+    """
+    Load semantic segmentation data. All files under "gt_root" with "gt_ext" extension are
+    treated as ground truth annotations and all files under "image_root" with "image_ext" extension
+    as input images. Ground truth and input images are matched using file paths relative to
+    "gt_root" and "image_root" respectively without taking into account file extensions.
+    This works for COCO as well as some other data.
+
+    Args:
+        gt_root (str): full path to ground truth semantic segmentation files. Semantic segmentation
+            annotations are stored as images with integer values in pixels that represent
+            corresponding semantic labels.
+        image_root (str): the directory where the input images are.
+        gt_ext (str): file extension for ground truth annotations.
+        image_ext (str): file extension for input images.
+
+    Returns:
+        list[dict]:
+            a list of dicts in detectron2 standard format without instance-level
+            annotation.
+
+    Notes:
+        1. This function does not read the image and ground truth files.
+           The results do not have the "image" and "sem_seg" fields.
+    """
+
+    # We match input images with ground truth based on their relative filepaths (without file
+    # extensions) starting from 'image_root' and 'gt_root' respectively.
+    def file2id(folder_path, file_path):
+        # extract relative path starting from `folder_path`
+        image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
+        # remove file extension
+        image_id = os.path.splitext(image_id)[0]
+        return image_id
+
+    input_files = sorted(
+        (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
+        key=lambda file_path: file2id(image_root, file_path),
+    )
+    gt_files = sorted(
+        (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
+        key=lambda file_path: file2id(gt_root, file_path),
+    )
+
+    assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)
+
+    # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
+    if len(input_files) != len(gt_files):
+        logger.warn(
+            "Directory {} and {} has {} and {} files, respectively.".format(
+                image_root, gt_root, len(input_files), len(gt_files)
+            )
+        )
+        input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
+        gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
+        intersect = list(set(input_basenames) & set(gt_basenames))
+        # sort, otherwise each worker may obtain a list[dict] in different order
+        intersect = sorted(intersect)
+        logger.warn("Will use their intersection of {} files.".format(len(intersect)))
+        input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
+        gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]
+
+    logger.info(
+        "Loaded {} images with semantic segmentation from {}".format(len(input_files), image_root)
+    )
+
+    dataset_dicts = []
+    for (img_path, gt_path) in zip(input_files, gt_files):
+        record = {}
+        record["file_name"] = img_path
+        record["sem_seg_file_name"] = gt_path
+        dataset_dicts.append(record)
+
+    return dataset_dicts
+
+
+def convert_to_coco_dict(dataset_name):
+    """
+    Convert an instance detection/segmentation or keypoint detection dataset
+    in detectron2's standard format into COCO json format.
+
+    Generic dataset description can be found here:
+    https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset
+
+    COCO data format description can be found here:
+    http://cocodataset.org/#format-data
+
+    Args:
+        dataset_name (str):
+            name of the source dataset
+            Must be registered in DatastCatalog and in detectron2's standard format.
+            Must have corresponding metadata "thing_classes"
+    Returns:
+        coco_dict: serializable dict in COCO json format
+    """
+
+    dataset_dicts = DatasetCatalog.get(dataset_name)
+    metadata = MetadataCatalog.get(dataset_name)
+
+    # unmap the category mapping ids for COCO
+    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
+        reverse_id_mapping = {v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()}
+        reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id]  # noqa
+    else:
+        reverse_id_mapper = lambda contiguous_id: contiguous_id  # noqa
+
+    categories = [
+        {"id": reverse_id_mapper(id), "name": name}
+        for id, name in enumerate(metadata.thing_classes)
+    ]
+
+    logger.info("Converting dataset dicts into COCO format")
+    coco_images = []
+    coco_annotations = []
+
+    for image_id, image_dict in enumerate(dataset_dicts):
+        coco_image = {
+            "id": image_dict.get("image_id", image_id),
+            "width": image_dict["width"],
+            "height": image_dict["height"],
+            "file_name": image_dict["file_name"],
+        }
+        coco_images.append(coco_image)
+
+        anns_per_image = image_dict["annotations"]
+        for annotation in anns_per_image:
+            # create a new dict with only COCO fields
+            coco_annotation = {}
+
+            # COCO requirement: XYWH box format
+            bbox = annotation["bbox"]
+            bbox_mode = annotation["bbox_mode"]
+            bbox = BoxMode.convert(bbox, bbox_mode, BoxMode.XYWH_ABS)
+
+            # COCO requirement: instance area
+            if "segmentation" in annotation:
+                # Computing areas for instances by counting the pixels
+                segmentation = annotation["segmentation"]
+                # TODO: check segmentation type: RLE, BinaryMask or Polygon
+                if isinstance(segmentation, list):
+                    polygons = PolygonMasks([segmentation])
+                    area = polygons.area()[0].item()
+                elif isinstance(segmentation, dict):  # RLE
+                    area = mask_util.area(segmentation).item()
+                else:
+                    raise TypeError(f"Unknown segmentation type {type(segmentation)}!")
+            else:
+                # Computing areas using bounding boxes
+                bbox_xy = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+                area = Boxes([bbox_xy]).area()[0].item()
+
+            if "keypoints" in annotation:
+                keypoints = annotation["keypoints"]  # list[int]
+                for idx, v in enumerate(keypoints):
+                    if idx % 3 != 2:
+                        # COCO's segmentation coordinates are floating points in [0, H or W],
+                        # but keypoint coordinates are integers in [0, H-1 or W-1]
+                        # For COCO format consistency we substract 0.5
+                        # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
+                        keypoints[idx] = v - 0.5
+                if "num_keypoints" in annotation:
+                    num_keypoints = annotation["num_keypoints"]
+                else:
+                    num_keypoints = sum(kp > 0 for kp in keypoints[2::3])
+
+            # COCO requirement:
+            #   linking annotations to images
+            #   "id" field must start with 1
+            coco_annotation["id"] = len(coco_annotations) + 1
+            coco_annotation["image_id"] = coco_image["id"]
+            coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
+            coco_annotation["area"] = float(area)
+            coco_annotation["iscrowd"] = annotation.get("iscrowd", 0)
+            coco_annotation["category_id"] = reverse_id_mapper(annotation["category_id"])
+
+            # Add optional fields
+            if "keypoints" in annotation:
+                coco_annotation["keypoints"] = keypoints
+                coco_annotation["num_keypoints"] = num_keypoints
+
+            if "segmentation" in annotation:
+                coco_annotation["segmentation"] = annotation["segmentation"]
+                if isinstance(coco_annotation["segmentation"], dict):  # RLE
+                    coco_annotation["segmentation"]["counts"] = coco_annotation["segmentation"][
+                        "counts"
+                    ].decode("ascii")
+
+            coco_annotations.append(coco_annotation)
+
+    logger.info(
+        "Conversion finished, "
+        f"#images: {len(coco_images)}, #annotations: {len(coco_annotations)}"
+    )
+
+    info = {
+        "date_created": str(datetime.datetime.now()),
+        "description": "Automatically generated COCO json file for Detectron2.",
+    }
+    coco_dict = {
+        "info": info,
+        "images": coco_images,
+        "annotations": coco_annotations,
+        "categories": categories,
+        "licenses": None,
+    }
+    return coco_dict
+
+
+def convert_to_coco_json(dataset_name, output_file, allow_cached=True):
+    """
+    Converts dataset into COCO format and saves it to a json file.
+    dataset_name must be registered in DatasetCatalog and in detectron2's standard format.
+
+    Args:
+        dataset_name:
+            reference from the config file to the catalogs
+            must be registered in DatasetCatalog and in detectron2's standard format
+        output_file: path of json file that will be saved to
+        allow_cached: if json file is already present then skip conversion
+    """
+
+    # TODO: The dataset or the conversion script *may* change,
+    # a checksum would be useful for validating the cached data
+
+    PathManager.mkdirs(os.path.dirname(output_file))
+    with file_lock(output_file):
+        if PathManager.exists(output_file) and allow_cached:
+            logger.warning(
+                f"Using previously cached COCO format annotations at '{output_file}'. "
+                "You need to clear the cache file if your dataset has been modified."
+            )
+        else:
+            logger.info(f"Converting annotations of dataset '{dataset_name}' to COCO format ...)")
+            coco_dict = convert_to_coco_dict(dataset_name)
+
+            logger.info(f"Caching COCO format annotations at '{output_file}' ...")
+            with PathManager.open(output_file, "w") as f:
+                json.dump(coco_dict, f)
+
+
+if __name__ == "__main__":
+    """
+    Test the COCO json dataset loader.
+
+    Usage:
+        python -m detectron2.data.data.coco \
+            path/to/json path/to/image_root dataset_name
+
+        "dataset_name" can be "coco_2014_minival_100", or other
+        pre-registered ones
+    """
+    from detectron2.utils.logger import setup_logger
+    from detectron2.utils.visualizer import Visualizer
+    import detectron2.data.datasets  # noqa # add pre-defined metadata
+    import sys
+
+    logger = setup_logger(name=__name__)
+    assert sys.argv[3] in DatasetCatalog.list()
+    meta = MetadataCatalog.get(sys.argv[3])
+
+    dicts = load_coco_json(sys.argv[1], sys.argv[2], sys.argv[3])
+    logger.info("Done loading {} samples.".format(len(dicts)))
+
+    dirname = "coco-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts:
+        img = np.array(Image.open(d["file_name"]))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/lvis.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b95be350a775af78aa6412f560a29e825ba61a1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/lvis.py
@@ -0,0 +1,209 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import os
+from fvcore.common.file_io import PathManager
+from fvcore.common.timer import Timer
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+
+from .builtin_meta import _get_coco_instances_meta
+from .lvis_v0_5_categories import LVIS_CATEGORIES
+
+"""
+This file contains functions to parse LVIS-format annotations into dicts in the
+"Detectron2 format".
+"""
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["load_lvis_json", "register_lvis_instances", "get_lvis_instances_meta"]
+
+
+def register_lvis_instances(name, metadata, json_file, image_root):
+    """
+    Register a dataset in LVIS's json annotation format for instance detection and segmentation.
+
+    Args:
+        name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train".
+        metadata (dict): extra metadata associated with this dataset. It can be an empty dict.
+        json_file (str): path to the json instance annotation file.
+        image_root (str or path-like): directory which contains all the images.
+    """
+    DatasetCatalog.register(name, lambda: load_lvis_json(json_file, image_root, name))
+    MetadataCatalog.get(name).set(
+        json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata
+    )
+
+
+def load_lvis_json(json_file, image_root, dataset_name=None):
+    """
+    Load a json file in LVIS's annotation format.
+
+    Args:
+        json_file (str): full path to the LVIS json annotation file.
+        image_root (str): the directory where the images in this json file exists.
+        dataset_name (str): the name of the dataset (e.g., "lvis_v0.5_train").
+            If provided, this function will put "thing_classes" into the metadata
+            associated with this dataset.
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/data.html>`_ )
+
+    Notes:
+        1. This function does not read the image files.
+           The results do not have the "image" field.
+    """
+    from lvis import LVIS
+
+    json_file = PathManager.get_local_path(json_file)
+
+    timer = Timer()
+    lvis_api = LVIS(json_file)
+    if timer.seconds() > 1:
+        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+
+    if dataset_name is not None:
+        meta = get_lvis_instances_meta(dataset_name)
+        MetadataCatalog.get(dataset_name).set(**meta)
+
+    # sort indices for reproducible results
+    img_ids = sorted(lvis_api.imgs.keys())
+    # imgs is a list of dicts, each looks something like:
+    # {'license': 4,
+    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+    #  'file_name': 'COCO_val2014_000000001268.jpg',
+    #  'height': 427,
+    #  'width': 640,
+    #  'date_captured': '2013-11-17 05:57:24',
+    #  'id': 1268}
+    imgs = lvis_api.load_imgs(img_ids)
+    # anns is a list[list[dict]], where each dict is an annotation
+    # record for an object. The inner list enumerates the objects in an image
+    # and the outer list enumerates over images. Example of anns[0]:
+    # [{'segmentation': [[192.81,
+    #     247.09,
+    #     ...
+    #     219.03,
+    #     249.06]],
+    #   'area': 1035.749,
+    #   'image_id': 1268,
+    #   'bbox': [192.81, 224.8, 74.73, 33.43],
+    #   'category_id': 16,
+    #   'id': 42986},
+    #  ...]
+    anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]
+
+    # Sanity check that each annotation has a unique id
+    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+    assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format(
+        json_file
+    )
+
+    imgs_anns = list(zip(imgs, anns))
+
+    logger.info("Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file))
+
+    dataset_dicts = []
+
+    for (img_dict, anno_dict_list) in imgs_anns:
+        record = {}
+        file_name = img_dict["file_name"]
+        if img_dict["file_name"].startswith("COCO"):
+            # Convert form the COCO 2014 file naming convention of
+            # COCO_[train/val/test]2014_000000000000.jpg to the 2017 naming convention of
+            # 000000000000.jpg (LVIS v1 will fix this naming issue)
+            file_name = file_name[-16:]
+        record["file_name"] = os.path.join(image_root, file_name)
+        record["height"] = img_dict["height"]
+        record["width"] = img_dict["width"]
+        record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", [])
+        record["neg_category_ids"] = img_dict.get("neg_category_ids", [])
+        image_id = record["image_id"] = img_dict["id"]
+
+        objs = []
+        for anno in anno_dict_list:
+            # Check that the image_id in this annotation is the same as
+            # the image_id we're looking at.
+            # This fails only when the data parsing logic or the annotation file is buggy.
+            assert anno["image_id"] == image_id
+            obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS}
+            obj["category_id"] = anno["category_id"] - 1  # Convert 1-indexed to 0-indexed
+            segm = anno["segmentation"]  # list[list[float]]
+            # filter out invalid polygons (< 3 points)
+            valid_segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+            assert len(segm) == len(
+                valid_segm
+            ), "Annotation contains an invalid polygon with < 3 points"
+            assert len(segm) > 0
+            obj["segmentation"] = segm
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+
+    return dataset_dicts
+
+
+def get_lvis_instances_meta(dataset_name):
+    """
+    Load LVIS metadata.
+
+    Args:
+        dataset_name (str): LVIS dataset name without the split name (e.g., "lvis_v0.5").
+
+    Returns:
+        dict: LVIS metadata with keys: thing_classes
+    """
+    if "cocofied" in dataset_name:
+        return _get_coco_instances_meta()
+    if "v0.5" in dataset_name:
+        return _get_lvis_instances_meta_v0_5()
+    # There will be a v1 in the future
+    # elif dataset_name == "lvis_v1":
+    #   return get_lvis_instances_meta_v1()
+    raise ValueError("No built-in metadata for dataset {}".format(dataset_name))
+
+
+def _get_lvis_instances_meta_v0_5():
+    assert len(LVIS_CATEGORIES) == 1230
+    cat_ids = [k["id"] for k in LVIS_CATEGORIES]
+    assert min(cat_ids) == 1 and max(cat_ids) == len(
+        cat_ids
+    ), "Category ids are not in [1, #categories], as expected"
+    # Ensure that the category list is sorted by id
+    lvis_categories = sorted(LVIS_CATEGORIES, key=lambda x: x["id"])
+    thing_classes = [k["synonyms"][0] for k in lvis_categories]
+    meta = {"thing_classes": thing_classes}
+    return meta
+
+
+if __name__ == "__main__":
+    """
+    Test the LVIS json dataset loader.
+
+    Usage:
+        python -m detectron2.data.data.lvis \
+            path/to/json path/to/image_root dataset_name vis_limit
+    """
+    import sys
+    import numpy as np
+    from detectron2.utils.logger import setup_logger
+    from PIL import Image
+    import detectron2.data.datasets  # noqa # add pre-defined metadata
+    from detectron2.utils.visualizer import Visualizer
+
+    logger = setup_logger(name=__name__)
+    meta = MetadataCatalog.get(sys.argv[3])
+
+    dicts = load_lvis_json(sys.argv[1], sys.argv[2], sys.argv[3])
+    logger.info("Done loading {} samples.".format(len(dicts)))
+
+    dirname = "lvis-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts[: int(sys.argv[4])]:
+        img = np.array(Image.open(d["file_name"]))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/lvis_v0_5_categories.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/lvis_v0_5_categories.py
new file mode 100644
index 0000000000000000000000000000000000000000..8205e605f85dab3674c6f1600d7675eef86b160f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/lvis_v0_5_categories.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Autogen with
+# with open("lvis_v0.5_val.json", "r") as f:
+#     a = json.load(f)
+# c = a["categories"]
+# for x in c:
+#     del x["image_count"]
+#     del x["instance_count"]
+# LVIS_CATEGORIES = repr(c) + "  # noqa"
+
+# fmt: off
+LVIS_CATEGORIES = [{'frequency': 'r', 'id': 1, 'synset': 'acorn.n.01', 'synonyms': ['acorn'], 'def': 'nut from an oak tree', 'name': 'acorn'}, {'frequency': 'c', 'id': 2, 'synset': 'aerosol.n.02', 'synonyms': ['aerosol_can', 'spray_can'], 'def': 'a dispenser that holds a substance under pressure', 'name': 'aerosol_can'}, {'frequency': 'f', 'id': 3, 'synset': 'air_conditioner.n.01', 'synonyms': ['air_conditioner'], 'def': 'a machine that keeps air cool and dry', 'name': 'air_conditioner'}, {'frequency': 'f', 'id': 4, 'synset': 'airplane.n.01', 'synonyms': ['airplane', 'aeroplane'], 'def': 'an aircraft that has a fixed wing and is powered by propellers or jets', 'name': 'airplane'}, {'frequency': 'c', 'id': 5, 'synset': 'alarm_clock.n.01', 'synonyms': ['alarm_clock'], 'def': 'a clock that wakes a sleeper at some preset time', 'name': 'alarm_clock'}, {'frequency': 'c', 'id': 6, 'synset': 'alcohol.n.01', 'synonyms': ['alcohol', 'alcoholic_beverage'], 'def': 'a liquor or brew containing alcohol as the active agent', 'name': 'alcohol'}, {'frequency': 'r', 'id': 7, 'synset': 'alligator.n.02', 'synonyms': ['alligator', 'gator'], 'def': 'amphibious reptiles related to crocodiles but with shorter broader snouts', 'name': 'alligator'}, {'frequency': 'c', 'id': 8, 'synset': 'almond.n.02', 'synonyms': ['almond'], 'def': 'oval-shaped edible seed of the almond tree', 'name': 'almond'}, {'frequency': 'c', 'id': 9, 'synset': 'ambulance.n.01', 'synonyms': ['ambulance'], 'def': 'a vehicle that takes people to and from hospitals', 'name': 'ambulance'}, {'frequency': 'r', 'id': 10, 'synset': 'amplifier.n.01', 'synonyms': ['amplifier'], 'def': 'electronic equipment that increases strength of signals', 'name': 'amplifier'}, {'frequency': 'c', 'id': 11, 'synset': 'anklet.n.03', 'synonyms': ['anklet', 'ankle_bracelet'], 'def': 'an ornament worn around the ankle', 'name': 'anklet'}, {'frequency': 'f', 'id': 12, 'synset': 'antenna.n.01', 'synonyms': ['antenna', 'aerial', 'transmitting_aerial'], 'def': 'an electrical device that sends or receives radio or television signals', 'name': 'antenna'}, {'frequency': 'f', 'id': 13, 'synset': 'apple.n.01', 'synonyms': ['apple'], 'def': 'fruit with red or yellow or green skin and sweet to tart crisp whitish flesh', 'name': 'apple'}, {'frequency': 'r', 'id': 14, 'synset': 'apple_juice.n.01', 'synonyms': ['apple_juice'], 'def': 'the juice of apples', 'name': 'apple_juice'}, {'frequency': 'r', 'id': 15, 'synset': 'applesauce.n.01', 'synonyms': ['applesauce'], 'def': 'puree of stewed apples usually sweetened and spiced', 'name': 'applesauce'}, {'frequency': 'r', 'id': 16, 'synset': 'apricot.n.02', 'synonyms': ['apricot'], 'def': 'downy yellow to rosy-colored fruit resembling a small peach', 'name': 'apricot'}, {'frequency': 'f', 'id': 17, 'synset': 'apron.n.01', 'synonyms': ['apron'], 'def': 'a garment of cloth that is tied about the waist and worn to protect clothing', 'name': 'apron'}, {'frequency': 'c', 'id': 18, 'synset': 'aquarium.n.01', 'synonyms': ['aquarium', 'fish_tank'], 'def': 'a tank/pool/bowl filled with water for keeping live fish and underwater animals', 'name': 'aquarium'}, {'frequency': 'c', 'id': 19, 'synset': 'armband.n.02', 'synonyms': ['armband'], 'def': 'a band worn around the upper arm', 'name': 'armband'}, {'frequency': 'f', 'id': 20, 'synset': 'armchair.n.01', 'synonyms': ['armchair'], 'def': 'chair with a support on each side for arms', 'name': 'armchair'}, {'frequency': 'r', 'id': 21, 'synset': 'armoire.n.01', 'synonyms': ['armoire'], 'def': 'a large wardrobe or cabinet', 'name': 'armoire'}, {'frequency': 'r', 'id': 22, 'synset': 'armor.n.01', 'synonyms': ['armor', 'armour'], 'def': 'protective covering made of metal and used in combat', 'name': 'armor'}, {'frequency': 'c', 'id': 23, 'synset': 'artichoke.n.02', 'synonyms': ['artichoke'], 'def': 'a thistlelike flower head with edible fleshy leaves and heart', 'name': 'artichoke'}, {'frequency': 'f', 'id': 24, 'synset': 'ashcan.n.01', 'synonyms': ['trash_can', 'garbage_can', 'wastebin', 'dustbin', 'trash_barrel', 'trash_bin'], 'def': 'a bin that holds rubbish until it is collected', 'name': 'trash_can'}, {'frequency': 'c', 'id': 25, 'synset': 'ashtray.n.01', 'synonyms': ['ashtray'], 'def': "a receptacle for the ash from smokers' cigars or cigarettes", 'name': 'ashtray'}, {'frequency': 'c', 'id': 26, 'synset': 'asparagus.n.02', 'synonyms': ['asparagus'], 'def': 'edible young shoots of the asparagus plant', 'name': 'asparagus'}, {'frequency': 'c', 'id': 27, 'synset': 'atomizer.n.01', 'synonyms': ['atomizer', 'atomiser', 'spray', 'sprayer', 'nebulizer', 'nebuliser'], 'def': 'a dispenser that turns a liquid (such as perfume) into a fine mist', 'name': 'atomizer'}, {'frequency': 'c', 'id': 28, 'synset': 'avocado.n.01', 'synonyms': ['avocado'], 'def': 'a pear-shaped fruit with green or blackish skin and rich yellowish pulp enclosing a single large seed', 'name': 'avocado'}, {'frequency': 'c', 'id': 29, 'synset': 'award.n.02', 'synonyms': ['award', 'accolade'], 'def': 'a tangible symbol signifying approval or distinction', 'name': 'award'}, {'frequency': 'f', 'id': 30, 'synset': 'awning.n.01', 'synonyms': ['awning'], 'def': 'a canopy made of canvas to shelter people or things from rain or sun', 'name': 'awning'}, {'frequency': 'r', 'id': 31, 'synset': 'ax.n.01', 'synonyms': ['ax', 'axe'], 'def': 'an edge tool with a heavy bladed head mounted across a handle', 'name': 'ax'}, {'frequency': 'f', 'id': 32, 'synset': 'baby_buggy.n.01', 'synonyms': ['baby_buggy', 'baby_carriage', 'perambulator', 'pram', 'stroller'], 'def': 'a small vehicle with four wheels in which a baby or child is pushed around', 'name': 'baby_buggy'}, {'frequency': 'c', 'id': 33, 'synset': 'backboard.n.01', 'synonyms': ['basketball_backboard'], 'def': 'a raised vertical board with basket attached; used to play basketball', 'name': 'basketball_backboard'}, {'frequency': 'f', 'id': 34, 'synset': 'backpack.n.01', 'synonyms': ['backpack', 'knapsack', 'packsack', 'rucksack', 'haversack'], 'def': 'a bag carried by a strap on your back or shoulder', 'name': 'backpack'}, {'frequency': 'f', 'id': 35, 'synset': 'bag.n.04', 'synonyms': ['handbag', 'purse', 'pocketbook'], 'def': 'a container used for carrying money and small personal items or accessories', 'name': 'handbag'}, {'frequency': 'f', 'id': 36, 'synset': 'bag.n.06', 'synonyms': ['suitcase', 'baggage', 'luggage'], 'def': 'cases used to carry belongings when traveling', 'name': 'suitcase'}, {'frequency': 'c', 'id': 37, 'synset': 'bagel.n.01', 'synonyms': ['bagel', 'beigel'], 'def': 'glazed yeast-raised doughnut-shaped roll with hard crust', 'name': 'bagel'}, {'frequency': 'r', 'id': 38, 'synset': 'bagpipe.n.01', 'synonyms': ['bagpipe'], 'def': 'a tubular wind instrument; the player blows air into a bag and squeezes it out', 'name': 'bagpipe'}, {'frequency': 'r', 'id': 39, 'synset': 'baguet.n.01', 'synonyms': ['baguet', 'baguette'], 'def': 'narrow French stick loaf', 'name': 'baguet'}, {'frequency': 'r', 'id': 40, 'synset': 'bait.n.02', 'synonyms': ['bait', 'lure'], 'def': 'something used to lure fish or other animals into danger so they can be trapped or killed', 'name': 'bait'}, {'frequency': 'f', 'id': 41, 'synset': 'ball.n.06', 'synonyms': ['ball'], 'def': 'a spherical object used as a plaything', 'name': 'ball'}, {'frequency': 'r', 'id': 42, 'synset': 'ballet_skirt.n.01', 'synonyms': ['ballet_skirt', 'tutu'], 'def': 'very short skirt worn by ballerinas', 'name': 'ballet_skirt'}, {'frequency': 'f', 'id': 43, 'synset': 'balloon.n.01', 'synonyms': ['balloon'], 'def': 'large tough nonrigid bag filled with gas or heated air', 'name': 'balloon'}, {'frequency': 'c', 'id': 44, 'synset': 'bamboo.n.02', 'synonyms': ['bamboo'], 'def': 'woody tropical grass having hollow woody stems', 'name': 'bamboo'}, {'frequency': 'f', 'id': 45, 'synset': 'banana.n.02', 'synonyms': ['banana'], 'def': 'elongated crescent-shaped yellow fruit with soft sweet flesh', 'name': 'banana'}, {'frequency': 'r', 'id': 46, 'synset': 'band_aid.n.01', 'synonyms': ['Band_Aid'], 'def': 'trade name for an adhesive bandage to cover small cuts or blisters', 'name': 'Band_Aid'}, {'frequency': 'c', 'id': 47, 'synset': 'bandage.n.01', 'synonyms': ['bandage'], 'def': 'a piece of soft material that covers and protects an injured part of the body', 'name': 'bandage'}, {'frequency': 'c', 'id': 48, 'synset': 'bandanna.n.01', 'synonyms': ['bandanna', 'bandana'], 'def': 'large and brightly colored handkerchief; often used as a neckerchief', 'name': 'bandanna'}, {'frequency': 'r', 'id': 49, 'synset': 'banjo.n.01', 'synonyms': ['banjo'], 'def': 'a stringed instrument of the guitar family with a long neck and circular body', 'name': 'banjo'}, {'frequency': 'f', 'id': 50, 'synset': 'banner.n.01', 'synonyms': ['banner', 'streamer'], 'def': 'long strip of cloth or paper used for decoration or advertising', 'name': 'banner'}, {'frequency': 'r', 'id': 51, 'synset': 'barbell.n.01', 'synonyms': ['barbell'], 'def': 'a bar to which heavy discs are attached at each end; used in weightlifting', 'name': 'barbell'}, {'frequency': 'r', 'id': 52, 'synset': 'barge.n.01', 'synonyms': ['barge'], 'def': 'a flatbottom boat for carrying heavy loads (especially on canals)', 'name': 'barge'}, {'frequency': 'f', 'id': 53, 'synset': 'barrel.n.02', 'synonyms': ['barrel', 'cask'], 'def': 'a cylindrical container that holds liquids', 'name': 'barrel'}, {'frequency': 'c', 'id': 54, 'synset': 'barrette.n.01', 'synonyms': ['barrette'], 'def': "a pin for holding women's hair in place", 'name': 'barrette'}, {'frequency': 'c', 'id': 55, 'synset': 'barrow.n.03', 'synonyms': ['barrow', 'garden_cart', 'lawn_cart', 'wheelbarrow'], 'def': 'a cart for carrying small loads; has handles and one or more wheels', 'name': 'barrow'}, {'frequency': 'f', 'id': 56, 'synset': 'base.n.03', 'synonyms': ['baseball_base'], 'def': 'a place that the runner must touch before scoring', 'name': 'baseball_base'}, {'frequency': 'f', 'id': 57, 'synset': 'baseball.n.02', 'synonyms': ['baseball'], 'def': 'a ball used in playing baseball', 'name': 'baseball'}, {'frequency': 'f', 'id': 58, 'synset': 'baseball_bat.n.01', 'synonyms': ['baseball_bat'], 'def': 'an implement used in baseball by the batter', 'name': 'baseball_bat'}, {'frequency': 'f', 'id': 59, 'synset': 'baseball_cap.n.01', 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'def': 'a cap with a bill', 'name': 'baseball_cap'}, {'frequency': 'f', 'id': 60, 'synset': 'baseball_glove.n.01', 'synonyms': ['baseball_glove', 'baseball_mitt'], 'def': 'the handwear used by fielders in playing baseball', 'name': 'baseball_glove'}, {'frequency': 'f', 'id': 61, 'synset': 'basket.n.01', 'synonyms': ['basket', 'handbasket'], 'def': 'a container that is usually woven and has handles', 'name': 'basket'}, {'frequency': 'c', 'id': 62, 'synset': 'basket.n.03', 'synonyms': ['basketball_hoop'], 'def': 'metal hoop supporting a net through which players try to throw the basketball', 'name': 'basketball_hoop'}, {'frequency': 'c', 'id': 63, 'synset': 'basketball.n.02', 'synonyms': ['basketball'], 'def': 'an inflated ball used in playing basketball', 'name': 'basketball'}, {'frequency': 'r', 'id': 64, 'synset': 'bass_horn.n.01', 'synonyms': ['bass_horn', 'sousaphone', 'tuba'], 'def': 'the lowest brass wind instrument', 'name': 'bass_horn'}, {'frequency': 'r', 'id': 65, 'synset': 'bat.n.01', 'synonyms': ['bat_(animal)'], 'def': 'nocturnal mouselike mammal with forelimbs modified to form membranous wings', 'name': 'bat_(animal)'}, {'frequency': 'f', 'id': 66, 'synset': 'bath_mat.n.01', 'synonyms': ['bath_mat'], 'def': 'a heavy towel or mat to stand on while drying yourself after a bath', 'name': 'bath_mat'}, {'frequency': 'f', 'id': 67, 'synset': 'bath_towel.n.01', 'synonyms': ['bath_towel'], 'def': 'a large towel; to dry yourself after a bath', 'name': 'bath_towel'}, {'frequency': 'c', 'id': 68, 'synset': 'bathrobe.n.01', 'synonyms': ['bathrobe'], 'def': 'a loose-fitting robe of towelling; worn after a bath or swim', 'name': 'bathrobe'}, {'frequency': 'f', 'id': 69, 'synset': 'bathtub.n.01', 'synonyms': ['bathtub', 'bathing_tub'], 'def': 'a large open container that you fill with water and use to wash the body', 'name': 'bathtub'}, {'frequency': 'r', 'id': 70, 'synset': 'batter.n.02', 'synonyms': ['batter_(food)'], 'def': 'a liquid or semiliquid mixture, as of flour, eggs, and milk, used in cooking', 'name': 'batter_(food)'}, {'frequency': 'c', 'id': 71, 'synset': 'battery.n.02', 'synonyms': ['battery'], 'def': 'a portable device that produces electricity', 'name': 'battery'}, {'frequency': 'r', 'id': 72, 'synset': 'beach_ball.n.01', 'synonyms': ['beachball'], 'def': 'large and light ball; for play at the seaside', 'name': 'beachball'}, {'frequency': 'c', 'id': 73, 'synset': 'bead.n.01', 'synonyms': ['bead'], 'def': 'a small ball with a hole through the middle used for ornamentation, jewellery, etc.', 'name': 'bead'}, {'frequency': 'r', 'id': 74, 'synset': 'beaker.n.01', 'synonyms': ['beaker'], 'def': 'a flatbottomed jar made of glass or plastic; used for chemistry', 'name': 'beaker'}, {'frequency': 'c', 'id': 75, 'synset': 'bean_curd.n.01', 'synonyms': ['bean_curd', 'tofu'], 'def': 'cheeselike food made of curdled soybean milk', 'name': 'bean_curd'}, {'frequency': 'c', 'id': 76, 'synset': 'beanbag.n.01', 'synonyms': ['beanbag'], 'def': 'a bag filled with dried beans or similar items; used in games or to sit on', 'name': 'beanbag'}, {'frequency': 'f', 'id': 77, 'synset': 'beanie.n.01', 'synonyms': ['beanie', 'beany'], 'def': 'a small skullcap; formerly worn by schoolboys and college freshmen', 'name': 'beanie'}, {'frequency': 'f', 'id': 78, 'synset': 'bear.n.01', 'synonyms': ['bear'], 'def': 'large carnivorous or omnivorous mammals with shaggy coats and claws', 'name': 'bear'}, {'frequency': 'f', 'id': 79, 'synset': 'bed.n.01', 'synonyms': ['bed'], 'def': 'a piece of furniture that provides a place to sleep', 'name': 'bed'}, {'frequency': 'c', 'id': 80, 'synset': 'bedspread.n.01', 'synonyms': ['bedspread', 'bedcover', 'bed_covering', 'counterpane', 'spread'], 'def': 'decorative cover for a bed', 'name': 'bedspread'}, {'frequency': 'f', 'id': 81, 'synset': 'beef.n.01', 'synonyms': ['cow'], 'def': 'cattle that are reared for their meat', 'name': 'cow'}, {'frequency': 'c', 'id': 82, 'synset': 'beef.n.02', 'synonyms': ['beef_(food)', 'boeuf_(food)'], 'def': 'meat from an adult domestic bovine', 'name': 'beef_(food)'}, {'frequency': 'r', 'id': 83, 'synset': 'beeper.n.01', 'synonyms': ['beeper', 'pager'], 'def': 'an device that beeps when the person carrying it is being paged', 'name': 'beeper'}, {'frequency': 'f', 'id': 84, 'synset': 'beer_bottle.n.01', 'synonyms': ['beer_bottle'], 'def': 'a bottle that holds beer', 'name': 'beer_bottle'}, {'frequency': 'c', 'id': 85, 'synset': 'beer_can.n.01', 'synonyms': ['beer_can'], 'def': 'a can that holds beer', 'name': 'beer_can'}, {'frequency': 'r', 'id': 86, 'synset': 'beetle.n.01', 'synonyms': ['beetle'], 'def': 'insect with hard wing covers', 'name': 'beetle'}, {'frequency': 'f', 'id': 87, 'synset': 'bell.n.01', 'synonyms': ['bell'], 'def': 'a hollow device made of metal that makes a ringing sound when struck', 'name': 'bell'}, {'frequency': 'f', 'id': 88, 'synset': 'bell_pepper.n.02', 'synonyms': ['bell_pepper', 'capsicum'], 'def': 'large bell-shaped sweet pepper in green or red or yellow or orange or black varieties', 'name': 'bell_pepper'}, {'frequency': 'f', 'id': 89, 'synset': 'belt.n.02', 'synonyms': ['belt'], 'def': 'a band to tie or buckle around the body (usually at the waist)', 'name': 'belt'}, {'frequency': 'f', 'id': 90, 'synset': 'belt_buckle.n.01', 'synonyms': ['belt_buckle'], 'def': 'the buckle used to fasten a belt', 'name': 'belt_buckle'}, {'frequency': 'f', 'id': 91, 'synset': 'bench.n.01', 'synonyms': ['bench'], 'def': 'a long seat for more than one person', 'name': 'bench'}, {'frequency': 'c', 'id': 92, 'synset': 'beret.n.01', 'synonyms': ['beret'], 'def': 'a cap with no brim or bill; made of soft cloth', 'name': 'beret'}, {'frequency': 'c', 'id': 93, 'synset': 'bib.n.02', 'synonyms': ['bib'], 'def': 'a napkin tied under the chin of a child while eating', 'name': 'bib'}, {'frequency': 'r', 'id': 94, 'synset': 'bible.n.01', 'synonyms': ['Bible'], 'def': 'the sacred writings of the Christian religions', 'name': 'Bible'}, {'frequency': 'f', 'id': 95, 'synset': 'bicycle.n.01', 'synonyms': ['bicycle', 'bike_(bicycle)'], 'def': 'a wheeled vehicle that has two wheels and is moved by foot pedals', 'name': 'bicycle'}, {'frequency': 'f', 'id': 96, 'synset': 'bill.n.09', 'synonyms': ['visor', 'vizor'], 'def': 'a brim that projects to the front to shade the eyes', 'name': 'visor'}, {'frequency': 'c', 'id': 97, 'synset': 'binder.n.03', 'synonyms': ['binder', 'ring-binder'], 'def': 'holds loose papers or magazines', 'name': 'binder'}, {'frequency': 'c', 'id': 98, 'synset': 'binoculars.n.01', 'synonyms': ['binoculars', 'field_glasses', 'opera_glasses'], 'def': 'an optical instrument designed for simultaneous use by both eyes', 'name': 'binoculars'}, {'frequency': 'f', 'id': 99, 'synset': 'bird.n.01', 'synonyms': ['bird'], 'def': 'animal characterized by feathers and wings', 'name': 'bird'}, {'frequency': 'r', 'id': 100, 'synset': 'bird_feeder.n.01', 'synonyms': ['birdfeeder'], 'def': 'an outdoor device that supplies food for wild birds', 'name': 'birdfeeder'}, {'frequency': 'r', 'id': 101, 'synset': 'birdbath.n.01', 'synonyms': ['birdbath'], 'def': 'an ornamental basin (usually in a garden) for birds to bathe in', 'name': 'birdbath'}, {'frequency': 'c', 'id': 102, 'synset': 'birdcage.n.01', 'synonyms': ['birdcage'], 'def': 'a cage in which a bird can be kept', 'name': 'birdcage'}, {'frequency': 'c', 'id': 103, 'synset': 'birdhouse.n.01', 'synonyms': ['birdhouse'], 'def': 'a shelter for birds', 'name': 'birdhouse'}, {'frequency': 'f', 'id': 104, 'synset': 'birthday_cake.n.01', 'synonyms': ['birthday_cake'], 'def': 'decorated cake served at a birthday party', 'name': 'birthday_cake'}, {'frequency': 'r', 'id': 105, 'synset': 'birthday_card.n.01', 'synonyms': ['birthday_card'], 'def': 'a card expressing a birthday greeting', 'name': 'birthday_card'}, {'frequency': 'r', 'id': 106, 'synset': 'biscuit.n.01', 'synonyms': ['biscuit_(bread)'], 'def': 'small round bread leavened with baking-powder or soda', 'name': 'biscuit_(bread)'}, {'frequency': 'r', 'id': 107, 'synset': 'black_flag.n.01', 'synonyms': ['pirate_flag'], 'def': 'a flag usually bearing a white skull and crossbones on a black background', 'name': 'pirate_flag'}, {'frequency': 'c', 'id': 108, 'synset': 'black_sheep.n.02', 'synonyms': ['black_sheep'], 'def': 'sheep with a black coat', 'name': 'black_sheep'}, {'frequency': 'c', 'id': 109, 'synset': 'blackboard.n.01', 'synonyms': ['blackboard', 'chalkboard'], 'def': 'sheet of slate; for writing with chalk', 'name': 'blackboard'}, {'frequency': 'f', 'id': 110, 'synset': 'blanket.n.01', 'synonyms': ['blanket'], 'def': 'bedding that keeps a person warm in bed', 'name': 'blanket'}, {'frequency': 'c', 'id': 111, 'synset': 'blazer.n.01', 'synonyms': ['blazer', 'sport_jacket', 'sport_coat', 'sports_jacket', 'sports_coat'], 'def': 'lightweight jacket; often striped in the colors of a club or school', 'name': 'blazer'}, {'frequency': 'f', 'id': 112, 'synset': 'blender.n.01', 'synonyms': ['blender', 'liquidizer', 'liquidiser'], 'def': 'an electrically powered mixer that mix or chop or liquefy foods', 'name': 'blender'}, {'frequency': 'r', 'id': 113, 'synset': 'blimp.n.02', 'synonyms': ['blimp'], 'def': 'a small nonrigid airship used for observation or as a barrage balloon', 'name': 'blimp'}, {'frequency': 'c', 'id': 114, 'synset': 'blinker.n.01', 'synonyms': ['blinker', 'flasher'], 'def': 'a light that flashes on and off; used as a signal or to send messages', 'name': 'blinker'}, {'frequency': 'c', 'id': 115, 'synset': 'blueberry.n.02', 'synonyms': ['blueberry'], 'def': 'sweet edible dark-blue berries of blueberry plants', 'name': 'blueberry'}, {'frequency': 'r', 'id': 116, 'synset': 'boar.n.02', 'synonyms': ['boar'], 'def': 'an uncastrated male hog', 'name': 'boar'}, {'frequency': 'r', 'id': 117, 'synset': 'board.n.09', 'synonyms': ['gameboard'], 'def': 'a flat portable surface (usually rectangular) designed for board games', 'name': 'gameboard'}, {'frequency': 'f', 'id': 118, 'synset': 'boat.n.01', 'synonyms': ['boat', 'ship_(boat)'], 'def': 'a vessel for travel on water', 'name': 'boat'}, {'frequency': 'c', 'id': 119, 'synset': 'bobbin.n.01', 'synonyms': ['bobbin', 'spool', 'reel'], 'def': 'a thing around which thread/tape/film or other flexible materials can be wound', 'name': 'bobbin'}, {'frequency': 'r', 'id': 120, 'synset': 'bobby_pin.n.01', 'synonyms': ['bobby_pin', 'hairgrip'], 'def': 'a flat wire hairpin used to hold bobbed hair in place', 'name': 'bobby_pin'}, {'frequency': 'c', 'id': 121, 'synset': 'boiled_egg.n.01', 'synonyms': ['boiled_egg', 'coddled_egg'], 'def': 'egg cooked briefly in the shell in gently boiling water', 'name': 'boiled_egg'}, {'frequency': 'r', 'id': 122, 'synset': 'bolo_tie.n.01', 'synonyms': ['bolo_tie', 'bolo', 'bola_tie', 'bola'], 'def': 'a cord fastened around the neck with an ornamental clasp and worn as a necktie', 'name': 'bolo_tie'}, {'frequency': 'c', 'id': 123, 'synset': 'bolt.n.03', 'synonyms': ['deadbolt'], 'def': 'the part of a lock that is engaged or withdrawn with a key', 'name': 'deadbolt'}, {'frequency': 'f', 'id': 124, 'synset': 'bolt.n.06', 'synonyms': ['bolt'], 'def': 'a screw that screws into a nut to form a fastener', 'name': 'bolt'}, {'frequency': 'r', 'id': 125, 'synset': 'bonnet.n.01', 'synonyms': ['bonnet'], 'def': 'a hat tied under the chin', 'name': 'bonnet'}, {'frequency': 'f', 'id': 126, 'synset': 'book.n.01', 'synonyms': ['book'], 'def': 'a written work or composition that has been published', 'name': 'book'}, {'frequency': 'r', 'id': 127, 'synset': 'book_bag.n.01', 'synonyms': ['book_bag'], 'def': 'a bag in which students carry their books', 'name': 'book_bag'}, {'frequency': 'c', 'id': 128, 'synset': 'bookcase.n.01', 'synonyms': ['bookcase'], 'def': 'a piece of furniture with shelves for storing books', 'name': 'bookcase'}, {'frequency': 'c', 'id': 129, 'synset': 'booklet.n.01', 'synonyms': ['booklet', 'brochure', 'leaflet', 'pamphlet'], 'def': 'a small book usually having a paper cover', 'name': 'booklet'}, {'frequency': 'r', 'id': 130, 'synset': 'bookmark.n.01', 'synonyms': ['bookmark', 'bookmarker'], 'def': 'a marker (a piece of paper or ribbon) placed between the pages of a book', 'name': 'bookmark'}, {'frequency': 'r', 'id': 131, 'synset': 'boom.n.04', 'synonyms': ['boom_microphone', 'microphone_boom'], 'def': 'a pole carrying an overhead microphone projected over a film or tv set', 'name': 'boom_microphone'}, {'frequency': 'f', 'id': 132, 'synset': 'boot.n.01', 'synonyms': ['boot'], 'def': 'footwear that covers the whole foot and lower leg', 'name': 'boot'}, {'frequency': 'f', 'id': 133, 'synset': 'bottle.n.01', 'synonyms': ['bottle'], 'def': 'a glass or plastic vessel used for storing drinks or other liquids', 'name': 'bottle'}, {'frequency': 'c', 'id': 134, 'synset': 'bottle_opener.n.01', 'synonyms': ['bottle_opener'], 'def': 'an opener for removing caps or corks from bottles', 'name': 'bottle_opener'}, {'frequency': 'c', 'id': 135, 'synset': 'bouquet.n.01', 'synonyms': ['bouquet'], 'def': 'an arrangement of flowers that is usually given as a present', 'name': 'bouquet'}, {'frequency': 'r', 'id': 136, 'synset': 'bow.n.04', 'synonyms': ['bow_(weapon)'], 'def': 'a weapon for shooting arrows', 'name': 'bow_(weapon)'}, {'frequency': 'f', 'id': 137, 'synset': 'bow.n.08', 'synonyms': ['bow_(decorative_ribbons)'], 'def': 'a decorative interlacing of ribbons', 'name': 'bow_(decorative_ribbons)'}, {'frequency': 'f', 'id': 138, 'synset': 'bow_tie.n.01', 'synonyms': ['bow-tie', 'bowtie'], 'def': "a man's tie that ties in a bow", 'name': 'bow-tie'}, {'frequency': 'f', 'id': 139, 'synset': 'bowl.n.03', 'synonyms': ['bowl'], 'def': 'a dish that is round and open at the top for serving foods', 'name': 'bowl'}, {'frequency': 'r', 'id': 140, 'synset': 'bowl.n.08', 'synonyms': ['pipe_bowl'], 'def': 'a small round container that is open at the top for holding tobacco', 'name': 'pipe_bowl'}, {'frequency': 'c', 'id': 141, 'synset': 'bowler_hat.n.01', 'synonyms': ['bowler_hat', 'bowler', 'derby_hat', 'derby', 'plug_hat'], 'def': 'a felt hat that is round and hard with a narrow brim', 'name': 'bowler_hat'}, {'frequency': 'r', 'id': 142, 'synset': 'bowling_ball.n.01', 'synonyms': ['bowling_ball'], 'def': 'a large ball with finger holes used in the sport of bowling', 'name': 'bowling_ball'}, {'frequency': 'r', 'id': 143, 'synset': 'bowling_pin.n.01', 'synonyms': ['bowling_pin'], 'def': 'a club-shaped wooden object used in bowling', 'name': 'bowling_pin'}, {'frequency': 'r', 'id': 144, 'synset': 'boxing_glove.n.01', 'synonyms': ['boxing_glove'], 'def': 'large glove coverings the fists of a fighter worn for the sport of boxing', 'name': 'boxing_glove'}, {'frequency': 'c', 'id': 145, 'synset': 'brace.n.06', 'synonyms': ['suspenders'], 'def': 'elastic straps that hold trousers up (usually used in the plural)', 'name': 'suspenders'}, {'frequency': 'f', 'id': 146, 'synset': 'bracelet.n.02', 'synonyms': ['bracelet', 'bangle'], 'def': 'jewelry worn around the wrist for decoration', 'name': 'bracelet'}, {'frequency': 'r', 'id': 147, 'synset': 'brass.n.07', 'synonyms': ['brass_plaque'], 'def': 'a memorial made of brass', 'name': 'brass_plaque'}, {'frequency': 'c', 'id': 148, 'synset': 'brassiere.n.01', 'synonyms': ['brassiere', 'bra', 'bandeau'], 'def': 'an undergarment worn by women to support their breasts', 'name': 'brassiere'}, {'frequency': 'c', 'id': 149, 'synset': 'bread-bin.n.01', 'synonyms': ['bread-bin', 'breadbox'], 'def': 'a container used to keep bread or cake in', 'name': 'bread-bin'}, {'frequency': 'r', 'id': 150, 'synset': 'breechcloth.n.01', 'synonyms': ['breechcloth', 'breechclout', 'loincloth'], 'def': 'a garment that provides covering for the loins', 'name': 'breechcloth'}, {'frequency': 'c', 'id': 151, 'synset': 'bridal_gown.n.01', 'synonyms': ['bridal_gown', 'wedding_gown', 'wedding_dress'], 'def': 'a gown worn by the bride at a wedding', 'name': 'bridal_gown'}, {'frequency': 'c', 'id': 152, 'synset': 'briefcase.n.01', 'synonyms': ['briefcase'], 'def': 'a case with a handle; for carrying papers or files or books', 'name': 'briefcase'}, {'frequency': 'c', 'id': 153, 'synset': 'bristle_brush.n.01', 'synonyms': ['bristle_brush'], 'def': 'a brush that is made with the short stiff hairs of an animal or plant', 'name': 'bristle_brush'}, {'frequency': 'f', 'id': 154, 'synset': 'broccoli.n.01', 'synonyms': ['broccoli'], 'def': 'plant with dense clusters of tight green flower buds', 'name': 'broccoli'}, {'frequency': 'r', 'id': 155, 'synset': 'brooch.n.01', 'synonyms': ['broach'], 'def': 'a decorative pin worn by women', 'name': 'broach'}, {'frequency': 'c', 'id': 156, 'synset': 'broom.n.01', 'synonyms': ['broom'], 'def': 'bundle of straws or twigs attached to a long handle; used for cleaning', 'name': 'broom'}, {'frequency': 'c', 'id': 157, 'synset': 'brownie.n.03', 'synonyms': ['brownie'], 'def': 'square or bar of very rich chocolate cake usually with nuts', 'name': 'brownie'}, {'frequency': 'c', 'id': 158, 'synset': 'brussels_sprouts.n.01', 'synonyms': ['brussels_sprouts'], 'def': 'the small edible cabbage-like buds growing along a stalk', 'name': 'brussels_sprouts'}, {'frequency': 'r', 'id': 159, 'synset': 'bubble_gum.n.01', 'synonyms': ['bubble_gum'], 'def': 'a kind of chewing gum that can be blown into bubbles', 'name': 'bubble_gum'}, {'frequency': 'f', 'id': 160, 'synset': 'bucket.n.01', 'synonyms': ['bucket', 'pail'], 'def': 'a roughly cylindrical vessel that is open at the top', 'name': 'bucket'}, {'frequency': 'r', 'id': 161, 'synset': 'buggy.n.01', 'synonyms': ['horse_buggy'], 'def': 'a small lightweight carriage; drawn by a single horse', 'name': 'horse_buggy'}, {'frequency': 'c', 'id': 162, 'synset': 'bull.n.11', 'synonyms': ['bull'], 'def': 'mature male cow', 'name': 'bull'}, {'frequency': 'r', 'id': 163, 'synset': 'bulldog.n.01', 'synonyms': ['bulldog'], 'def': 'a thickset short-haired dog with a large head and strong undershot lower jaw', 'name': 'bulldog'}, {'frequency': 'r', 'id': 164, 'synset': 'bulldozer.n.01', 'synonyms': ['bulldozer', 'dozer'], 'def': 'large powerful tractor; a large blade in front flattens areas of ground', 'name': 'bulldozer'}, {'frequency': 'c', 'id': 165, 'synset': 'bullet_train.n.01', 'synonyms': ['bullet_train'], 'def': 'a high-speed passenger train', 'name': 'bullet_train'}, {'frequency': 'c', 'id': 166, 'synset': 'bulletin_board.n.02', 'synonyms': ['bulletin_board', 'notice_board'], 'def': 'a board that hangs on a wall; displays announcements', 'name': 'bulletin_board'}, {'frequency': 'r', 'id': 167, 'synset': 'bulletproof_vest.n.01', 'synonyms': ['bulletproof_vest'], 'def': 'a vest capable of resisting the impact of a bullet', 'name': 'bulletproof_vest'}, {'frequency': 'c', 'id': 168, 'synset': 'bullhorn.n.01', 'synonyms': ['bullhorn', 'megaphone'], 'def': 'a portable loudspeaker with built-in microphone and amplifier', 'name': 'bullhorn'}, {'frequency': 'r', 'id': 169, 'synset': 'bully_beef.n.01', 'synonyms': ['corned_beef', 'corn_beef'], 'def': 'beef cured or pickled in brine', 'name': 'corned_beef'}, {'frequency': 'f', 'id': 170, 'synset': 'bun.n.01', 'synonyms': ['bun', 'roll'], 'def': 'small rounded bread either plain or sweet', 'name': 'bun'}, {'frequency': 'c', 'id': 171, 'synset': 'bunk_bed.n.01', 'synonyms': ['bunk_bed'], 'def': 'beds built one above the other', 'name': 'bunk_bed'}, {'frequency': 'f', 'id': 172, 'synset': 'buoy.n.01', 'synonyms': ['buoy'], 'def': 'a float attached by rope to the seabed to mark channels in a harbor or underwater hazards', 'name': 'buoy'}, {'frequency': 'r', 'id': 173, 'synset': 'burrito.n.01', 'synonyms': ['burrito'], 'def': 'a flour tortilla folded around a filling', 'name': 'burrito'}, {'frequency': 'f', 'id': 174, 'synset': 'bus.n.01', 'synonyms': ['bus_(vehicle)', 'autobus', 'charabanc', 'double-decker', 'motorbus', 'motorcoach'], 'def': 'a vehicle carrying many passengers; used for public transport', 'name': 'bus_(vehicle)'}, {'frequency': 'c', 'id': 175, 'synset': 'business_card.n.01', 'synonyms': ['business_card'], 'def': "a card on which are printed the person's name and business affiliation", 'name': 'business_card'}, {'frequency': 'c', 'id': 176, 'synset': 'butcher_knife.n.01', 'synonyms': ['butcher_knife'], 'def': 'a large sharp knife for cutting or trimming meat', 'name': 'butcher_knife'}, {'frequency': 'c', 'id': 177, 'synset': 'butter.n.01', 'synonyms': ['butter'], 'def': 'an edible emulsion of fat globules made by churning milk or cream; for cooking and table use', 'name': 'butter'}, {'frequency': 'c', 'id': 178, 'synset': 'butterfly.n.01', 'synonyms': ['butterfly'], 'def': 'insect typically having a slender body with knobbed antennae and broad colorful wings', 'name': 'butterfly'}, {'frequency': 'f', 'id': 179, 'synset': 'button.n.01', 'synonyms': ['button'], 'def': 'a round fastener sewn to shirts and coats etc to fit through buttonholes', 'name': 'button'}, {'frequency': 'f', 'id': 180, 'synset': 'cab.n.03', 'synonyms': ['cab_(taxi)', 'taxi', 'taxicab'], 'def': 'a car that takes passengers where they want to go in exchange for money', 'name': 'cab_(taxi)'}, {'frequency': 'r', 'id': 181, 'synset': 'cabana.n.01', 'synonyms': ['cabana'], 'def': 'a small tent used as a dressing room beside the sea or a swimming pool', 'name': 'cabana'}, {'frequency': 'r', 'id': 182, 'synset': 'cabin_car.n.01', 'synonyms': ['cabin_car', 'caboose'], 'def': 'a car on a freight train for use of the train crew; usually the last car on the train', 'name': 'cabin_car'}, {'frequency': 'f', 'id': 183, 'synset': 'cabinet.n.01', 'synonyms': ['cabinet'], 'def': 'a piece of furniture resembling a cupboard with doors and shelves and drawers', 'name': 'cabinet'}, {'frequency': 'r', 'id': 184, 'synset': 'cabinet.n.03', 'synonyms': ['locker', 'storage_locker'], 'def': 'a storage compartment for clothes and valuables; usually it has a lock', 'name': 'locker'}, {'frequency': 'f', 'id': 185, 'synset': 'cake.n.03', 'synonyms': ['cake'], 'def': 'baked goods made from or based on a mixture of flour, sugar, eggs, and fat', 'name': 'cake'}, {'frequency': 'c', 'id': 186, 'synset': 'calculator.n.02', 'synonyms': ['calculator'], 'def': 'a small machine that is used for mathematical calculations', 'name': 'calculator'}, {'frequency': 'f', 'id': 187, 'synset': 'calendar.n.02', 'synonyms': ['calendar'], 'def': 'a list or register of events (appointments/social events/court cases, etc)', 'name': 'calendar'}, {'frequency': 'c', 'id': 188, 'synset': 'calf.n.01', 'synonyms': ['calf'], 'def': 'young of domestic cattle', 'name': 'calf'}, {'frequency': 'c', 'id': 189, 'synset': 'camcorder.n.01', 'synonyms': ['camcorder'], 'def': 'a portable television camera and videocassette recorder', 'name': 'camcorder'}, {'frequency': 'c', 'id': 190, 'synset': 'camel.n.01', 'synonyms': ['camel'], 'def': 'cud-chewing mammal used as a draft or saddle animal in desert regions', 'name': 'camel'}, {'frequency': 'f', 'id': 191, 'synset': 'camera.n.01', 'synonyms': ['camera'], 'def': 'equipment for taking photographs', 'name': 'camera'}, {'frequency': 'c', 'id': 192, 'synset': 'camera_lens.n.01', 'synonyms': ['camera_lens'], 'def': 'a lens that focuses the image in a camera', 'name': 'camera_lens'}, {'frequency': 'c', 'id': 193, 'synset': 'camper.n.02', 'synonyms': ['camper_(vehicle)', 'camping_bus', 'motor_home'], 'def': 'a recreational vehicle equipped for camping out while traveling', 'name': 'camper_(vehicle)'}, {'frequency': 'f', 'id': 194, 'synset': 'can.n.01', 'synonyms': ['can', 'tin_can'], 'def': 'airtight sealed metal container for food or drink or paint etc.', 'name': 'can'}, {'frequency': 'c', 'id': 195, 'synset': 'can_opener.n.01', 'synonyms': ['can_opener', 'tin_opener'], 'def': 'a device for cutting cans open', 'name': 'can_opener'}, {'frequency': 'r', 'id': 196, 'synset': 'candelabrum.n.01', 'synonyms': ['candelabrum', 'candelabra'], 'def': 'branched candlestick; ornamental; has several lights', 'name': 'candelabrum'}, {'frequency': 'f', 'id': 197, 'synset': 'candle.n.01', 'synonyms': ['candle', 'candlestick'], 'def': 'stick of wax with a wick in the middle', 'name': 'candle'}, {'frequency': 'f', 'id': 198, 'synset': 'candlestick.n.01', 'synonyms': ['candle_holder'], 'def': 'a holder with sockets for candles', 'name': 'candle_holder'}, {'frequency': 'r', 'id': 199, 'synset': 'candy_bar.n.01', 'synonyms': ['candy_bar'], 'def': 'a candy shaped as a bar', 'name': 'candy_bar'}, {'frequency': 'c', 'id': 200, 'synset': 'candy_cane.n.01', 'synonyms': ['candy_cane'], 'def': 'a hard candy in the shape of a rod (usually with stripes)', 'name': 'candy_cane'}, {'frequency': 'c', 'id': 201, 'synset': 'cane.n.01', 'synonyms': ['walking_cane'], 'def': 'a stick that people can lean on to help them walk', 'name': 'walking_cane'}, {'frequency': 'c', 'id': 202, 'synset': 'canister.n.02', 'synonyms': ['canister', 'cannister'], 'def': 'metal container for storing dry foods such as tea or flour', 'name': 'canister'}, {'frequency': 'r', 'id': 203, 'synset': 'cannon.n.02', 'synonyms': ['cannon'], 'def': 'heavy gun fired from a tank', 'name': 'cannon'}, {'frequency': 'c', 'id': 204, 'synset': 'canoe.n.01', 'synonyms': ['canoe'], 'def': 'small and light boat; pointed at both ends; propelled with a paddle', 'name': 'canoe'}, {'frequency': 'r', 'id': 205, 'synset': 'cantaloup.n.02', 'synonyms': ['cantaloup', 'cantaloupe'], 'def': 'the fruit of a cantaloup vine; small to medium-sized melon with yellowish flesh', 'name': 'cantaloup'}, {'frequency': 'r', 'id': 206, 'synset': 'canteen.n.01', 'synonyms': ['canteen'], 'def': 'a flask for carrying water; used by soldiers or travelers', 'name': 'canteen'}, {'frequency': 'c', 'id': 207, 'synset': 'cap.n.01', 'synonyms': ['cap_(headwear)'], 'def': 'a tight-fitting headwear', 'name': 'cap_(headwear)'}, {'frequency': 'f', 'id': 208, 'synset': 'cap.n.02', 'synonyms': ['bottle_cap', 'cap_(container_lid)'], 'def': 'a top (as for a bottle)', 'name': 'bottle_cap'}, {'frequency': 'r', 'id': 209, 'synset': 'cape.n.02', 'synonyms': ['cape'], 'def': 'a sleeveless garment like a cloak but shorter', 'name': 'cape'}, {'frequency': 'c', 'id': 210, 'synset': 'cappuccino.n.01', 'synonyms': ['cappuccino', 'coffee_cappuccino'], 'def': 'equal parts of espresso and steamed milk', 'name': 'cappuccino'}, {'frequency': 'f', 'id': 211, 'synset': 'car.n.01', 'synonyms': ['car_(automobile)', 'auto_(automobile)', 'automobile'], 'def': 'a motor vehicle with four wheels', 'name': 'car_(automobile)'}, {'frequency': 'f', 'id': 212, 'synset': 'car.n.02', 'synonyms': ['railcar_(part_of_a_train)', 'railway_car_(part_of_a_train)', 'railroad_car_(part_of_a_train)'], 'def': 'a wheeled vehicle adapted to the rails of railroad', 'name': 'railcar_(part_of_a_train)'}, {'frequency': 'r', 'id': 213, 'synset': 'car.n.04', 'synonyms': ['elevator_car'], 'def': 'where passengers ride up and down', 'name': 'elevator_car'}, {'frequency': 'r', 'id': 214, 'synset': 'car_battery.n.01', 'synonyms': ['car_battery', 'automobile_battery'], 'def': 'a battery in a motor vehicle', 'name': 'car_battery'}, {'frequency': 'c', 'id': 215, 'synset': 'card.n.02', 'synonyms': ['identity_card'], 'def': 'a card certifying the identity of the bearer', 'name': 'identity_card'}, {'frequency': 'c', 'id': 216, 'synset': 'card.n.03', 'synonyms': ['card'], 'def': 'a rectangular piece of paper used to send messages (e.g. greetings or pictures)', 'name': 'card'}, {'frequency': 'r', 'id': 217, 'synset': 'cardigan.n.01', 'synonyms': ['cardigan'], 'def': 'knitted jacket that is fastened up the front with buttons or a zipper', 'name': 'cardigan'}, {'frequency': 'r', 'id': 218, 'synset': 'cargo_ship.n.01', 'synonyms': ['cargo_ship', 'cargo_vessel'], 'def': 'a ship designed to carry cargo', 'name': 'cargo_ship'}, {'frequency': 'r', 'id': 219, 'synset': 'carnation.n.01', 'synonyms': ['carnation'], 'def': 'plant with pink to purple-red spice-scented usually double flowers', 'name': 'carnation'}, {'frequency': 'c', 'id': 220, 'synset': 'carriage.n.02', 'synonyms': ['horse_carriage'], 'def': 'a vehicle with wheels drawn by one or more horses', 'name': 'horse_carriage'}, {'frequency': 'f', 'id': 221, 'synset': 'carrot.n.01', 'synonyms': ['carrot'], 'def': 'deep orange edible root of the cultivated carrot plant', 'name': 'carrot'}, {'frequency': 'c', 'id': 222, 'synset': 'carryall.n.01', 'synonyms': ['tote_bag'], 'def': 'a capacious bag or basket', 'name': 'tote_bag'}, {'frequency': 'c', 'id': 223, 'synset': 'cart.n.01', 'synonyms': ['cart'], 'def': 'a heavy open wagon usually having two wheels and drawn by an animal', 'name': 'cart'}, {'frequency': 'c', 'id': 224, 'synset': 'carton.n.02', 'synonyms': ['carton'], 'def': 'a box made of cardboard; opens by flaps on top', 'name': 'carton'}, {'frequency': 'c', 'id': 225, 'synset': 'cash_register.n.01', 'synonyms': ['cash_register', 'register_(for_cash_transactions)'], 'def': 'a cashbox with an adding machine to register transactions', 'name': 'cash_register'}, {'frequency': 'r', 'id': 226, 'synset': 'casserole.n.01', 'synonyms': ['casserole'], 'def': 'food cooked and served in a casserole', 'name': 'casserole'}, {'frequency': 'r', 'id': 227, 'synset': 'cassette.n.01', 'synonyms': ['cassette'], 'def': 'a container that holds a magnetic tape used for recording or playing sound or video', 'name': 'cassette'}, {'frequency': 'c', 'id': 228, 'synset': 'cast.n.05', 'synonyms': ['cast', 'plaster_cast', 'plaster_bandage'], 'def': 'bandage consisting of a firm covering that immobilizes broken bones while they heal', 'name': 'cast'}, {'frequency': 'f', 'id': 229, 'synset': 'cat.n.01', 'synonyms': ['cat'], 'def': 'a domestic house cat', 'name': 'cat'}, {'frequency': 'c', 'id': 230, 'synset': 'cauliflower.n.02', 'synonyms': ['cauliflower'], 'def': 'edible compact head of white undeveloped flowers', 'name': 'cauliflower'}, {'frequency': 'r', 'id': 231, 'synset': 'caviar.n.01', 'synonyms': ['caviar', 'caviare'], 'def': "salted roe of sturgeon or other large fish; usually served as an hors d'oeuvre", 'name': 'caviar'}, {'frequency': 'c', 'id': 232, 'synset': 'cayenne.n.02', 'synonyms': ['cayenne_(spice)', 'cayenne_pepper_(spice)', 'red_pepper_(spice)'], 'def': 'ground pods and seeds of pungent red peppers of the genus Capsicum', 'name': 'cayenne_(spice)'}, {'frequency': 'c', 'id': 233, 'synset': 'cd_player.n.01', 'synonyms': ['CD_player'], 'def': 'electronic equipment for playing compact discs (CDs)', 'name': 'CD_player'}, {'frequency': 'c', 'id': 234, 'synset': 'celery.n.01', 'synonyms': ['celery'], 'def': 'widely cultivated herb with aromatic leaf stalks that are eaten raw or cooked', 'name': 'celery'}, {'frequency': 'f', 'id': 235, 'synset': 'cellular_telephone.n.01', 'synonyms': ['cellular_telephone', 'cellular_phone', 'cellphone', 'mobile_phone', 'smart_phone'], 'def': 'a hand-held mobile telephone', 'name': 'cellular_telephone'}, {'frequency': 'r', 'id': 236, 'synset': 'chain_mail.n.01', 'synonyms': ['chain_mail', 'ring_mail', 'chain_armor', 'chain_armour', 'ring_armor', 'ring_armour'], 'def': '(Middle Ages) flexible armor made of interlinked metal rings', 'name': 'chain_mail'}, {'frequency': 'f', 'id': 237, 'synset': 'chair.n.01', 'synonyms': ['chair'], 'def': 'a seat for one person, with a support for the back', 'name': 'chair'}, {'frequency': 'r', 'id': 238, 'synset': 'chaise_longue.n.01', 'synonyms': ['chaise_longue', 'chaise', 'daybed'], 'def': 'a long chair; for reclining', 'name': 'chaise_longue'}, {'frequency': 'r', 'id': 239, 'synset': 'champagne.n.01', 'synonyms': ['champagne'], 'def': 'a white sparkling wine produced in Champagne or resembling that produced there', 'name': 'champagne'}, {'frequency': 'f', 'id': 240, 'synset': 'chandelier.n.01', 'synonyms': ['chandelier'], 'def': 'branched lighting fixture; often ornate; hangs from the ceiling', 'name': 'chandelier'}, {'frequency': 'r', 'id': 241, 'synset': 'chap.n.04', 'synonyms': ['chap'], 'def': 'leather leggings without a seat; worn over trousers by cowboys to protect their legs', 'name': 'chap'}, {'frequency': 'r', 'id': 242, 'synset': 'checkbook.n.01', 'synonyms': ['checkbook', 'chequebook'], 'def': 'a book issued to holders of checking accounts', 'name': 'checkbook'}, {'frequency': 'r', 'id': 243, 'synset': 'checkerboard.n.01', 'synonyms': ['checkerboard'], 'def': 'a board having 64 squares of two alternating colors', 'name': 'checkerboard'}, {'frequency': 'c', 'id': 244, 'synset': 'cherry.n.03', 'synonyms': ['cherry'], 'def': 'a red fruit with a single hard stone', 'name': 'cherry'}, {'frequency': 'r', 'id': 245, 'synset': 'chessboard.n.01', 'synonyms': ['chessboard'], 'def': 'a checkerboard used to play chess', 'name': 'chessboard'}, {'frequency': 'r', 'id': 246, 'synset': 'chest_of_drawers.n.01', 'synonyms': ['chest_of_drawers_(furniture)', 'bureau_(furniture)', 'chest_(furniture)'], 'def': 'furniture with drawers for keeping clothes', 'name': 'chest_of_drawers_(furniture)'}, {'frequency': 'c', 'id': 247, 'synset': 'chicken.n.02', 'synonyms': ['chicken_(animal)'], 'def': 'a domestic fowl bred for flesh or eggs', 'name': 'chicken_(animal)'}, {'frequency': 'c', 'id': 248, 'synset': 'chicken_wire.n.01', 'synonyms': ['chicken_wire'], 'def': 'a galvanized wire network with a hexagonal mesh; used to build fences', 'name': 'chicken_wire'}, {'frequency': 'r', 'id': 249, 'synset': 'chickpea.n.01', 'synonyms': ['chickpea', 'garbanzo'], 'def': 'the seed of the chickpea plant; usually dried', 'name': 'chickpea'}, {'frequency': 'r', 'id': 250, 'synset': 'chihuahua.n.03', 'synonyms': ['Chihuahua'], 'def': 'an old breed of tiny short-haired dog with protruding eyes from Mexico', 'name': 'Chihuahua'}, {'frequency': 'r', 'id': 251, 'synset': 'chili.n.02', 'synonyms': ['chili_(vegetable)', 'chili_pepper_(vegetable)', 'chilli_(vegetable)', 'chilly_(vegetable)', 'chile_(vegetable)'], 'def': 'very hot and finely tapering pepper of special pungency', 'name': 'chili_(vegetable)'}, {'frequency': 'r', 'id': 252, 'synset': 'chime.n.01', 'synonyms': ['chime', 'gong'], 'def': 'an instrument consisting of a set of bells that are struck with a hammer', 'name': 'chime'}, {'frequency': 'r', 'id': 253, 'synset': 'chinaware.n.01', 'synonyms': ['chinaware'], 'def': 'dishware made of high quality porcelain', 'name': 'chinaware'}, {'frequency': 'c', 'id': 254, 'synset': 'chip.n.04', 'synonyms': ['crisp_(potato_chip)', 'potato_chip'], 'def': 'a thin crisp slice of potato fried in deep fat', 'name': 'crisp_(potato_chip)'}, {'frequency': 'r', 'id': 255, 'synset': 'chip.n.06', 'synonyms': ['poker_chip'], 'def': 'a small disk-shaped counter used to represent money when gambling', 'name': 'poker_chip'}, {'frequency': 'c', 'id': 256, 'synset': 'chocolate_bar.n.01', 'synonyms': ['chocolate_bar'], 'def': 'a bar of chocolate candy', 'name': 'chocolate_bar'}, {'frequency': 'c', 'id': 257, 'synset': 'chocolate_cake.n.01', 'synonyms': ['chocolate_cake'], 'def': 'cake containing chocolate', 'name': 'chocolate_cake'}, {'frequency': 'r', 'id': 258, 'synset': 'chocolate_milk.n.01', 'synonyms': ['chocolate_milk'], 'def': 'milk flavored with chocolate syrup', 'name': 'chocolate_milk'}, {'frequency': 'r', 'id': 259, 'synset': 'chocolate_mousse.n.01', 'synonyms': ['chocolate_mousse'], 'def': 'dessert mousse made with chocolate', 'name': 'chocolate_mousse'}, {'frequency': 'f', 'id': 260, 'synset': 'choker.n.03', 'synonyms': ['choker', 'collar', 'neckband'], 'def': 'necklace that fits tightly around the neck', 'name': 'choker'}, {'frequency': 'f', 'id': 261, 'synset': 'chopping_board.n.01', 'synonyms': ['chopping_board', 'cutting_board', 'chopping_block'], 'def': 'a wooden board where meats or vegetables can be cut', 'name': 'chopping_board'}, {'frequency': 'c', 'id': 262, 'synset': 'chopstick.n.01', 'synonyms': ['chopstick'], 'def': 'one of a pair of slender sticks used as oriental tableware to eat food with', 'name': 'chopstick'}, {'frequency': 'f', 'id': 263, 'synset': 'christmas_tree.n.05', 'synonyms': ['Christmas_tree'], 'def': 'an ornamented evergreen used as a Christmas decoration', 'name': 'Christmas_tree'}, {'frequency': 'c', 'id': 264, 'synset': 'chute.n.02', 'synonyms': ['slide'], 'def': 'sloping channel through which things can descend', 'name': 'slide'}, {'frequency': 'r', 'id': 265, 'synset': 'cider.n.01', 'synonyms': ['cider', 'cyder'], 'def': 'a beverage made from juice pressed from apples', 'name': 'cider'}, {'frequency': 'r', 'id': 266, 'synset': 'cigar_box.n.01', 'synonyms': ['cigar_box'], 'def': 'a box for holding cigars', 'name': 'cigar_box'}, {'frequency': 'c', 'id': 267, 'synset': 'cigarette.n.01', 'synonyms': ['cigarette'], 'def': 'finely ground tobacco wrapped in paper; for smoking', 'name': 'cigarette'}, {'frequency': 'c', 'id': 268, 'synset': 'cigarette_case.n.01', 'synonyms': ['cigarette_case', 'cigarette_pack'], 'def': 'a small flat case for holding cigarettes', 'name': 'cigarette_case'}, {'frequency': 'f', 'id': 269, 'synset': 'cistern.n.02', 'synonyms': ['cistern', 'water_tank'], 'def': 'a tank that holds the water used to flush a toilet', 'name': 'cistern'}, {'frequency': 'r', 'id': 270, 'synset': 'clarinet.n.01', 'synonyms': ['clarinet'], 'def': 'a single-reed instrument with a straight tube', 'name': 'clarinet'}, {'frequency': 'r', 'id': 271, 'synset': 'clasp.n.01', 'synonyms': ['clasp'], 'def': 'a fastener (as a buckle or hook) that is used to hold two things together', 'name': 'clasp'}, {'frequency': 'c', 'id': 272, 'synset': 'cleansing_agent.n.01', 'synonyms': ['cleansing_agent', 'cleanser', 'cleaner'], 'def': 'a preparation used in cleaning something', 'name': 'cleansing_agent'}, {'frequency': 'r', 'id': 273, 'synset': 'clementine.n.01', 'synonyms': ['clementine'], 'def': 'a variety of mandarin orange', 'name': 'clementine'}, {'frequency': 'c', 'id': 274, 'synset': 'clip.n.03', 'synonyms': ['clip'], 'def': 'any of various small fasteners used to hold loose articles together', 'name': 'clip'}, {'frequency': 'c', 'id': 275, 'synset': 'clipboard.n.01', 'synonyms': ['clipboard'], 'def': 'a small writing board with a clip at the top for holding papers', 'name': 'clipboard'}, {'frequency': 'f', 'id': 276, 'synset': 'clock.n.01', 'synonyms': ['clock', 'timepiece', 'timekeeper'], 'def': 'a timepiece that shows the time of day', 'name': 'clock'}, {'frequency': 'f', 'id': 277, 'synset': 'clock_tower.n.01', 'synonyms': ['clock_tower'], 'def': 'a tower with a large clock visible high up on an outside face', 'name': 'clock_tower'}, {'frequency': 'c', 'id': 278, 'synset': 'clothes_hamper.n.01', 'synonyms': ['clothes_hamper', 'laundry_basket', 'clothes_basket'], 'def': 'a hamper that holds dirty clothes to be washed or wet clothes to be dried', 'name': 'clothes_hamper'}, {'frequency': 'c', 'id': 279, 'synset': 'clothespin.n.01', 'synonyms': ['clothespin', 'clothes_peg'], 'def': 'wood or plastic fastener; for holding clothes on a clothesline', 'name': 'clothespin'}, {'frequency': 'r', 'id': 280, 'synset': 'clutch_bag.n.01', 'synonyms': ['clutch_bag'], 'def': "a woman's strapless purse that is carried in the hand", 'name': 'clutch_bag'}, {'frequency': 'f', 'id': 281, 'synset': 'coaster.n.03', 'synonyms': ['coaster'], 'def': 'a covering (plate or mat) that protects the surface of a table', 'name': 'coaster'}, {'frequency': 'f', 'id': 282, 'synset': 'coat.n.01', 'synonyms': ['coat'], 'def': 'an outer garment that has sleeves and covers the body from shoulder down', 'name': 'coat'}, {'frequency': 'c', 'id': 283, 'synset': 'coat_hanger.n.01', 'synonyms': ['coat_hanger', 'clothes_hanger', 'dress_hanger'], 'def': "a hanger that is shaped like a person's shoulders", 'name': 'coat_hanger'}, {'frequency': 'r', 'id': 284, 'synset': 'coatrack.n.01', 'synonyms': ['coatrack', 'hatrack'], 'def': 'a rack with hooks for temporarily holding coats and hats', 'name': 'coatrack'}, {'frequency': 'c', 'id': 285, 'synset': 'cock.n.04', 'synonyms': ['cock', 'rooster'], 'def': 'adult male chicken', 'name': 'cock'}, {'frequency': 'c', 'id': 286, 'synset': 'coconut.n.02', 'synonyms': ['coconut', 'cocoanut'], 'def': 'large hard-shelled brown oval nut with a fibrous husk', 'name': 'coconut'}, {'frequency': 'r', 'id': 287, 'synset': 'coffee_filter.n.01', 'synonyms': ['coffee_filter'], 'def': 'filter (usually of paper) that passes the coffee and retains the coffee grounds', 'name': 'coffee_filter'}, {'frequency': 'f', 'id': 288, 'synset': 'coffee_maker.n.01', 'synonyms': ['coffee_maker', 'coffee_machine'], 'def': 'a kitchen appliance for brewing coffee automatically', 'name': 'coffee_maker'}, {'frequency': 'f', 'id': 289, 'synset': 'coffee_table.n.01', 'synonyms': ['coffee_table', 'cocktail_table'], 'def': 'low table where magazines can be placed and coffee or cocktails are served', 'name': 'coffee_table'}, {'frequency': 'c', 'id': 290, 'synset': 'coffeepot.n.01', 'synonyms': ['coffeepot'], 'def': 'tall pot in which coffee is brewed', 'name': 'coffeepot'}, {'frequency': 'r', 'id': 291, 'synset': 'coil.n.05', 'synonyms': ['coil'], 'def': 'tubing that is wound in a spiral', 'name': 'coil'}, {'frequency': 'c', 'id': 292, 'synset': 'coin.n.01', 'synonyms': ['coin'], 'def': 'a flat metal piece (usually a disc) used as money', 'name': 'coin'}, {'frequency': 'r', 'id': 293, 'synset': 'colander.n.01', 'synonyms': ['colander', 'cullender'], 'def': 'bowl-shaped strainer; used to wash or drain foods', 'name': 'colander'}, {'frequency': 'c', 'id': 294, 'synset': 'coleslaw.n.01', 'synonyms': ['coleslaw', 'slaw'], 'def': 'basically shredded cabbage', 'name': 'coleslaw'}, {'frequency': 'r', 'id': 295, 'synset': 'coloring_material.n.01', 'synonyms': ['coloring_material', 'colouring_material'], 'def': 'any material used for its color', 'name': 'coloring_material'}, {'frequency': 'r', 'id': 296, 'synset': 'combination_lock.n.01', 'synonyms': ['combination_lock'], 'def': 'lock that can be opened only by turning dials in a special sequence', 'name': 'combination_lock'}, {'frequency': 'c', 'id': 297, 'synset': 'comforter.n.04', 'synonyms': ['pacifier', 'teething_ring'], 'def': 'device used for an infant to suck or bite on', 'name': 'pacifier'}, {'frequency': 'r', 'id': 298, 'synset': 'comic_book.n.01', 'synonyms': ['comic_book'], 'def': 'a magazine devoted to comic strips', 'name': 'comic_book'}, {'frequency': 'f', 'id': 299, 'synset': 'computer_keyboard.n.01', 'synonyms': ['computer_keyboard', 'keyboard_(computer)'], 'def': 'a keyboard that is a data input device for computers', 'name': 'computer_keyboard'}, {'frequency': 'r', 'id': 300, 'synset': 'concrete_mixer.n.01', 'synonyms': ['concrete_mixer', 'cement_mixer'], 'def': 'a machine with a large revolving drum in which cement/concrete is mixed', 'name': 'concrete_mixer'}, {'frequency': 'f', 'id': 301, 'synset': 'cone.n.01', 'synonyms': ['cone', 'traffic_cone'], 'def': 'a cone-shaped object used to direct traffic', 'name': 'cone'}, {'frequency': 'f', 'id': 302, 'synset': 'control.n.09', 'synonyms': ['control', 'controller'], 'def': 'a mechanism that controls the operation of a machine', 'name': 'control'}, {'frequency': 'r', 'id': 303, 'synset': 'convertible.n.01', 'synonyms': ['convertible_(automobile)'], 'def': 'a car that has top that can be folded or removed', 'name': 'convertible_(automobile)'}, {'frequency': 'r', 'id': 304, 'synset': 'convertible.n.03', 'synonyms': ['sofa_bed'], 'def': 'a sofa that can be converted into a bed', 'name': 'sofa_bed'}, {'frequency': 'c', 'id': 305, 'synset': 'cookie.n.01', 'synonyms': ['cookie', 'cooky', 'biscuit_(cookie)'], 'def': "any of various small flat sweet cakes (`biscuit' is the British term)", 'name': 'cookie'}, {'frequency': 'r', 'id': 306, 'synset': 'cookie_jar.n.01', 'synonyms': ['cookie_jar', 'cooky_jar'], 'def': 'a jar in which cookies are kept (and sometimes money is hidden)', 'name': 'cookie_jar'}, {'frequency': 'r', 'id': 307, 'synset': 'cooking_utensil.n.01', 'synonyms': ['cooking_utensil'], 'def': 'a kitchen utensil made of material that does not melt easily; used for cooking', 'name': 'cooking_utensil'}, {'frequency': 'f', 'id': 308, 'synset': 'cooler.n.01', 'synonyms': ['cooler_(for_food)', 'ice_chest'], 'def': 'an insulated box for storing food often with ice', 'name': 'cooler_(for_food)'}, {'frequency': 'c', 'id': 309, 'synset': 'cork.n.04', 'synonyms': ['cork_(bottle_plug)', 'bottle_cork'], 'def': 'the plug in the mouth of a bottle (especially a wine bottle)', 'name': 'cork_(bottle_plug)'}, {'frequency': 'r', 'id': 310, 'synset': 'corkboard.n.01', 'synonyms': ['corkboard'], 'def': 'a sheet consisting of cork granules', 'name': 'corkboard'}, {'frequency': 'r', 'id': 311, 'synset': 'corkscrew.n.01', 'synonyms': ['corkscrew', 'bottle_screw'], 'def': 'a bottle opener that pulls corks', 'name': 'corkscrew'}, {'frequency': 'c', 'id': 312, 'synset': 'corn.n.03', 'synonyms': ['edible_corn', 'corn', 'maize'], 'def': 'ears of corn that can be prepared and served for human food', 'name': 'edible_corn'}, {'frequency': 'r', 'id': 313, 'synset': 'cornbread.n.01', 'synonyms': ['cornbread'], 'def': 'bread made primarily of cornmeal', 'name': 'cornbread'}, {'frequency': 'c', 'id': 314, 'synset': 'cornet.n.01', 'synonyms': ['cornet', 'horn', 'trumpet'], 'def': 'a brass musical instrument with a narrow tube and a flared bell and many valves', 'name': 'cornet'}, {'frequency': 'c', 'id': 315, 'synset': 'cornice.n.01', 'synonyms': ['cornice', 'valance', 'valance_board', 'pelmet'], 'def': 'a decorative framework to conceal curtain fixtures at the top of a window casing', 'name': 'cornice'}, {'frequency': 'r', 'id': 316, 'synset': 'cornmeal.n.01', 'synonyms': ['cornmeal'], 'def': 'coarsely ground corn', 'name': 'cornmeal'}, {'frequency': 'r', 'id': 317, 'synset': 'corset.n.01', 'synonyms': ['corset', 'girdle'], 'def': "a woman's close-fitting foundation garment", 'name': 'corset'}, {'frequency': 'r', 'id': 318, 'synset': 'cos.n.02', 'synonyms': ['romaine_lettuce'], 'def': 'lettuce with long dark-green leaves in a loosely packed elongated head', 'name': 'romaine_lettuce'}, {'frequency': 'c', 'id': 319, 'synset': 'costume.n.04', 'synonyms': ['costume'], 'def': 'the attire characteristic of a country or a time or a social class', 'name': 'costume'}, {'frequency': 'r', 'id': 320, 'synset': 'cougar.n.01', 'synonyms': ['cougar', 'puma', 'catamount', 'mountain_lion', 'panther'], 'def': 'large American feline resembling a lion', 'name': 'cougar'}, {'frequency': 'r', 'id': 321, 'synset': 'coverall.n.01', 'synonyms': ['coverall'], 'def': 'a loose-fitting protective garment that is worn over other clothing', 'name': 'coverall'}, {'frequency': 'r', 'id': 322, 'synset': 'cowbell.n.01', 'synonyms': ['cowbell'], 'def': 'a bell hung around the neck of cow so that the cow can be easily located', 'name': 'cowbell'}, {'frequency': 'f', 'id': 323, 'synset': 'cowboy_hat.n.01', 'synonyms': ['cowboy_hat', 'ten-gallon_hat'], 'def': 'a hat with a wide brim and a soft crown; worn by American ranch hands', 'name': 'cowboy_hat'}, {'frequency': 'r', 'id': 324, 'synset': 'crab.n.01', 'synonyms': ['crab_(animal)'], 'def': 'decapod having eyes on short stalks and a broad flattened shell and pincers', 'name': 'crab_(animal)'}, {'frequency': 'c', 'id': 325, 'synset': 'cracker.n.01', 'synonyms': ['cracker'], 'def': 'a thin crisp wafer', 'name': 'cracker'}, {'frequency': 'r', 'id': 326, 'synset': 'crape.n.01', 'synonyms': ['crape', 'crepe', 'French_pancake'], 'def': 'small very thin pancake', 'name': 'crape'}, {'frequency': 'f', 'id': 327, 'synset': 'crate.n.01', 'synonyms': ['crate'], 'def': 'a rugged box (usually made of wood); used for shipping', 'name': 'crate'}, {'frequency': 'r', 'id': 328, 'synset': 'crayon.n.01', 'synonyms': ['crayon', 'wax_crayon'], 'def': 'writing or drawing implement made of a colored stick of composition wax', 'name': 'crayon'}, {'frequency': 'r', 'id': 329, 'synset': 'cream_pitcher.n.01', 'synonyms': ['cream_pitcher'], 'def': 'a small pitcher for serving cream', 'name': 'cream_pitcher'}, {'frequency': 'r', 'id': 330, 'synset': 'credit_card.n.01', 'synonyms': ['credit_card', 'charge_card', 'debit_card'], 'def': 'a card, usually plastic, used to pay for goods and services', 'name': 'credit_card'}, {'frequency': 'c', 'id': 331, 'synset': 'crescent_roll.n.01', 'synonyms': ['crescent_roll', 'croissant'], 'def': 'very rich flaky crescent-shaped roll', 'name': 'crescent_roll'}, {'frequency': 'c', 'id': 332, 'synset': 'crib.n.01', 'synonyms': ['crib', 'cot'], 'def': 'baby bed with high sides made of slats', 'name': 'crib'}, {'frequency': 'c', 'id': 333, 'synset': 'crock.n.03', 'synonyms': ['crock_pot', 'earthenware_jar'], 'def': 'an earthen jar (made of baked clay)', 'name': 'crock_pot'}, {'frequency': 'f', 'id': 334, 'synset': 'crossbar.n.01', 'synonyms': ['crossbar'], 'def': 'a horizontal bar that goes across something', 'name': 'crossbar'}, {'frequency': 'r', 'id': 335, 'synset': 'crouton.n.01', 'synonyms': ['crouton'], 'def': 'a small piece of toasted or fried bread; served in soup or salads', 'name': 'crouton'}, {'frequency': 'r', 'id': 336, 'synset': 'crow.n.01', 'synonyms': ['crow'], 'def': 'black birds having a raucous call', 'name': 'crow'}, {'frequency': 'c', 'id': 337, 'synset': 'crown.n.04', 'synonyms': ['crown'], 'def': 'an ornamental jeweled headdress signifying sovereignty', 'name': 'crown'}, {'frequency': 'c', 'id': 338, 'synset': 'crucifix.n.01', 'synonyms': ['crucifix'], 'def': 'representation of the cross on which Jesus died', 'name': 'crucifix'}, {'frequency': 'c', 'id': 339, 'synset': 'cruise_ship.n.01', 'synonyms': ['cruise_ship', 'cruise_liner'], 'def': 'a passenger ship used commercially for pleasure cruises', 'name': 'cruise_ship'}, {'frequency': 'c', 'id': 340, 'synset': 'cruiser.n.01', 'synonyms': ['police_cruiser', 'patrol_car', 'police_car', 'squad_car'], 'def': 'a car in which policemen cruise the streets', 'name': 'police_cruiser'}, {'frequency': 'c', 'id': 341, 'synset': 'crumb.n.03', 'synonyms': ['crumb'], 'def': 'small piece of e.g. bread or cake', 'name': 'crumb'}, {'frequency': 'r', 'id': 342, 'synset': 'crutch.n.01', 'synonyms': ['crutch'], 'def': 'a wooden or metal staff that fits under the armpit and reaches to the ground', 'name': 'crutch'}, {'frequency': 'c', 'id': 343, 'synset': 'cub.n.03', 'synonyms': ['cub_(animal)'], 'def': 'the young of certain carnivorous mammals such as the bear or wolf or lion', 'name': 'cub_(animal)'}, {'frequency': 'r', 'id': 344, 'synset': 'cube.n.05', 'synonyms': ['cube', 'square_block'], 'def': 'a block in the (approximate) shape of a cube', 'name': 'cube'}, {'frequency': 'f', 'id': 345, 'synset': 'cucumber.n.02', 'synonyms': ['cucumber', 'cuke'], 'def': 'cylindrical green fruit with thin green rind and white flesh eaten as a vegetable', 'name': 'cucumber'}, {'frequency': 'c', 'id': 346, 'synset': 'cufflink.n.01', 'synonyms': ['cufflink'], 'def': 'jewelry consisting of linked buttons used to fasten the cuffs of a shirt', 'name': 'cufflink'}, {'frequency': 'f', 'id': 347, 'synset': 'cup.n.01', 'synonyms': ['cup'], 'def': 'a small open container usually used for drinking; usually has a handle', 'name': 'cup'}, {'frequency': 'c', 'id': 348, 'synset': 'cup.n.08', 'synonyms': ['trophy_cup'], 'def': 'a metal vessel with handles that is awarded as a trophy to a competition winner', 'name': 'trophy_cup'}, {'frequency': 'c', 'id': 349, 'synset': 'cupcake.n.01', 'synonyms': ['cupcake'], 'def': 'small cake baked in a muffin tin', 'name': 'cupcake'}, {'frequency': 'r', 'id': 350, 'synset': 'curler.n.01', 'synonyms': ['hair_curler', 'hair_roller', 'hair_crimper'], 'def': 'a cylindrical tube around which the hair is wound to curl it', 'name': 'hair_curler'}, {'frequency': 'r', 'id': 351, 'synset': 'curling_iron.n.01', 'synonyms': ['curling_iron'], 'def': 'a cylindrical home appliance that heats hair that has been curled around it', 'name': 'curling_iron'}, {'frequency': 'f', 'id': 352, 'synset': 'curtain.n.01', 'synonyms': ['curtain', 'drapery'], 'def': 'hanging cloth used as a blind (especially for a window)', 'name': 'curtain'}, {'frequency': 'f', 'id': 353, 'synset': 'cushion.n.03', 'synonyms': ['cushion'], 'def': 'a soft bag filled with air or padding such as feathers or foam rubber', 'name': 'cushion'}, {'frequency': 'r', 'id': 354, 'synset': 'custard.n.01', 'synonyms': ['custard'], 'def': 'sweetened mixture of milk and eggs baked or boiled or frozen', 'name': 'custard'}, {'frequency': 'c', 'id': 355, 'synset': 'cutter.n.06', 'synonyms': ['cutting_tool'], 'def': 'a cutting implement; a tool for cutting', 'name': 'cutting_tool'}, {'frequency': 'r', 'id': 356, 'synset': 'cylinder.n.04', 'synonyms': ['cylinder'], 'def': 'a cylindrical container', 'name': 'cylinder'}, {'frequency': 'r', 'id': 357, 'synset': 'cymbal.n.01', 'synonyms': ['cymbal'], 'def': 'a percussion instrument consisting of a concave brass disk', 'name': 'cymbal'}, {'frequency': 'r', 'id': 358, 'synset': 'dachshund.n.01', 'synonyms': ['dachshund', 'dachsie', 'badger_dog'], 'def': 'small long-bodied short-legged breed of dog having a short sleek coat and long drooping ears', 'name': 'dachshund'}, {'frequency': 'r', 'id': 359, 'synset': 'dagger.n.01', 'synonyms': ['dagger'], 'def': 'a short knife with a pointed blade used for piercing or stabbing', 'name': 'dagger'}, {'frequency': 'r', 'id': 360, 'synset': 'dartboard.n.01', 'synonyms': ['dartboard'], 'def': 'a circular board of wood or cork used as the target in the game of darts', 'name': 'dartboard'}, {'frequency': 'r', 'id': 361, 'synset': 'date.n.08', 'synonyms': ['date_(fruit)'], 'def': 'sweet edible fruit of the date palm with a single long woody seed', 'name': 'date_(fruit)'}, {'frequency': 'f', 'id': 362, 'synset': 'deck_chair.n.01', 'synonyms': ['deck_chair', 'beach_chair'], 'def': 'a folding chair for use outdoors; a wooden frame supports a length of canvas', 'name': 'deck_chair'}, {'frequency': 'c', 'id': 363, 'synset': 'deer.n.01', 'synonyms': ['deer', 'cervid'], 'def': "distinguished from Bovidae by the male's having solid deciduous antlers", 'name': 'deer'}, {'frequency': 'c', 'id': 364, 'synset': 'dental_floss.n.01', 'synonyms': ['dental_floss', 'floss'], 'def': 'a soft thread for cleaning the spaces between the teeth', 'name': 'dental_floss'}, {'frequency': 'f', 'id': 365, 'synset': 'desk.n.01', 'synonyms': ['desk'], 'def': 'a piece of furniture with a writing surface and usually drawers or other compartments', 'name': 'desk'}, {'frequency': 'r', 'id': 366, 'synset': 'detergent.n.01', 'synonyms': ['detergent'], 'def': 'a surface-active chemical widely used in industry and laundering', 'name': 'detergent'}, {'frequency': 'c', 'id': 367, 'synset': 'diaper.n.01', 'synonyms': ['diaper'], 'def': 'garment consisting of a folded cloth drawn up between the legs and fastened at the waist', 'name': 'diaper'}, {'frequency': 'r', 'id': 368, 'synset': 'diary.n.01', 'synonyms': ['diary', 'journal'], 'def': 'a daily written record of (usually personal) experiences and observations', 'name': 'diary'}, {'frequency': 'r', 'id': 369, 'synset': 'die.n.01', 'synonyms': ['die', 'dice'], 'def': 'a small cube with 1 to 6 spots on the six faces; used in gambling', 'name': 'die'}, {'frequency': 'r', 'id': 370, 'synset': 'dinghy.n.01', 'synonyms': ['dinghy', 'dory', 'rowboat'], 'def': 'a small boat of shallow draft with seats and oars with which it is propelled', 'name': 'dinghy'}, {'frequency': 'f', 'id': 371, 'synset': 'dining_table.n.01', 'synonyms': ['dining_table'], 'def': 'a table at which meals are served', 'name': 'dining_table'}, {'frequency': 'r', 'id': 372, 'synset': 'dinner_jacket.n.01', 'synonyms': ['tux', 'tuxedo'], 'def': 'semiformal evening dress for men', 'name': 'tux'}, {'frequency': 'c', 'id': 373, 'synset': 'dish.n.01', 'synonyms': ['dish'], 'def': 'a piece of dishware normally used as a container for holding or serving food', 'name': 'dish'}, {'frequency': 'c', 'id': 374, 'synset': 'dish.n.05', 'synonyms': ['dish_antenna'], 'def': 'directional antenna consisting of a parabolic reflector', 'name': 'dish_antenna'}, {'frequency': 'c', 'id': 375, 'synset': 'dishrag.n.01', 'synonyms': ['dishrag', 'dishcloth'], 'def': 'a cloth for washing dishes', 'name': 'dishrag'}, {'frequency': 'c', 'id': 376, 'synset': 'dishtowel.n.01', 'synonyms': ['dishtowel', 'tea_towel'], 'def': 'a towel for drying dishes', 'name': 'dishtowel'}, {'frequency': 'f', 'id': 377, 'synset': 'dishwasher.n.01', 'synonyms': ['dishwasher', 'dishwashing_machine'], 'def': 'a machine for washing dishes', 'name': 'dishwasher'}, {'frequency': 'r', 'id': 378, 'synset': 'dishwasher_detergent.n.01', 'synonyms': ['dishwasher_detergent', 'dishwashing_detergent', 'dishwashing_liquid'], 'def': 'a low-sudsing detergent designed for use in dishwashers', 'name': 'dishwasher_detergent'}, {'frequency': 'r', 'id': 379, 'synset': 'diskette.n.01', 'synonyms': ['diskette', 'floppy', 'floppy_disk'], 'def': 'a small plastic magnetic disk enclosed in a stiff envelope used to store data', 'name': 'diskette'}, {'frequency': 'c', 'id': 380, 'synset': 'dispenser.n.01', 'synonyms': ['dispenser'], 'def': 'a container so designed that the contents can be used in prescribed amounts', 'name': 'dispenser'}, {'frequency': 'c', 'id': 381, 'synset': 'dixie_cup.n.01', 'synonyms': ['Dixie_cup', 'paper_cup'], 'def': 'a disposable cup made of paper; for holding drinks', 'name': 'Dixie_cup'}, {'frequency': 'f', 'id': 382, 'synset': 'dog.n.01', 'synonyms': ['dog'], 'def': 'a common domesticated dog', 'name': 'dog'}, {'frequency': 'f', 'id': 383, 'synset': 'dog_collar.n.01', 'synonyms': ['dog_collar'], 'def': 'a collar for a dog', 'name': 'dog_collar'}, {'frequency': 'c', 'id': 384, 'synset': 'doll.n.01', 'synonyms': ['doll'], 'def': 'a toy replica of a HUMAN (NOT AN ANIMAL)', 'name': 'doll'}, {'frequency': 'r', 'id': 385, 'synset': 'dollar.n.02', 'synonyms': ['dollar', 'dollar_bill', 'one_dollar_bill'], 'def': 'a piece of paper money worth one dollar', 'name': 'dollar'}, {'frequency': 'r', 'id': 386, 'synset': 'dolphin.n.02', 'synonyms': ['dolphin'], 'def': 'any of various small toothed whales with a beaklike snout; larger than porpoises', 'name': 'dolphin'}, {'frequency': 'c', 'id': 387, 'synset': 'domestic_ass.n.01', 'synonyms': ['domestic_ass', 'donkey'], 'def': 'domestic beast of burden descended from the African wild ass; patient but stubborn', 'name': 'domestic_ass'}, {'frequency': 'r', 'id': 388, 'synset': 'domino.n.03', 'synonyms': ['eye_mask'], 'def': 'a mask covering the upper part of the face but with holes for the eyes', 'name': 'eye_mask'}, {'frequency': 'r', 'id': 389, 'synset': 'doorbell.n.01', 'synonyms': ['doorbell', 'buzzer'], 'def': 'a button at an outer door that gives a ringing or buzzing signal when pushed', 'name': 'doorbell'}, {'frequency': 'f', 'id': 390, 'synset': 'doorknob.n.01', 'synonyms': ['doorknob', 'doorhandle'], 'def': "a knob used to open a door (often called `doorhandle' in Great Britain)", 'name': 'doorknob'}, {'frequency': 'c', 'id': 391, 'synset': 'doormat.n.02', 'synonyms': ['doormat', 'welcome_mat'], 'def': 'a mat placed outside an exterior door for wiping the shoes before entering', 'name': 'doormat'}, {'frequency': 'f', 'id': 392, 'synset': 'doughnut.n.02', 'synonyms': ['doughnut', 'donut'], 'def': 'a small ring-shaped friedcake', 'name': 'doughnut'}, {'frequency': 'r', 'id': 393, 'synset': 'dove.n.01', 'synonyms': ['dove'], 'def': 'any of numerous small pigeons', 'name': 'dove'}, {'frequency': 'r', 'id': 394, 'synset': 'dragonfly.n.01', 'synonyms': ['dragonfly'], 'def': 'slender-bodied non-stinging insect having iridescent wings that are outspread at rest', 'name': 'dragonfly'}, {'frequency': 'f', 'id': 395, 'synset': 'drawer.n.01', 'synonyms': ['drawer'], 'def': 'a boxlike container in a piece of furniture; made so as to slide in and out', 'name': 'drawer'}, {'frequency': 'c', 'id': 396, 'synset': 'drawers.n.01', 'synonyms': ['underdrawers', 'boxers', 'boxershorts'], 'def': 'underpants worn by men', 'name': 'underdrawers'}, {'frequency': 'f', 'id': 397, 'synset': 'dress.n.01', 'synonyms': ['dress', 'frock'], 'def': 'a one-piece garment for a woman; has skirt and bodice', 'name': 'dress'}, {'frequency': 'c', 'id': 398, 'synset': 'dress_hat.n.01', 'synonyms': ['dress_hat', 'high_hat', 'opera_hat', 'silk_hat', 'top_hat'], 'def': "a man's hat with a tall crown; usually covered with silk or with beaver fur", 'name': 'dress_hat'}, {'frequency': 'c', 'id': 399, 'synset': 'dress_suit.n.01', 'synonyms': ['dress_suit'], 'def': 'formalwear consisting of full evening dress for men', 'name': 'dress_suit'}, {'frequency': 'c', 'id': 400, 'synset': 'dresser.n.05', 'synonyms': ['dresser'], 'def': 'a cabinet with shelves', 'name': 'dresser'}, {'frequency': 'c', 'id': 401, 'synset': 'drill.n.01', 'synonyms': ['drill'], 'def': 'a tool with a sharp rotating point for making holes in hard materials', 'name': 'drill'}, {'frequency': 'r', 'id': 402, 'synset': 'drinking_fountain.n.01', 'synonyms': ['drinking_fountain'], 'def': 'a public fountain to provide a jet of drinking water', 'name': 'drinking_fountain'}, {'frequency': 'r', 'id': 403, 'synset': 'drone.n.04', 'synonyms': ['drone'], 'def': 'an aircraft without a pilot that is operated by remote control', 'name': 'drone'}, {'frequency': 'r', 'id': 404, 'synset': 'dropper.n.01', 'synonyms': ['dropper', 'eye_dropper'], 'def': 'pipet consisting of a small tube with a vacuum bulb at one end for drawing liquid in and releasing it a drop at a time', 'name': 'dropper'}, {'frequency': 'c', 'id': 405, 'synset': 'drum.n.01', 'synonyms': ['drum_(musical_instrument)'], 'def': 'a musical percussion instrument; usually consists of a hollow cylinder with a membrane stretched across each end', 'name': 'drum_(musical_instrument)'}, {'frequency': 'r', 'id': 406, 'synset': 'drumstick.n.02', 'synonyms': ['drumstick'], 'def': 'a stick used for playing a drum', 'name': 'drumstick'}, {'frequency': 'f', 'id': 407, 'synset': 'duck.n.01', 'synonyms': ['duck'], 'def': 'small web-footed broad-billed swimming bird', 'name': 'duck'}, {'frequency': 'r', 'id': 408, 'synset': 'duckling.n.02', 'synonyms': ['duckling'], 'def': 'young duck', 'name': 'duckling'}, {'frequency': 'c', 'id': 409, 'synset': 'duct_tape.n.01', 'synonyms': ['duct_tape'], 'def': 'a wide silvery adhesive tape', 'name': 'duct_tape'}, {'frequency': 'f', 'id': 410, 'synset': 'duffel_bag.n.01', 'synonyms': ['duffel_bag', 'duffle_bag', 'duffel', 'duffle'], 'def': 'a large cylindrical bag of heavy cloth', 'name': 'duffel_bag'}, {'frequency': 'r', 'id': 411, 'synset': 'dumbbell.n.01', 'synonyms': ['dumbbell'], 'def': 'an exercising weight with two ball-like ends connected by a short handle', 'name': 'dumbbell'}, {'frequency': 'c', 'id': 412, 'synset': 'dumpster.n.01', 'synonyms': ['dumpster'], 'def': 'a container designed to receive and transport and dump waste', 'name': 'dumpster'}, {'frequency': 'r', 'id': 413, 'synset': 'dustpan.n.02', 'synonyms': ['dustpan'], 'def': 'a short-handled receptacle into which dust can be swept', 'name': 'dustpan'}, {'frequency': 'r', 'id': 414, 'synset': 'dutch_oven.n.02', 'synonyms': ['Dutch_oven'], 'def': 'iron or earthenware cooking pot; used for stews', 'name': 'Dutch_oven'}, {'frequency': 'c', 'id': 415, 'synset': 'eagle.n.01', 'synonyms': ['eagle'], 'def': 'large birds of prey noted for their broad wings and strong soaring flight', 'name': 'eagle'}, {'frequency': 'f', 'id': 416, 'synset': 'earphone.n.01', 'synonyms': ['earphone', 'earpiece', 'headphone'], 'def': 'device for listening to audio that is held over or inserted into the ear', 'name': 'earphone'}, {'frequency': 'r', 'id': 417, 'synset': 'earplug.n.01', 'synonyms': ['earplug'], 'def': 'a soft plug that is inserted into the ear canal to block sound', 'name': 'earplug'}, {'frequency': 'f', 'id': 418, 'synset': 'earring.n.01', 'synonyms': ['earring'], 'def': 'jewelry to ornament the ear', 'name': 'earring'}, {'frequency': 'c', 'id': 419, 'synset': 'easel.n.01', 'synonyms': ['easel'], 'def': "an upright tripod for displaying something (usually an artist's canvas)", 'name': 'easel'}, {'frequency': 'r', 'id': 420, 'synset': 'eclair.n.01', 'synonyms': ['eclair'], 'def': 'oblong cream puff', 'name': 'eclair'}, {'frequency': 'r', 'id': 421, 'synset': 'eel.n.01', 'synonyms': ['eel'], 'def': 'an elongate fish with fatty flesh', 'name': 'eel'}, {'frequency': 'f', 'id': 422, 'synset': 'egg.n.02', 'synonyms': ['egg', 'eggs'], 'def': 'oval reproductive body of a fowl (especially a hen) used as food', 'name': 'egg'}, {'frequency': 'r', 'id': 423, 'synset': 'egg_roll.n.01', 'synonyms': ['egg_roll', 'spring_roll'], 'def': 'minced vegetables and meat wrapped in a pancake and fried', 'name': 'egg_roll'}, {'frequency': 'c', 'id': 424, 'synset': 'egg_yolk.n.01', 'synonyms': ['egg_yolk', 'yolk_(egg)'], 'def': 'the yellow spherical part of an egg', 'name': 'egg_yolk'}, {'frequency': 'c', 'id': 425, 'synset': 'eggbeater.n.02', 'synonyms': ['eggbeater', 'eggwhisk'], 'def': 'a mixer for beating eggs or whipping cream', 'name': 'eggbeater'}, {'frequency': 'c', 'id': 426, 'synset': 'eggplant.n.01', 'synonyms': ['eggplant', 'aubergine'], 'def': 'egg-shaped vegetable having a shiny skin typically dark purple', 'name': 'eggplant'}, {'frequency': 'r', 'id': 427, 'synset': 'electric_chair.n.01', 'synonyms': ['electric_chair'], 'def': 'a chair-shaped instrument of execution by electrocution', 'name': 'electric_chair'}, {'frequency': 'f', 'id': 428, 'synset': 'electric_refrigerator.n.01', 'synonyms': ['refrigerator'], 'def': 'a refrigerator in which the coolant is pumped around by an electric motor', 'name': 'refrigerator'}, {'frequency': 'f', 'id': 429, 'synset': 'elephant.n.01', 'synonyms': ['elephant'], 'def': 'a common elephant', 'name': 'elephant'}, {'frequency': 'r', 'id': 430, 'synset': 'elk.n.01', 'synonyms': ['elk', 'moose'], 'def': 'large northern deer with enormous flattened antlers in the male', 'name': 'elk'}, {'frequency': 'c', 'id': 431, 'synset': 'envelope.n.01', 'synonyms': ['envelope'], 'def': 'a flat (usually rectangular) container for a letter, thin package, etc.', 'name': 'envelope'}, {'frequency': 'c', 'id': 432, 'synset': 'eraser.n.01', 'synonyms': ['eraser'], 'def': 'an implement used to erase something', 'name': 'eraser'}, {'frequency': 'r', 'id': 433, 'synset': 'escargot.n.01', 'synonyms': ['escargot'], 'def': 'edible snail usually served in the shell with a sauce of melted butter and garlic', 'name': 'escargot'}, {'frequency': 'r', 'id': 434, 'synset': 'eyepatch.n.01', 'synonyms': ['eyepatch'], 'def': 'a protective cloth covering for an injured eye', 'name': 'eyepatch'}, {'frequency': 'r', 'id': 435, 'synset': 'falcon.n.01', 'synonyms': ['falcon'], 'def': 'birds of prey having long pointed powerful wings adapted for swift flight', 'name': 'falcon'}, {'frequency': 'f', 'id': 436, 'synset': 'fan.n.01', 'synonyms': ['fan'], 'def': 'a device for creating a current of air by movement of a surface or surfaces', 'name': 'fan'}, {'frequency': 'f', 'id': 437, 'synset': 'faucet.n.01', 'synonyms': ['faucet', 'spigot', 'tap'], 'def': 'a regulator for controlling the flow of a liquid from a reservoir', 'name': 'faucet'}, {'frequency': 'r', 'id': 438, 'synset': 'fedora.n.01', 'synonyms': ['fedora'], 'def': 'a hat made of felt with a creased crown', 'name': 'fedora'}, {'frequency': 'r', 'id': 439, 'synset': 'ferret.n.02', 'synonyms': ['ferret'], 'def': 'domesticated albino variety of the European polecat bred for hunting rats and rabbits', 'name': 'ferret'}, {'frequency': 'c', 'id': 440, 'synset': 'ferris_wheel.n.01', 'synonyms': ['Ferris_wheel'], 'def': 'a large wheel with suspended seats that remain upright as the wheel rotates', 'name': 'Ferris_wheel'}, {'frequency': 'r', 'id': 441, 'synset': 'ferry.n.01', 'synonyms': ['ferry', 'ferryboat'], 'def': 'a boat that transports people or vehicles across a body of water and operates on a regular schedule', 'name': 'ferry'}, {'frequency': 'r', 'id': 442, 'synset': 'fig.n.04', 'synonyms': ['fig_(fruit)'], 'def': 'fleshy sweet pear-shaped yellowish or purple fruit eaten fresh or preserved or dried', 'name': 'fig_(fruit)'}, {'frequency': 'c', 'id': 443, 'synset': 'fighter.n.02', 'synonyms': ['fighter_jet', 'fighter_aircraft', 'attack_aircraft'], 'def': 'a high-speed military or naval airplane designed to destroy enemy targets', 'name': 'fighter_jet'}, {'frequency': 'f', 'id': 444, 'synset': 'figurine.n.01', 'synonyms': ['figurine'], 'def': 'a small carved or molded figure', 'name': 'figurine'}, {'frequency': 'c', 'id': 445, 'synset': 'file.n.03', 'synonyms': ['file_cabinet', 'filing_cabinet'], 'def': 'office furniture consisting of a container for keeping papers in order', 'name': 'file_cabinet'}, {'frequency': 'r', 'id': 446, 'synset': 'file.n.04', 'synonyms': ['file_(tool)'], 'def': 'a steel hand tool with small sharp teeth on some or all of its surfaces; used for smoothing wood or metal', 'name': 'file_(tool)'}, {'frequency': 'f', 'id': 447, 'synset': 'fire_alarm.n.02', 'synonyms': ['fire_alarm', 'smoke_alarm'], 'def': 'an alarm that is tripped off by fire or smoke', 'name': 'fire_alarm'}, {'frequency': 'c', 'id': 448, 'synset': 'fire_engine.n.01', 'synonyms': ['fire_engine', 'fire_truck'], 'def': 'large trucks that carry firefighters and equipment to the site of a fire', 'name': 'fire_engine'}, {'frequency': 'c', 'id': 449, 'synset': 'fire_extinguisher.n.01', 'synonyms': ['fire_extinguisher', 'extinguisher'], 'def': 'a manually operated device for extinguishing small fires', 'name': 'fire_extinguisher'}, {'frequency': 'c', 'id': 450, 'synset': 'fire_hose.n.01', 'synonyms': ['fire_hose'], 'def': 'a large hose that carries water from a fire hydrant to the site of the fire', 'name': 'fire_hose'}, {'frequency': 'f', 'id': 451, 'synset': 'fireplace.n.01', 'synonyms': ['fireplace'], 'def': 'an open recess in a wall at the base of a chimney where a fire can be built', 'name': 'fireplace'}, {'frequency': 'f', 'id': 452, 'synset': 'fireplug.n.01', 'synonyms': ['fireplug', 'fire_hydrant', 'hydrant'], 'def': 'an upright hydrant for drawing water to use in fighting a fire', 'name': 'fireplug'}, {'frequency': 'c', 'id': 453, 'synset': 'fish.n.01', 'synonyms': ['fish'], 'def': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills', 'name': 'fish'}, {'frequency': 'r', 'id': 454, 'synset': 'fish.n.02', 'synonyms': ['fish_(food)'], 'def': 'the flesh of fish used as food', 'name': 'fish_(food)'}, {'frequency': 'r', 'id': 455, 'synset': 'fishbowl.n.02', 'synonyms': ['fishbowl', 'goldfish_bowl'], 'def': 'a transparent bowl in which small fish are kept', 'name': 'fishbowl'}, {'frequency': 'r', 'id': 456, 'synset': 'fishing_boat.n.01', 'synonyms': ['fishing_boat', 'fishing_vessel'], 'def': 'a vessel for fishing', 'name': 'fishing_boat'}, {'frequency': 'c', 'id': 457, 'synset': 'fishing_rod.n.01', 'synonyms': ['fishing_rod', 'fishing_pole'], 'def': 'a rod that is used in fishing to extend the fishing line', 'name': 'fishing_rod'}, {'frequency': 'f', 'id': 458, 'synset': 'flag.n.01', 'synonyms': ['flag'], 'def': 'emblem usually consisting of a rectangular piece of cloth of distinctive design (do not include pole)', 'name': 'flag'}, {'frequency': 'f', 'id': 459, 'synset': 'flagpole.n.02', 'synonyms': ['flagpole', 'flagstaff'], 'def': 'a tall staff or pole on which a flag is raised', 'name': 'flagpole'}, {'frequency': 'c', 'id': 460, 'synset': 'flamingo.n.01', 'synonyms': ['flamingo'], 'def': 'large pink web-footed bird with down-bent bill', 'name': 'flamingo'}, {'frequency': 'c', 'id': 461, 'synset': 'flannel.n.01', 'synonyms': ['flannel'], 'def': 'a soft light woolen fabric; used for clothing', 'name': 'flannel'}, {'frequency': 'r', 'id': 462, 'synset': 'flash.n.10', 'synonyms': ['flash', 'flashbulb'], 'def': 'a lamp for providing momentary light to take a photograph', 'name': 'flash'}, {'frequency': 'c', 'id': 463, 'synset': 'flashlight.n.01', 'synonyms': ['flashlight', 'torch'], 'def': 'a small portable battery-powered electric lamp', 'name': 'flashlight'}, {'frequency': 'r', 'id': 464, 'synset': 'fleece.n.03', 'synonyms': ['fleece'], 'def': 'a soft bulky fabric with deep pile; used chiefly for clothing', 'name': 'fleece'}, {'frequency': 'f', 'id': 465, 'synset': 'flip-flop.n.02', 'synonyms': ['flip-flop_(sandal)'], 'def': 'a backless sandal held to the foot by a thong between two toes', 'name': 'flip-flop_(sandal)'}, {'frequency': 'c', 'id': 466, 'synset': 'flipper.n.01', 'synonyms': ['flipper_(footwear)', 'fin_(footwear)'], 'def': 'a shoe to aid a person in swimming', 'name': 'flipper_(footwear)'}, {'frequency': 'f', 'id': 467, 'synset': 'flower_arrangement.n.01', 'synonyms': ['flower_arrangement', 'floral_arrangement'], 'def': 'a decorative arrangement of flowers', 'name': 'flower_arrangement'}, {'frequency': 'c', 'id': 468, 'synset': 'flute.n.02', 'synonyms': ['flute_glass', 'champagne_flute'], 'def': 'a tall narrow wineglass', 'name': 'flute_glass'}, {'frequency': 'r', 'id': 469, 'synset': 'foal.n.01', 'synonyms': ['foal'], 'def': 'a young horse', 'name': 'foal'}, {'frequency': 'c', 'id': 470, 'synset': 'folding_chair.n.01', 'synonyms': ['folding_chair'], 'def': 'a chair that can be folded flat for storage', 'name': 'folding_chair'}, {'frequency': 'c', 'id': 471, 'synset': 'food_processor.n.01', 'synonyms': ['food_processor'], 'def': 'a kitchen appliance for shredding, blending, chopping, or slicing food', 'name': 'food_processor'}, {'frequency': 'c', 'id': 472, 'synset': 'football.n.02', 'synonyms': ['football_(American)'], 'def': 'the inflated oblong ball used in playing American football', 'name': 'football_(American)'}, {'frequency': 'r', 'id': 473, 'synset': 'football_helmet.n.01', 'synonyms': ['football_helmet'], 'def': 'a padded helmet with a face mask to protect the head of football players', 'name': 'football_helmet'}, {'frequency': 'c', 'id': 474, 'synset': 'footstool.n.01', 'synonyms': ['footstool', 'footrest'], 'def': 'a low seat or a stool to rest the feet of a seated person', 'name': 'footstool'}, {'frequency': 'f', 'id': 475, 'synset': 'fork.n.01', 'synonyms': ['fork'], 'def': 'cutlery used for serving and eating food', 'name': 'fork'}, {'frequency': 'r', 'id': 476, 'synset': 'forklift.n.01', 'synonyms': ['forklift'], 'def': 'an industrial vehicle with a power operated fork in front that can be inserted under loads to lift and move them', 'name': 'forklift'}, {'frequency': 'r', 'id': 477, 'synset': 'freight_car.n.01', 'synonyms': ['freight_car'], 'def': 'a railway car that carries freight', 'name': 'freight_car'}, {'frequency': 'r', 'id': 478, 'synset': 'french_toast.n.01', 'synonyms': ['French_toast'], 'def': 'bread slice dipped in egg and milk and fried', 'name': 'French_toast'}, {'frequency': 'c', 'id': 479, 'synset': 'freshener.n.01', 'synonyms': ['freshener', 'air_freshener'], 'def': 'anything that freshens', 'name': 'freshener'}, {'frequency': 'f', 'id': 480, 'synset': 'frisbee.n.01', 'synonyms': ['frisbee'], 'def': 'a light, plastic disk propelled with a flip of the wrist for recreation or competition', 'name': 'frisbee'}, {'frequency': 'c', 'id': 481, 'synset': 'frog.n.01', 'synonyms': ['frog', 'toad', 'toad_frog'], 'def': 'a tailless stout-bodied amphibians with long hind limbs for leaping', 'name': 'frog'}, {'frequency': 'c', 'id': 482, 'synset': 'fruit_juice.n.01', 'synonyms': ['fruit_juice'], 'def': 'drink produced by squeezing or crushing fruit', 'name': 'fruit_juice'}, {'frequency': 'r', 'id': 483, 'synset': 'fruit_salad.n.01', 'synonyms': ['fruit_salad'], 'def': 'salad composed of fruits', 'name': 'fruit_salad'}, {'frequency': 'c', 'id': 484, 'synset': 'frying_pan.n.01', 'synonyms': ['frying_pan', 'frypan', 'skillet'], 'def': 'a pan used for frying foods', 'name': 'frying_pan'}, {'frequency': 'r', 'id': 485, 'synset': 'fudge.n.01', 'synonyms': ['fudge'], 'def': 'soft creamy candy', 'name': 'fudge'}, {'frequency': 'r', 'id': 486, 'synset': 'funnel.n.02', 'synonyms': ['funnel'], 'def': 'a cone-shaped utensil used to channel a substance into a container with a small mouth', 'name': 'funnel'}, {'frequency': 'c', 'id': 487, 'synset': 'futon.n.01', 'synonyms': ['futon'], 'def': 'a pad that is used for sleeping on the floor or on a raised frame', 'name': 'futon'}, {'frequency': 'r', 'id': 488, 'synset': 'gag.n.02', 'synonyms': ['gag', 'muzzle'], 'def': "restraint put into a person's mouth to prevent speaking or shouting", 'name': 'gag'}, {'frequency': 'r', 'id': 489, 'synset': 'garbage.n.03', 'synonyms': ['garbage'], 'def': 'a receptacle where waste can be discarded', 'name': 'garbage'}, {'frequency': 'c', 'id': 490, 'synset': 'garbage_truck.n.01', 'synonyms': ['garbage_truck'], 'def': 'a truck for collecting domestic refuse', 'name': 'garbage_truck'}, {'frequency': 'c', 'id': 491, 'synset': 'garden_hose.n.01', 'synonyms': ['garden_hose'], 'def': 'a hose used for watering a lawn or garden', 'name': 'garden_hose'}, {'frequency': 'c', 'id': 492, 'synset': 'gargle.n.01', 'synonyms': ['gargle', 'mouthwash'], 'def': 'a medicated solution used for gargling and rinsing the mouth', 'name': 'gargle'}, {'frequency': 'r', 'id': 493, 'synset': 'gargoyle.n.02', 'synonyms': ['gargoyle'], 'def': 'an ornament consisting of a grotesquely carved figure of a person or animal', 'name': 'gargoyle'}, {'frequency': 'c', 'id': 494, 'synset': 'garlic.n.02', 'synonyms': ['garlic', 'ail'], 'def': 'aromatic bulb used as seasoning', 'name': 'garlic'}, {'frequency': 'r', 'id': 495, 'synset': 'gasmask.n.01', 'synonyms': ['gasmask', 'respirator', 'gas_helmet'], 'def': 'a protective face mask with a filter', 'name': 'gasmask'}, {'frequency': 'r', 'id': 496, 'synset': 'gazelle.n.01', 'synonyms': ['gazelle'], 'def': 'small swift graceful antelope of Africa and Asia having lustrous eyes', 'name': 'gazelle'}, {'frequency': 'c', 'id': 497, 'synset': 'gelatin.n.02', 'synonyms': ['gelatin', 'jelly'], 'def': 'an edible jelly made with gelatin and used as a dessert or salad base or a coating for foods', 'name': 'gelatin'}, {'frequency': 'r', 'id': 498, 'synset': 'gem.n.02', 'synonyms': ['gemstone'], 'def': 'a crystalline rock that can be cut and polished for jewelry', 'name': 'gemstone'}, {'frequency': 'c', 'id': 499, 'synset': 'giant_panda.n.01', 'synonyms': ['giant_panda', 'panda', 'panda_bear'], 'def': 'large black-and-white herbivorous mammal of bamboo forests of China and Tibet', 'name': 'giant_panda'}, {'frequency': 'c', 'id': 500, 'synset': 'gift_wrap.n.01', 'synonyms': ['gift_wrap'], 'def': 'attractive wrapping paper suitable for wrapping gifts', 'name': 'gift_wrap'}, {'frequency': 'c', 'id': 501, 'synset': 'ginger.n.03', 'synonyms': ['ginger', 'gingerroot'], 'def': 'the root of the common ginger plant; used fresh as a seasoning', 'name': 'ginger'}, {'frequency': 'f', 'id': 502, 'synset': 'giraffe.n.01', 'synonyms': ['giraffe'], 'def': 'tall animal having a spotted coat and small horns and very long neck and legs', 'name': 'giraffe'}, {'frequency': 'c', 'id': 503, 'synset': 'girdle.n.02', 'synonyms': ['cincture', 'sash', 'waistband', 'waistcloth'], 'def': 'a band of material around the waist that strengthens a skirt or trousers', 'name': 'cincture'}, {'frequency': 'f', 'id': 504, 'synset': 'glass.n.02', 'synonyms': ['glass_(drink_container)', 'drinking_glass'], 'def': 'a container for holding liquids while drinking', 'name': 'glass_(drink_container)'}, {'frequency': 'c', 'id': 505, 'synset': 'globe.n.03', 'synonyms': ['globe'], 'def': 'a sphere on which a map (especially of the earth) is represented', 'name': 'globe'}, {'frequency': 'f', 'id': 506, 'synset': 'glove.n.02', 'synonyms': ['glove'], 'def': 'handwear covering the hand', 'name': 'glove'}, {'frequency': 'c', 'id': 507, 'synset': 'goat.n.01', 'synonyms': ['goat'], 'def': 'a common goat', 'name': 'goat'}, {'frequency': 'f', 'id': 508, 'synset': 'goggles.n.01', 'synonyms': ['goggles'], 'def': 'tight-fitting spectacles worn to protect the eyes', 'name': 'goggles'}, {'frequency': 'r', 'id': 509, 'synset': 'goldfish.n.01', 'synonyms': ['goldfish'], 'def': 'small golden or orange-red freshwater fishes used as pond or aquarium pets', 'name': 'goldfish'}, {'frequency': 'r', 'id': 510, 'synset': 'golf_club.n.02', 'synonyms': ['golf_club', 'golf-club'], 'def': 'golf equipment used by a golfer to hit a golf ball', 'name': 'golf_club'}, {'frequency': 'c', 'id': 511, 'synset': 'golfcart.n.01', 'synonyms': ['golfcart'], 'def': 'a small motor vehicle in which golfers can ride between shots', 'name': 'golfcart'}, {'frequency': 'r', 'id': 512, 'synset': 'gondola.n.02', 'synonyms': ['gondola_(boat)'], 'def': 'long narrow flat-bottomed boat propelled by sculling; traditionally used on canals of Venice', 'name': 'gondola_(boat)'}, {'frequency': 'c', 'id': 513, 'synset': 'goose.n.01', 'synonyms': ['goose'], 'def': 'loud, web-footed long-necked aquatic birds usually larger than ducks', 'name': 'goose'}, {'frequency': 'r', 'id': 514, 'synset': 'gorilla.n.01', 'synonyms': ['gorilla'], 'def': 'largest ape', 'name': 'gorilla'}, {'frequency': 'r', 'id': 515, 'synset': 'gourd.n.02', 'synonyms': ['gourd'], 'def': 'any of numerous inedible fruits with hard rinds', 'name': 'gourd'}, {'frequency': 'r', 'id': 516, 'synset': 'gown.n.04', 'synonyms': ['surgical_gown', 'scrubs_(surgical_clothing)'], 'def': 'protective garment worn by surgeons during operations', 'name': 'surgical_gown'}, {'frequency': 'f', 'id': 517, 'synset': 'grape.n.01', 'synonyms': ['grape'], 'def': 'any of various juicy fruit with green or purple skins; grow in clusters', 'name': 'grape'}, {'frequency': 'r', 'id': 518, 'synset': 'grasshopper.n.01', 'synonyms': ['grasshopper'], 'def': 'plant-eating insect with hind legs adapted for leaping', 'name': 'grasshopper'}, {'frequency': 'c', 'id': 519, 'synset': 'grater.n.01', 'synonyms': ['grater'], 'def': 'utensil with sharp perforations for shredding foods (as vegetables or cheese)', 'name': 'grater'}, {'frequency': 'c', 'id': 520, 'synset': 'gravestone.n.01', 'synonyms': ['gravestone', 'headstone', 'tombstone'], 'def': 'a stone that is used to mark a grave', 'name': 'gravestone'}, {'frequency': 'r', 'id': 521, 'synset': 'gravy_boat.n.01', 'synonyms': ['gravy_boat', 'gravy_holder'], 'def': 'a dish (often boat-shaped) for serving gravy or sauce', 'name': 'gravy_boat'}, {'frequency': 'c', 'id': 522, 'synset': 'green_bean.n.02', 'synonyms': ['green_bean'], 'def': 'a common bean plant cultivated for its slender green edible pods', 'name': 'green_bean'}, {'frequency': 'c', 'id': 523, 'synset': 'green_onion.n.01', 'synonyms': ['green_onion', 'spring_onion', 'scallion'], 'def': 'a young onion before the bulb has enlarged', 'name': 'green_onion'}, {'frequency': 'r', 'id': 524, 'synset': 'griddle.n.01', 'synonyms': ['griddle'], 'def': 'cooking utensil consisting of a flat heated surface on which food is cooked', 'name': 'griddle'}, {'frequency': 'r', 'id': 525, 'synset': 'grillroom.n.01', 'synonyms': ['grillroom', 'grill_(restaurant)'], 'def': 'a restaurant where food is cooked on a grill', 'name': 'grillroom'}, {'frequency': 'r', 'id': 526, 'synset': 'grinder.n.04', 'synonyms': ['grinder_(tool)'], 'def': 'a machine tool that polishes metal', 'name': 'grinder_(tool)'}, {'frequency': 'r', 'id': 527, 'synset': 'grits.n.01', 'synonyms': ['grits', 'hominy_grits'], 'def': 'coarsely ground corn boiled as a breakfast dish', 'name': 'grits'}, {'frequency': 'c', 'id': 528, 'synset': 'grizzly.n.01', 'synonyms': ['grizzly', 'grizzly_bear'], 'def': 'powerful brownish-yellow bear of the uplands of western North America', 'name': 'grizzly'}, {'frequency': 'c', 'id': 529, 'synset': 'grocery_bag.n.01', 'synonyms': ['grocery_bag'], 'def': "a sack for holding customer's groceries", 'name': 'grocery_bag'}, {'frequency': 'r', 'id': 530, 'synset': 'guacamole.n.01', 'synonyms': ['guacamole'], 'def': 'a dip made of mashed avocado mixed with chopped onions and other seasonings', 'name': 'guacamole'}, {'frequency': 'f', 'id': 531, 'synset': 'guitar.n.01', 'synonyms': ['guitar'], 'def': 'a stringed instrument usually having six strings; played by strumming or plucking', 'name': 'guitar'}, {'frequency': 'c', 'id': 532, 'synset': 'gull.n.02', 'synonyms': ['gull', 'seagull'], 'def': 'mostly white aquatic bird having long pointed wings and short legs', 'name': 'gull'}, {'frequency': 'c', 'id': 533, 'synset': 'gun.n.01', 'synonyms': ['gun'], 'def': 'a weapon that discharges a bullet at high velocity from a metal tube', 'name': 'gun'}, {'frequency': 'r', 'id': 534, 'synset': 'hair_spray.n.01', 'synonyms': ['hair_spray'], 'def': 'substance sprayed on the hair to hold it in place', 'name': 'hair_spray'}, {'frequency': 'c', 'id': 535, 'synset': 'hairbrush.n.01', 'synonyms': ['hairbrush'], 'def': "a brush used to groom a person's hair", 'name': 'hairbrush'}, {'frequency': 'c', 'id': 536, 'synset': 'hairnet.n.01', 'synonyms': ['hairnet'], 'def': 'a small net that someone wears over their hair to keep it in place', 'name': 'hairnet'}, {'frequency': 'c', 'id': 537, 'synset': 'hairpin.n.01', 'synonyms': ['hairpin'], 'def': "a double pronged pin used to hold women's hair in place", 'name': 'hairpin'}, {'frequency': 'f', 'id': 538, 'synset': 'ham.n.01', 'synonyms': ['ham', 'jambon', 'gammon'], 'def': 'meat cut from the thigh of a hog (usually smoked)', 'name': 'ham'}, {'frequency': 'c', 'id': 539, 'synset': 'hamburger.n.01', 'synonyms': ['hamburger', 'beefburger', 'burger'], 'def': 'a sandwich consisting of a patty of minced beef served on a bun', 'name': 'hamburger'}, {'frequency': 'c', 'id': 540, 'synset': 'hammer.n.02', 'synonyms': ['hammer'], 'def': 'a hand tool with a heavy head and a handle; used to deliver an impulsive force by striking', 'name': 'hammer'}, {'frequency': 'r', 'id': 541, 'synset': 'hammock.n.02', 'synonyms': ['hammock'], 'def': 'a hanging bed of canvas or rope netting (usually suspended between two trees)', 'name': 'hammock'}, {'frequency': 'r', 'id': 542, 'synset': 'hamper.n.02', 'synonyms': ['hamper'], 'def': 'a basket usually with a cover', 'name': 'hamper'}, {'frequency': 'r', 'id': 543, 'synset': 'hamster.n.01', 'synonyms': ['hamster'], 'def': 'short-tailed burrowing rodent with large cheek pouches', 'name': 'hamster'}, {'frequency': 'c', 'id': 544, 'synset': 'hand_blower.n.01', 'synonyms': ['hair_dryer'], 'def': 'a hand-held electric blower that can blow warm air onto the hair', 'name': 'hair_dryer'}, {'frequency': 'r', 'id': 545, 'synset': 'hand_glass.n.01', 'synonyms': ['hand_glass', 'hand_mirror'], 'def': 'a mirror intended to be held in the hand', 'name': 'hand_glass'}, {'frequency': 'f', 'id': 546, 'synset': 'hand_towel.n.01', 'synonyms': ['hand_towel', 'face_towel'], 'def': 'a small towel used to dry the hands or face', 'name': 'hand_towel'}, {'frequency': 'c', 'id': 547, 'synset': 'handcart.n.01', 'synonyms': ['handcart', 'pushcart', 'hand_truck'], 'def': 'wheeled vehicle that can be pushed by a person', 'name': 'handcart'}, {'frequency': 'r', 'id': 548, 'synset': 'handcuff.n.01', 'synonyms': ['handcuff'], 'def': 'shackle that consists of a metal loop that can be locked around the wrist', 'name': 'handcuff'}, {'frequency': 'c', 'id': 549, 'synset': 'handkerchief.n.01', 'synonyms': ['handkerchief'], 'def': 'a square piece of cloth used for wiping the eyes or nose or as a costume accessory', 'name': 'handkerchief'}, {'frequency': 'f', 'id': 550, 'synset': 'handle.n.01', 'synonyms': ['handle', 'grip', 'handgrip'], 'def': 'the appendage to an object that is designed to be held in order to use or move it', 'name': 'handle'}, {'frequency': 'r', 'id': 551, 'synset': 'handsaw.n.01', 'synonyms': ['handsaw', "carpenter's_saw"], 'def': 'a saw used with one hand for cutting wood', 'name': 'handsaw'}, {'frequency': 'r', 'id': 552, 'synset': 'hardback.n.01', 'synonyms': ['hardback_book', 'hardcover_book'], 'def': 'a book with cardboard or cloth or leather covers', 'name': 'hardback_book'}, {'frequency': 'r', 'id': 553, 'synset': 'harmonium.n.01', 'synonyms': ['harmonium', 'organ_(musical_instrument)', 'reed_organ_(musical_instrument)'], 'def': 'a free-reed instrument in which air is forced through the reeds by bellows', 'name': 'harmonium'}, {'frequency': 'f', 'id': 554, 'synset': 'hat.n.01', 'synonyms': ['hat'], 'def': 'headwear that protects the head from bad weather, sun, or worn for fashion', 'name': 'hat'}, {'frequency': 'r', 'id': 555, 'synset': 'hatbox.n.01', 'synonyms': ['hatbox'], 'def': 'a round piece of luggage for carrying hats', 'name': 'hatbox'}, {'frequency': 'r', 'id': 556, 'synset': 'hatch.n.03', 'synonyms': ['hatch'], 'def': 'a movable barrier covering a hatchway', 'name': 'hatch'}, {'frequency': 'c', 'id': 557, 'synset': 'head_covering.n.01', 'synonyms': ['veil'], 'def': 'a garment that covers the head and face', 'name': 'veil'}, {'frequency': 'f', 'id': 558, 'synset': 'headband.n.01', 'synonyms': ['headband'], 'def': 'a band worn around or over the head', 'name': 'headband'}, {'frequency': 'f', 'id': 559, 'synset': 'headboard.n.01', 'synonyms': ['headboard'], 'def': 'a vertical board or panel forming the head of a bedstead', 'name': 'headboard'}, {'frequency': 'f', 'id': 560, 'synset': 'headlight.n.01', 'synonyms': ['headlight', 'headlamp'], 'def': 'a powerful light with reflector; attached to the front of an automobile or locomotive', 'name': 'headlight'}, {'frequency': 'c', 'id': 561, 'synset': 'headscarf.n.01', 'synonyms': ['headscarf'], 'def': 'a kerchief worn over the head and tied under the chin', 'name': 'headscarf'}, {'frequency': 'r', 'id': 562, 'synset': 'headset.n.01', 'synonyms': ['headset'], 'def': 'receiver consisting of a pair of headphones', 'name': 'headset'}, {'frequency': 'c', 'id': 563, 'synset': 'headstall.n.01', 'synonyms': ['headstall_(for_horses)', 'headpiece_(for_horses)'], 'def': "the band that is the part of a bridle that fits around a horse's head", 'name': 'headstall_(for_horses)'}, {'frequency': 'r', 'id': 564, 'synset': 'hearing_aid.n.02', 'synonyms': ['hearing_aid'], 'def': 'an acoustic device used to direct sound to the ear of a hearing-impaired person', 'name': 'hearing_aid'}, {'frequency': 'c', 'id': 565, 'synset': 'heart.n.02', 'synonyms': ['heart'], 'def': 'a muscular organ; its contractions move the blood through the body', 'name': 'heart'}, {'frequency': 'c', 'id': 566, 'synset': 'heater.n.01', 'synonyms': ['heater', 'warmer'], 'def': 'device that heats water or supplies warmth to a room', 'name': 'heater'}, {'frequency': 'c', 'id': 567, 'synset': 'helicopter.n.01', 'synonyms': ['helicopter'], 'def': 'an aircraft without wings that obtains its lift from the rotation of overhead blades', 'name': 'helicopter'}, {'frequency': 'f', 'id': 568, 'synset': 'helmet.n.02', 'synonyms': ['helmet'], 'def': 'a protective headgear made of hard material to resist blows', 'name': 'helmet'}, {'frequency': 'r', 'id': 569, 'synset': 'heron.n.02', 'synonyms': ['heron'], 'def': 'grey or white wading bird with long neck and long legs and (usually) long bill', 'name': 'heron'}, {'frequency': 'c', 'id': 570, 'synset': 'highchair.n.01', 'synonyms': ['highchair', 'feeding_chair'], 'def': 'a chair for feeding a very young child', 'name': 'highchair'}, {'frequency': 'f', 'id': 571, 'synset': 'hinge.n.01', 'synonyms': ['hinge'], 'def': 'a joint that holds two parts together so that one can swing relative to the other', 'name': 'hinge'}, {'frequency': 'r', 'id': 572, 'synset': 'hippopotamus.n.01', 'synonyms': ['hippopotamus'], 'def': 'massive thick-skinned animal living in or around rivers of tropical Africa', 'name': 'hippopotamus'}, {'frequency': 'r', 'id': 573, 'synset': 'hockey_stick.n.01', 'synonyms': ['hockey_stick'], 'def': 'sports implement consisting of a stick used by hockey players to move the puck', 'name': 'hockey_stick'}, {'frequency': 'c', 'id': 574, 'synset': 'hog.n.03', 'synonyms': ['hog', 'pig'], 'def': 'domestic swine', 'name': 'hog'}, {'frequency': 'f', 'id': 575, 'synset': 'home_plate.n.01', 'synonyms': ['home_plate_(baseball)', 'home_base_(baseball)'], 'def': '(baseball) a rubber slab where the batter stands; it must be touched by a base runner in order to score', 'name': 'home_plate_(baseball)'}, {'frequency': 'c', 'id': 576, 'synset': 'honey.n.01', 'synonyms': ['honey'], 'def': 'a sweet yellow liquid produced by bees', 'name': 'honey'}, {'frequency': 'f', 'id': 577, 'synset': 'hood.n.06', 'synonyms': ['fume_hood', 'exhaust_hood'], 'def': 'metal covering leading to a vent that exhausts smoke or fumes', 'name': 'fume_hood'}, {'frequency': 'f', 'id': 578, 'synset': 'hook.n.05', 'synonyms': ['hook'], 'def': 'a curved or bent implement for suspending or pulling something', 'name': 'hook'}, {'frequency': 'f', 'id': 579, 'synset': 'horse.n.01', 'synonyms': ['horse'], 'def': 'a common horse', 'name': 'horse'}, {'frequency': 'f', 'id': 580, 'synset': 'hose.n.03', 'synonyms': ['hose', 'hosepipe'], 'def': 'a flexible pipe for conveying a liquid or gas', 'name': 'hose'}, {'frequency': 'r', 'id': 581, 'synset': 'hot-air_balloon.n.01', 'synonyms': ['hot-air_balloon'], 'def': 'balloon for travel through the air in a basket suspended below a large bag of heated air', 'name': 'hot-air_balloon'}, {'frequency': 'r', 'id': 582, 'synset': 'hot_plate.n.01', 'synonyms': ['hotplate'], 'def': 'a portable electric appliance for heating or cooking or keeping food warm', 'name': 'hotplate'}, {'frequency': 'c', 'id': 583, 'synset': 'hot_sauce.n.01', 'synonyms': ['hot_sauce'], 'def': 'a pungent peppery sauce', 'name': 'hot_sauce'}, {'frequency': 'r', 'id': 584, 'synset': 'hourglass.n.01', 'synonyms': ['hourglass'], 'def': 'a sandglass timer that runs for sixty minutes', 'name': 'hourglass'}, {'frequency': 'r', 'id': 585, 'synset': 'houseboat.n.01', 'synonyms': ['houseboat'], 'def': 'a barge that is designed and equipped for use as a dwelling', 'name': 'houseboat'}, {'frequency': 'r', 'id': 586, 'synset': 'hummingbird.n.01', 'synonyms': ['hummingbird'], 'def': 'tiny American bird having brilliant iridescent plumage and long slender bills', 'name': 'hummingbird'}, {'frequency': 'r', 'id': 587, 'synset': 'hummus.n.01', 'synonyms': ['hummus', 'humus', 'hommos', 'hoummos', 'humous'], 'def': 'a thick spread made from mashed chickpeas', 'name': 'hummus'}, {'frequency': 'c', 'id': 588, 'synset': 'ice_bear.n.01', 'synonyms': ['polar_bear'], 'def': 'white bear of Arctic regions', 'name': 'polar_bear'}, {'frequency': 'c', 'id': 589, 'synset': 'ice_cream.n.01', 'synonyms': ['icecream'], 'def': 'frozen dessert containing cream and sugar and flavoring', 'name': 'icecream'}, {'frequency': 'r', 'id': 590, 'synset': 'ice_lolly.n.01', 'synonyms': ['popsicle'], 'def': 'ice cream or water ice on a small wooden stick', 'name': 'popsicle'}, {'frequency': 'c', 'id': 591, 'synset': 'ice_maker.n.01', 'synonyms': ['ice_maker'], 'def': 'an appliance included in some electric refrigerators for making ice cubes', 'name': 'ice_maker'}, {'frequency': 'r', 'id': 592, 'synset': 'ice_pack.n.01', 'synonyms': ['ice_pack', 'ice_bag'], 'def': 'a waterproof bag filled with ice: applied to the body (especially the head) to cool or reduce swelling', 'name': 'ice_pack'}, {'frequency': 'r', 'id': 593, 'synset': 'ice_skate.n.01', 'synonyms': ['ice_skate'], 'def': 'skate consisting of a boot with a steel blade fitted to the sole', 'name': 'ice_skate'}, {'frequency': 'r', 'id': 594, 'synset': 'ice_tea.n.01', 'synonyms': ['ice_tea', 'iced_tea'], 'def': 'strong tea served over ice', 'name': 'ice_tea'}, {'frequency': 'c', 'id': 595, 'synset': 'igniter.n.01', 'synonyms': ['igniter', 'ignitor', 'lighter'], 'def': 'a substance or device used to start a fire', 'name': 'igniter'}, {'frequency': 'r', 'id': 596, 'synset': 'incense.n.01', 'synonyms': ['incense'], 'def': 'a substance that produces a fragrant odor when burned', 'name': 'incense'}, {'frequency': 'r', 'id': 597, 'synset': 'inhaler.n.01', 'synonyms': ['inhaler', 'inhalator'], 'def': 'a dispenser that produces a chemical vapor to be inhaled through mouth or nose', 'name': 'inhaler'}, {'frequency': 'c', 'id': 598, 'synset': 'ipod.n.01', 'synonyms': ['iPod'], 'def': 'a pocket-sized device used to play music files', 'name': 'iPod'}, {'frequency': 'c', 'id': 599, 'synset': 'iron.n.04', 'synonyms': ['iron_(for_clothing)', 'smoothing_iron_(for_clothing)'], 'def': 'home appliance consisting of a flat metal base that is heated and used to smooth cloth', 'name': 'iron_(for_clothing)'}, {'frequency': 'r', 'id': 600, 'synset': 'ironing_board.n.01', 'synonyms': ['ironing_board'], 'def': 'narrow padded board on collapsible supports; used for ironing clothes', 'name': 'ironing_board'}, {'frequency': 'f', 'id': 601, 'synset': 'jacket.n.01', 'synonyms': ['jacket'], 'def': 'a waist-length coat', 'name': 'jacket'}, {'frequency': 'r', 'id': 602, 'synset': 'jam.n.01', 'synonyms': ['jam'], 'def': 'preserve of crushed fruit', 'name': 'jam'}, {'frequency': 'f', 'id': 603, 'synset': 'jean.n.01', 'synonyms': ['jean', 'blue_jean', 'denim'], 'def': '(usually plural) close-fitting trousers of heavy denim for manual work or casual wear', 'name': 'jean'}, {'frequency': 'c', 'id': 604, 'synset': 'jeep.n.01', 'synonyms': ['jeep', 'landrover'], 'def': 'a car suitable for traveling over rough terrain', 'name': 'jeep'}, {'frequency': 'r', 'id': 605, 'synset': 'jelly_bean.n.01', 'synonyms': ['jelly_bean', 'jelly_egg'], 'def': 'sugar-glazed jellied candy', 'name': 'jelly_bean'}, {'frequency': 'f', 'id': 606, 'synset': 'jersey.n.03', 'synonyms': ['jersey', 'T-shirt', 'tee_shirt'], 'def': 'a close-fitting pullover shirt', 'name': 'jersey'}, {'frequency': 'c', 'id': 607, 'synset': 'jet.n.01', 'synonyms': ['jet_plane', 'jet-propelled_plane'], 'def': 'an airplane powered by one or more jet engines', 'name': 'jet_plane'}, {'frequency': 'c', 'id': 608, 'synset': 'jewelry.n.01', 'synonyms': ['jewelry', 'jewellery'], 'def': 'an adornment (as a bracelet or ring or necklace) made of precious metals and set with gems (or imitation gems)', 'name': 'jewelry'}, {'frequency': 'r', 'id': 609, 'synset': 'joystick.n.02', 'synonyms': ['joystick'], 'def': 'a control device for computers consisting of a vertical handle that can move freely in two directions', 'name': 'joystick'}, {'frequency': 'r', 'id': 610, 'synset': 'jump_suit.n.01', 'synonyms': ['jumpsuit'], 'def': "one-piece garment fashioned after a parachutist's uniform", 'name': 'jumpsuit'}, {'frequency': 'c', 'id': 611, 'synset': 'kayak.n.01', 'synonyms': ['kayak'], 'def': 'a small canoe consisting of a light frame made watertight with animal skins', 'name': 'kayak'}, {'frequency': 'r', 'id': 612, 'synset': 'keg.n.02', 'synonyms': ['keg'], 'def': 'small cask or barrel', 'name': 'keg'}, {'frequency': 'r', 'id': 613, 'synset': 'kennel.n.01', 'synonyms': ['kennel', 'doghouse'], 'def': 'outbuilding that serves as a shelter for a dog', 'name': 'kennel'}, {'frequency': 'c', 'id': 614, 'synset': 'kettle.n.01', 'synonyms': ['kettle', 'boiler'], 'def': 'a metal pot for stewing or boiling; usually has a lid', 'name': 'kettle'}, {'frequency': 'f', 'id': 615, 'synset': 'key.n.01', 'synonyms': ['key'], 'def': 'metal instrument used to unlock a lock', 'name': 'key'}, {'frequency': 'r', 'id': 616, 'synset': 'keycard.n.01', 'synonyms': ['keycard'], 'def': 'a plastic card used to gain access typically to a door', 'name': 'keycard'}, {'frequency': 'r', 'id': 617, 'synset': 'kilt.n.01', 'synonyms': ['kilt'], 'def': 'a knee-length pleated tartan skirt worn by men as part of the traditional dress in the Highlands of northern Scotland', 'name': 'kilt'}, {'frequency': 'c', 'id': 618, 'synset': 'kimono.n.01', 'synonyms': ['kimono'], 'def': 'a loose robe; imitated from robes originally worn by Japanese', 'name': 'kimono'}, {'frequency': 'f', 'id': 619, 'synset': 'kitchen_sink.n.01', 'synonyms': ['kitchen_sink'], 'def': 'a sink in a kitchen', 'name': 'kitchen_sink'}, {'frequency': 'c', 'id': 620, 'synset': 'kitchen_table.n.01', 'synonyms': ['kitchen_table'], 'def': 'a table in the kitchen', 'name': 'kitchen_table'}, {'frequency': 'f', 'id': 621, 'synset': 'kite.n.03', 'synonyms': ['kite'], 'def': 'plaything consisting of a light frame covered with tissue paper; flown in wind at end of a string', 'name': 'kite'}, {'frequency': 'c', 'id': 622, 'synset': 'kitten.n.01', 'synonyms': ['kitten', 'kitty'], 'def': 'young domestic cat', 'name': 'kitten'}, {'frequency': 'c', 'id': 623, 'synset': 'kiwi.n.03', 'synonyms': ['kiwi_fruit'], 'def': 'fuzzy brown egg-shaped fruit with slightly tart green flesh', 'name': 'kiwi_fruit'}, {'frequency': 'f', 'id': 624, 'synset': 'knee_pad.n.01', 'synonyms': ['knee_pad'], 'def': 'protective garment consisting of a pad worn by football or baseball or hockey players', 'name': 'knee_pad'}, {'frequency': 'f', 'id': 625, 'synset': 'knife.n.01', 'synonyms': ['knife'], 'def': 'tool with a blade and point used as a cutting instrument', 'name': 'knife'}, {'frequency': 'r', 'id': 626, 'synset': 'knight.n.02', 'synonyms': ['knight_(chess_piece)', 'horse_(chess_piece)'], 'def': 'a chess game piece shaped to resemble the head of a horse', 'name': 'knight_(chess_piece)'}, {'frequency': 'r', 'id': 627, 'synset': 'knitting_needle.n.01', 'synonyms': ['knitting_needle'], 'def': 'needle consisting of a slender rod with pointed ends; usually used in pairs', 'name': 'knitting_needle'}, {'frequency': 'f', 'id': 628, 'synset': 'knob.n.02', 'synonyms': ['knob'], 'def': 'a round handle often found on a door', 'name': 'knob'}, {'frequency': 'r', 'id': 629, 'synset': 'knocker.n.05', 'synonyms': ['knocker_(on_a_door)', 'doorknocker'], 'def': 'a device (usually metal and ornamental) attached by a hinge to a door', 'name': 'knocker_(on_a_door)'}, {'frequency': 'r', 'id': 630, 'synset': 'koala.n.01', 'synonyms': ['koala', 'koala_bear'], 'def': 'sluggish tailless Australian marsupial with grey furry ears and coat', 'name': 'koala'}, {'frequency': 'r', 'id': 631, 'synset': 'lab_coat.n.01', 'synonyms': ['lab_coat', 'laboratory_coat'], 'def': 'a light coat worn to protect clothing from substances used while working in a laboratory', 'name': 'lab_coat'}, {'frequency': 'f', 'id': 632, 'synset': 'ladder.n.01', 'synonyms': ['ladder'], 'def': 'steps consisting of two parallel members connected by rungs', 'name': 'ladder'}, {'frequency': 'c', 'id': 633, 'synset': 'ladle.n.01', 'synonyms': ['ladle'], 'def': 'a spoon-shaped vessel with a long handle frequently used to transfer liquids', 'name': 'ladle'}, {'frequency': 'r', 'id': 634, 'synset': 'ladybug.n.01', 'synonyms': ['ladybug', 'ladybeetle', 'ladybird_beetle'], 'def': 'small round bright-colored and spotted beetle, typically red and black', 'name': 'ladybug'}, {'frequency': 'c', 'id': 635, 'synset': 'lamb.n.01', 'synonyms': ['lamb_(animal)'], 'def': 'young sheep', 'name': 'lamb_(animal)'}, {'frequency': 'r', 'id': 636, 'synset': 'lamb_chop.n.01', 'synonyms': ['lamb-chop', 'lambchop'], 'def': 'chop cut from a lamb', 'name': 'lamb-chop'}, {'frequency': 'f', 'id': 637, 'synset': 'lamp.n.02', 'synonyms': ['lamp'], 'def': 'a piece of furniture holding one or more electric light bulbs', 'name': 'lamp'}, {'frequency': 'f', 'id': 638, 'synset': 'lamppost.n.01', 'synonyms': ['lamppost'], 'def': 'a metal post supporting an outdoor lamp (such as a streetlight)', 'name': 'lamppost'}, {'frequency': 'f', 'id': 639, 'synset': 'lampshade.n.01', 'synonyms': ['lampshade'], 'def': 'a protective ornamental shade used to screen a light bulb from direct view', 'name': 'lampshade'}, {'frequency': 'c', 'id': 640, 'synset': 'lantern.n.01', 'synonyms': ['lantern'], 'def': 'light in a transparent protective case', 'name': 'lantern'}, {'frequency': 'f', 'id': 641, 'synset': 'lanyard.n.02', 'synonyms': ['lanyard', 'laniard'], 'def': 'a cord worn around the neck to hold a knife or whistle, etc.', 'name': 'lanyard'}, {'frequency': 'f', 'id': 642, 'synset': 'laptop.n.01', 'synonyms': ['laptop_computer', 'notebook_computer'], 'def': 'a portable computer small enough to use in your lap', 'name': 'laptop_computer'}, {'frequency': 'r', 'id': 643, 'synset': 'lasagna.n.01', 'synonyms': ['lasagna', 'lasagne'], 'def': 'baked dish of layers of lasagna pasta with sauce and cheese and meat or vegetables', 'name': 'lasagna'}, {'frequency': 'c', 'id': 644, 'synset': 'latch.n.02', 'synonyms': ['latch'], 'def': 'a bar that can be lowered or slid into a groove to fasten a door or gate', 'name': 'latch'}, {'frequency': 'r', 'id': 645, 'synset': 'lawn_mower.n.01', 'synonyms': ['lawn_mower'], 'def': 'garden tool for mowing grass on lawns', 'name': 'lawn_mower'}, {'frequency': 'r', 'id': 646, 'synset': 'leather.n.01', 'synonyms': ['leather'], 'def': 'an animal skin made smooth and flexible by removing the hair and then tanning', 'name': 'leather'}, {'frequency': 'c', 'id': 647, 'synset': 'legging.n.01', 'synonyms': ['legging_(clothing)', 'leging_(clothing)', 'leg_covering'], 'def': 'a garment covering the leg (usually extending from the knee to the ankle)', 'name': 'legging_(clothing)'}, {'frequency': 'c', 'id': 648, 'synset': 'lego.n.01', 'synonyms': ['Lego', 'Lego_set'], 'def': "a child's plastic construction set for making models from blocks", 'name': 'Lego'}, {'frequency': 'f', 'id': 649, 'synset': 'lemon.n.01', 'synonyms': ['lemon'], 'def': 'yellow oval fruit with juicy acidic flesh', 'name': 'lemon'}, {'frequency': 'r', 'id': 650, 'synset': 'lemonade.n.01', 'synonyms': ['lemonade'], 'def': 'sweetened beverage of diluted lemon juice', 'name': 'lemonade'}, {'frequency': 'f', 'id': 651, 'synset': 'lettuce.n.02', 'synonyms': ['lettuce'], 'def': 'leafy plant commonly eaten in salad or on sandwiches', 'name': 'lettuce'}, {'frequency': 'f', 'id': 652, 'synset': 'license_plate.n.01', 'synonyms': ['license_plate', 'numberplate'], 'def': "a plate mounted on the front and back of car and bearing the car's registration number", 'name': 'license_plate'}, {'frequency': 'f', 'id': 653, 'synset': 'life_buoy.n.01', 'synonyms': ['life_buoy', 'lifesaver', 'life_belt', 'life_ring'], 'def': 'a ring-shaped life preserver used to prevent drowning (NOT a life-jacket or vest)', 'name': 'life_buoy'}, {'frequency': 'f', 'id': 654, 'synset': 'life_jacket.n.01', 'synonyms': ['life_jacket', 'life_vest'], 'def': 'life preserver consisting of a sleeveless jacket of buoyant or inflatable design', 'name': 'life_jacket'}, {'frequency': 'f', 'id': 655, 'synset': 'light_bulb.n.01', 'synonyms': ['lightbulb'], 'def': 'glass bulb or tube shaped electric device that emits light (DO NOT MARK LAMPS AS A WHOLE)', 'name': 'lightbulb'}, {'frequency': 'r', 'id': 656, 'synset': 'lightning_rod.n.02', 'synonyms': ['lightning_rod', 'lightning_conductor'], 'def': 'a metallic conductor that is attached to a high point and leads to the ground', 'name': 'lightning_rod'}, {'frequency': 'c', 'id': 657, 'synset': 'lime.n.06', 'synonyms': ['lime'], 'def': 'the green acidic fruit of any of various lime trees', 'name': 'lime'}, {'frequency': 'r', 'id': 658, 'synset': 'limousine.n.01', 'synonyms': ['limousine'], 'def': 'long luxurious car; usually driven by a chauffeur', 'name': 'limousine'}, {'frequency': 'r', 'id': 659, 'synset': 'linen.n.02', 'synonyms': ['linen_paper'], 'def': 'a high-quality paper made of linen fibers or with a linen finish', 'name': 'linen_paper'}, {'frequency': 'c', 'id': 660, 'synset': 'lion.n.01', 'synonyms': ['lion'], 'def': 'large gregarious predatory cat of Africa and India', 'name': 'lion'}, {'frequency': 'c', 'id': 661, 'synset': 'lip_balm.n.01', 'synonyms': ['lip_balm'], 'def': 'a balm applied to the lips', 'name': 'lip_balm'}, {'frequency': 'c', 'id': 662, 'synset': 'lipstick.n.01', 'synonyms': ['lipstick', 'lip_rouge'], 'def': 'makeup that is used to color the lips', 'name': 'lipstick'}, {'frequency': 'r', 'id': 663, 'synset': 'liquor.n.01', 'synonyms': ['liquor', 'spirits', 'hard_liquor', 'liqueur', 'cordial'], 'def': 'an alcoholic beverage that is distilled rather than fermented', 'name': 'liquor'}, {'frequency': 'r', 'id': 664, 'synset': 'lizard.n.01', 'synonyms': ['lizard'], 'def': 'a reptile with usually two pairs of legs and a tapering tail', 'name': 'lizard'}, {'frequency': 'r', 'id': 665, 'synset': 'loafer.n.02', 'synonyms': ['Loafer_(type_of_shoe)'], 'def': 'a low leather step-in shoe', 'name': 'Loafer_(type_of_shoe)'}, {'frequency': 'f', 'id': 666, 'synset': 'log.n.01', 'synonyms': ['log'], 'def': 'a segment of the trunk of a tree when stripped of branches', 'name': 'log'}, {'frequency': 'c', 'id': 667, 'synset': 'lollipop.n.02', 'synonyms': ['lollipop'], 'def': 'hard candy on a stick', 'name': 'lollipop'}, {'frequency': 'c', 'id': 668, 'synset': 'lotion.n.01', 'synonyms': ['lotion'], 'def': 'any of various cosmetic preparations that are applied to the skin', 'name': 'lotion'}, {'frequency': 'f', 'id': 669, 'synset': 'loudspeaker.n.01', 'synonyms': ['speaker_(stero_equipment)'], 'def': 'electronic device that produces sound often as part of a stereo system', 'name': 'speaker_(stero_equipment)'}, {'frequency': 'c', 'id': 670, 'synset': 'love_seat.n.01', 'synonyms': ['loveseat'], 'def': 'small sofa that seats two people', 'name': 'loveseat'}, {'frequency': 'r', 'id': 671, 'synset': 'machine_gun.n.01', 'synonyms': ['machine_gun'], 'def': 'a rapidly firing automatic gun', 'name': 'machine_gun'}, {'frequency': 'f', 'id': 672, 'synset': 'magazine.n.02', 'synonyms': ['magazine'], 'def': 'a paperback periodic publication', 'name': 'magazine'}, {'frequency': 'f', 'id': 673, 'synset': 'magnet.n.01', 'synonyms': ['magnet'], 'def': 'a device that attracts iron and produces a magnetic field', 'name': 'magnet'}, {'frequency': 'r', 'id': 674, 'synset': 'mail_slot.n.01', 'synonyms': ['mail_slot'], 'def': 'a slot (usually in a door) through which mail can be delivered', 'name': 'mail_slot'}, {'frequency': 'c', 'id': 675, 'synset': 'mailbox.n.01', 'synonyms': ['mailbox_(at_home)', 'letter_box_(at_home)'], 'def': 'a private box for delivery of mail', 'name': 'mailbox_(at_home)'}, {'frequency': 'r', 'id': 676, 'synset': 'mallet.n.01', 'synonyms': ['mallet'], 'def': 'a sports implement with a long handle and a hammer-like head used to hit a ball', 'name': 'mallet'}, {'frequency': 'r', 'id': 677, 'synset': 'mammoth.n.01', 'synonyms': ['mammoth'], 'def': 'any of numerous extinct elephants widely distributed in the Pleistocene', 'name': 'mammoth'}, {'frequency': 'c', 'id': 678, 'synset': 'mandarin.n.05', 'synonyms': ['mandarin_orange'], 'def': 'a somewhat flat reddish-orange loose skinned citrus of China', 'name': 'mandarin_orange'}, {'frequency': 'c', 'id': 679, 'synset': 'manger.n.01', 'synonyms': ['manger', 'trough'], 'def': 'a container (usually in a barn or stable) from which cattle or horses feed', 'name': 'manger'}, {'frequency': 'f', 'id': 680, 'synset': 'manhole.n.01', 'synonyms': ['manhole'], 'def': 'a hole (usually with a flush cover) through which a person can gain access to an underground structure', 'name': 'manhole'}, {'frequency': 'c', 'id': 681, 'synset': 'map.n.01', 'synonyms': ['map'], 'def': "a diagrammatic representation of the earth's surface (or part of it)", 'name': 'map'}, {'frequency': 'c', 'id': 682, 'synset': 'marker.n.03', 'synonyms': ['marker'], 'def': 'a writing implement for making a mark', 'name': 'marker'}, {'frequency': 'r', 'id': 683, 'synset': 'martini.n.01', 'synonyms': ['martini'], 'def': 'a cocktail made of gin (or vodka) with dry vermouth', 'name': 'martini'}, {'frequency': 'r', 'id': 684, 'synset': 'mascot.n.01', 'synonyms': ['mascot'], 'def': 'a person or animal that is adopted by a team or other group as a symbolic figure', 'name': 'mascot'}, {'frequency': 'c', 'id': 685, 'synset': 'mashed_potato.n.01', 'synonyms': ['mashed_potato'], 'def': 'potato that has been peeled and boiled and then mashed', 'name': 'mashed_potato'}, {'frequency': 'r', 'id': 686, 'synset': 'masher.n.02', 'synonyms': ['masher'], 'def': 'a kitchen utensil used for mashing (e.g. potatoes)', 'name': 'masher'}, {'frequency': 'f', 'id': 687, 'synset': 'mask.n.04', 'synonyms': ['mask', 'facemask'], 'def': 'a protective covering worn over the face', 'name': 'mask'}, {'frequency': 'f', 'id': 688, 'synset': 'mast.n.01', 'synonyms': ['mast'], 'def': 'a vertical spar for supporting sails', 'name': 'mast'}, {'frequency': 'c', 'id': 689, 'synset': 'mat.n.03', 'synonyms': ['mat_(gym_equipment)', 'gym_mat'], 'def': 'sports equipment consisting of a piece of thick padding on the floor for gymnastics', 'name': 'mat_(gym_equipment)'}, {'frequency': 'r', 'id': 690, 'synset': 'matchbox.n.01', 'synonyms': ['matchbox'], 'def': 'a box for holding matches', 'name': 'matchbox'}, {'frequency': 'f', 'id': 691, 'synset': 'mattress.n.01', 'synonyms': ['mattress'], 'def': 'a thick pad filled with resilient material used as a bed or part of a bed', 'name': 'mattress'}, {'frequency': 'c', 'id': 692, 'synset': 'measuring_cup.n.01', 'synonyms': ['measuring_cup'], 'def': 'graduated cup used to measure liquid or granular ingredients', 'name': 'measuring_cup'}, {'frequency': 'c', 'id': 693, 'synset': 'measuring_stick.n.01', 'synonyms': ['measuring_stick', 'ruler_(measuring_stick)', 'measuring_rod'], 'def': 'measuring instrument having a sequence of marks at regular intervals', 'name': 'measuring_stick'}, {'frequency': 'c', 'id': 694, 'synset': 'meatball.n.01', 'synonyms': ['meatball'], 'def': 'ground meat formed into a ball and fried or simmered in broth', 'name': 'meatball'}, {'frequency': 'c', 'id': 695, 'synset': 'medicine.n.02', 'synonyms': ['medicine'], 'def': 'something that treats or prevents or alleviates the symptoms of disease', 'name': 'medicine'}, {'frequency': 'r', 'id': 696, 'synset': 'melon.n.01', 'synonyms': ['melon'], 'def': 'fruit of the gourd family having a hard rind and sweet juicy flesh', 'name': 'melon'}, {'frequency': 'f', 'id': 697, 'synset': 'microphone.n.01', 'synonyms': ['microphone'], 'def': 'device for converting sound waves into electrical energy', 'name': 'microphone'}, {'frequency': 'r', 'id': 698, 'synset': 'microscope.n.01', 'synonyms': ['microscope'], 'def': 'magnifier of the image of small objects', 'name': 'microscope'}, {'frequency': 'f', 'id': 699, 'synset': 'microwave.n.02', 'synonyms': ['microwave_oven'], 'def': 'kitchen appliance that cooks food by passing an electromagnetic wave through it', 'name': 'microwave_oven'}, {'frequency': 'r', 'id': 700, 'synset': 'milestone.n.01', 'synonyms': ['milestone', 'milepost'], 'def': 'stone post at side of a road to show distances', 'name': 'milestone'}, {'frequency': 'c', 'id': 701, 'synset': 'milk.n.01', 'synonyms': ['milk'], 'def': 'a white nutritious liquid secreted by mammals and used as food by human beings', 'name': 'milk'}, {'frequency': 'f', 'id': 702, 'synset': 'minivan.n.01', 'synonyms': ['minivan'], 'def': 'a small box-shaped passenger van', 'name': 'minivan'}, {'frequency': 'r', 'id': 703, 'synset': 'mint.n.05', 'synonyms': ['mint_candy'], 'def': 'a candy that is flavored with a mint oil', 'name': 'mint_candy'}, {'frequency': 'f', 'id': 704, 'synset': 'mirror.n.01', 'synonyms': ['mirror'], 'def': 'polished surface that forms images by reflecting light', 'name': 'mirror'}, {'frequency': 'c', 'id': 705, 'synset': 'mitten.n.01', 'synonyms': ['mitten'], 'def': 'glove that encases the thumb separately and the other four fingers together', 'name': 'mitten'}, {'frequency': 'c', 'id': 706, 'synset': 'mixer.n.04', 'synonyms': ['mixer_(kitchen_tool)', 'stand_mixer'], 'def': 'a kitchen utensil that is used for mixing foods', 'name': 'mixer_(kitchen_tool)'}, {'frequency': 'c', 'id': 707, 'synset': 'money.n.03', 'synonyms': ['money'], 'def': 'the official currency issued by a government or national bank', 'name': 'money'}, {'frequency': 'f', 'id': 708, 'synset': 'monitor.n.04', 'synonyms': ['monitor_(computer_equipment) computer_monitor'], 'def': 'a computer monitor', 'name': 'monitor_(computer_equipment) computer_monitor'}, {'frequency': 'c', 'id': 709, 'synset': 'monkey.n.01', 'synonyms': ['monkey'], 'def': 'any of various long-tailed primates', 'name': 'monkey'}, {'frequency': 'f', 'id': 710, 'synset': 'motor.n.01', 'synonyms': ['motor'], 'def': 'machine that converts other forms of energy into mechanical energy and so imparts motion', 'name': 'motor'}, {'frequency': 'f', 'id': 711, 'synset': 'motor_scooter.n.01', 'synonyms': ['motor_scooter', 'scooter'], 'def': 'a wheeled vehicle with small wheels and a low-powered engine', 'name': 'motor_scooter'}, {'frequency': 'r', 'id': 712, 'synset': 'motor_vehicle.n.01', 'synonyms': ['motor_vehicle', 'automotive_vehicle'], 'def': 'a self-propelled wheeled vehicle that does not run on rails', 'name': 'motor_vehicle'}, {'frequency': 'r', 'id': 713, 'synset': 'motorboat.n.01', 'synonyms': ['motorboat', 'powerboat'], 'def': 'a boat propelled by an internal-combustion engine', 'name': 'motorboat'}, {'frequency': 'f', 'id': 714, 'synset': 'motorcycle.n.01', 'synonyms': ['motorcycle'], 'def': 'a motor vehicle with two wheels and a strong frame', 'name': 'motorcycle'}, {'frequency': 'f', 'id': 715, 'synset': 'mound.n.01', 'synonyms': ['mound_(baseball)', "pitcher's_mound"], 'def': '(baseball) the slight elevation on which the pitcher stands', 'name': 'mound_(baseball)'}, {'frequency': 'r', 'id': 716, 'synset': 'mouse.n.01', 'synonyms': ['mouse_(animal_rodent)'], 'def': 'a small rodent with pointed snouts and small ears on elongated bodies with slender usually hairless tails', 'name': 'mouse_(animal_rodent)'}, {'frequency': 'f', 'id': 717, 'synset': 'mouse.n.04', 'synonyms': ['mouse_(computer_equipment)', 'computer_mouse'], 'def': 'a computer input device that controls an on-screen pointer', 'name': 'mouse_(computer_equipment)'}, {'frequency': 'f', 'id': 718, 'synset': 'mousepad.n.01', 'synonyms': ['mousepad'], 'def': 'a small portable pad that provides an operating surface for a computer mouse', 'name': 'mousepad'}, {'frequency': 'c', 'id': 719, 'synset': 'muffin.n.01', 'synonyms': ['muffin'], 'def': 'a sweet quick bread baked in a cup-shaped pan', 'name': 'muffin'}, {'frequency': 'f', 'id': 720, 'synset': 'mug.n.04', 'synonyms': ['mug'], 'def': 'with handle and usually cylindrical', 'name': 'mug'}, {'frequency': 'f', 'id': 721, 'synset': 'mushroom.n.02', 'synonyms': ['mushroom'], 'def': 'a common mushroom', 'name': 'mushroom'}, {'frequency': 'r', 'id': 722, 'synset': 'music_stool.n.01', 'synonyms': ['music_stool', 'piano_stool'], 'def': 'a stool for piano players; usually adjustable in height', 'name': 'music_stool'}, {'frequency': 'r', 'id': 723, 'synset': 'musical_instrument.n.01', 'synonyms': ['musical_instrument', 'instrument_(musical)'], 'def': 'any of various devices or contrivances that can be used to produce musical tones or sounds', 'name': 'musical_instrument'}, {'frequency': 'r', 'id': 724, 'synset': 'nailfile.n.01', 'synonyms': ['nailfile'], 'def': 'a small flat file for shaping the nails', 'name': 'nailfile'}, {'frequency': 'r', 'id': 725, 'synset': 'nameplate.n.01', 'synonyms': ['nameplate'], 'def': 'a plate bearing a name', 'name': 'nameplate'}, {'frequency': 'f', 'id': 726, 'synset': 'napkin.n.01', 'synonyms': ['napkin', 'table_napkin', 'serviette'], 'def': 'a small piece of table linen or paper that is used to wipe the mouth and to cover the lap in order to protect clothing', 'name': 'napkin'}, {'frequency': 'r', 'id': 727, 'synset': 'neckerchief.n.01', 'synonyms': ['neckerchief'], 'def': 'a kerchief worn around the neck', 'name': 'neckerchief'}, {'frequency': 'f', 'id': 728, 'synset': 'necklace.n.01', 'synonyms': ['necklace'], 'def': 'jewelry consisting of a cord or chain (often bearing gems) worn about the neck as an ornament', 'name': 'necklace'}, {'frequency': 'f', 'id': 729, 'synset': 'necktie.n.01', 'synonyms': ['necktie', 'tie_(necktie)'], 'def': 'neckwear consisting of a long narrow piece of material worn under a collar and tied in knot at the front', 'name': 'necktie'}, {'frequency': 'r', 'id': 730, 'synset': 'needle.n.03', 'synonyms': ['needle'], 'def': 'a sharp pointed implement (usually metal)', 'name': 'needle'}, {'frequency': 'c', 'id': 731, 'synset': 'nest.n.01', 'synonyms': ['nest'], 'def': 'a structure in which animals lay eggs or give birth to their young', 'name': 'nest'}, {'frequency': 'r', 'id': 732, 'synset': 'newsstand.n.01', 'synonyms': ['newsstand'], 'def': 'a stall where newspapers and other periodicals are sold', 'name': 'newsstand'}, {'frequency': 'c', 'id': 733, 'synset': 'nightwear.n.01', 'synonyms': ['nightshirt', 'nightwear', 'sleepwear', 'nightclothes'], 'def': 'garments designed to be worn in bed', 'name': 'nightshirt'}, {'frequency': 'r', 'id': 734, 'synset': 'nosebag.n.01', 'synonyms': ['nosebag_(for_animals)', 'feedbag'], 'def': 'a canvas bag that is used to feed an animal (such as a horse); covers the muzzle and fastens at the top of the head', 'name': 'nosebag_(for_animals)'}, {'frequency': 'r', 'id': 735, 'synset': 'noseband.n.01', 'synonyms': ['noseband_(for_animals)', 'nosepiece_(for_animals)'], 'def': "a strap that is the part of a bridle that goes over the animal's nose", 'name': 'noseband_(for_animals)'}, {'frequency': 'f', 'id': 736, 'synset': 'notebook.n.01', 'synonyms': ['notebook'], 'def': 'a book with blank pages for recording notes or memoranda', 'name': 'notebook'}, {'frequency': 'c', 'id': 737, 'synset': 'notepad.n.01', 'synonyms': ['notepad'], 'def': 'a pad of paper for keeping notes', 'name': 'notepad'}, {'frequency': 'c', 'id': 738, 'synset': 'nut.n.03', 'synonyms': ['nut'], 'def': 'a small metal block (usually square or hexagonal) with internal screw thread to be fitted onto a bolt', 'name': 'nut'}, {'frequency': 'r', 'id': 739, 'synset': 'nutcracker.n.01', 'synonyms': ['nutcracker'], 'def': 'a hand tool used to crack nuts open', 'name': 'nutcracker'}, {'frequency': 'c', 'id': 740, 'synset': 'oar.n.01', 'synonyms': ['oar'], 'def': 'an implement used to propel or steer a boat', 'name': 'oar'}, {'frequency': 'r', 'id': 741, 'synset': 'octopus.n.01', 'synonyms': ['octopus_(food)'], 'def': 'tentacles of octopus prepared as food', 'name': 'octopus_(food)'}, {'frequency': 'r', 'id': 742, 'synset': 'octopus.n.02', 'synonyms': ['octopus_(animal)'], 'def': 'bottom-living cephalopod having a soft oval body with eight long tentacles', 'name': 'octopus_(animal)'}, {'frequency': 'c', 'id': 743, 'synset': 'oil_lamp.n.01', 'synonyms': ['oil_lamp', 'kerosene_lamp', 'kerosine_lamp'], 'def': 'a lamp that burns oil (as kerosine) for light', 'name': 'oil_lamp'}, {'frequency': 'c', 'id': 744, 'synset': 'olive_oil.n.01', 'synonyms': ['olive_oil'], 'def': 'oil from olives', 'name': 'olive_oil'}, {'frequency': 'r', 'id': 745, 'synset': 'omelet.n.01', 'synonyms': ['omelet', 'omelette'], 'def': 'beaten eggs cooked until just set; may be folded around e.g. ham or cheese or jelly', 'name': 'omelet'}, {'frequency': 'f', 'id': 746, 'synset': 'onion.n.01', 'synonyms': ['onion'], 'def': 'the bulb of an onion plant', 'name': 'onion'}, {'frequency': 'f', 'id': 747, 'synset': 'orange.n.01', 'synonyms': ['orange_(fruit)'], 'def': 'orange (FRUIT of an orange tree)', 'name': 'orange_(fruit)'}, {'frequency': 'c', 'id': 748, 'synset': 'orange_juice.n.01', 'synonyms': ['orange_juice'], 'def': 'bottled or freshly squeezed juice of oranges', 'name': 'orange_juice'}, {'frequency': 'r', 'id': 749, 'synset': 'oregano.n.01', 'synonyms': ['oregano', 'marjoram'], 'def': 'aromatic Eurasian perennial herb used in cooking and baking', 'name': 'oregano'}, {'frequency': 'c', 'id': 750, 'synset': 'ostrich.n.02', 'synonyms': ['ostrich'], 'def': 'fast-running African flightless bird with two-toed feet; largest living bird', 'name': 'ostrich'}, {'frequency': 'c', 'id': 751, 'synset': 'ottoman.n.03', 'synonyms': ['ottoman', 'pouf', 'pouffe', 'hassock'], 'def': 'thick cushion used as a seat', 'name': 'ottoman'}, {'frequency': 'c', 'id': 752, 'synset': 'overall.n.01', 'synonyms': ['overalls_(clothing)'], 'def': 'work clothing consisting of denim trousers usually with a bib and shoulder straps', 'name': 'overalls_(clothing)'}, {'frequency': 'c', 'id': 753, 'synset': 'owl.n.01', 'synonyms': ['owl'], 'def': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes', 'name': 'owl'}, {'frequency': 'c', 'id': 754, 'synset': 'packet.n.03', 'synonyms': ['packet'], 'def': 'a small package or bundle', 'name': 'packet'}, {'frequency': 'r', 'id': 755, 'synset': 'pad.n.03', 'synonyms': ['inkpad', 'inking_pad', 'stamp_pad'], 'def': 'absorbent material saturated with ink used to transfer ink evenly to a rubber stamp', 'name': 'inkpad'}, {'frequency': 'c', 'id': 756, 'synset': 'pad.n.04', 'synonyms': ['pad'], 'def': 'a flat mass of soft material used for protection, stuffing, or comfort', 'name': 'pad'}, {'frequency': 'c', 'id': 757, 'synset': 'paddle.n.04', 'synonyms': ['paddle', 'boat_paddle'], 'def': 'a short light oar used without an oarlock to propel a canoe or small boat', 'name': 'paddle'}, {'frequency': 'c', 'id': 758, 'synset': 'padlock.n.01', 'synonyms': ['padlock'], 'def': 'a detachable, portable lock', 'name': 'padlock'}, {'frequency': 'r', 'id': 759, 'synset': 'paintbox.n.01', 'synonyms': ['paintbox'], 'def': "a box containing a collection of cubes or tubes of artists' paint", 'name': 'paintbox'}, {'frequency': 'c', 'id': 760, 'synset': 'paintbrush.n.01', 'synonyms': ['paintbrush'], 'def': 'a brush used as an applicator to apply paint', 'name': 'paintbrush'}, {'frequency': 'f', 'id': 761, 'synset': 'painting.n.01', 'synonyms': ['painting'], 'def': 'graphic art consisting of an artistic composition made by applying paints to a surface', 'name': 'painting'}, {'frequency': 'c', 'id': 762, 'synset': 'pajama.n.02', 'synonyms': ['pajamas', 'pyjamas'], 'def': 'loose-fitting nightclothes worn for sleeping or lounging', 'name': 'pajamas'}, {'frequency': 'c', 'id': 763, 'synset': 'palette.n.02', 'synonyms': ['palette', 'pallet'], 'def': 'board that provides a flat surface on which artists mix paints and the range of colors used', 'name': 'palette'}, {'frequency': 'f', 'id': 764, 'synset': 'pan.n.01', 'synonyms': ['pan_(for_cooking)', 'cooking_pan'], 'def': 'cooking utensil consisting of a wide metal vessel', 'name': 'pan_(for_cooking)'}, {'frequency': 'r', 'id': 765, 'synset': 'pan.n.03', 'synonyms': ['pan_(metal_container)'], 'def': 'shallow container made of metal', 'name': 'pan_(metal_container)'}, {'frequency': 'c', 'id': 766, 'synset': 'pancake.n.01', 'synonyms': ['pancake'], 'def': 'a flat cake of thin batter fried on both sides on a griddle', 'name': 'pancake'}, {'frequency': 'r', 'id': 767, 'synset': 'pantyhose.n.01', 'synonyms': ['pantyhose'], 'def': "a woman's tights consisting of underpants and stockings", 'name': 'pantyhose'}, {'frequency': 'r', 'id': 768, 'synset': 'papaya.n.02', 'synonyms': ['papaya'], 'def': 'large oval melon-like tropical fruit with yellowish flesh', 'name': 'papaya'}, {'frequency': 'r', 'id': 769, 'synset': 'paper_clip.n.01', 'synonyms': ['paperclip'], 'def': 'a wire or plastic clip for holding sheets of paper together', 'name': 'paperclip'}, {'frequency': 'f', 'id': 770, 'synset': 'paper_plate.n.01', 'synonyms': ['paper_plate'], 'def': 'a disposable plate made of cardboard', 'name': 'paper_plate'}, {'frequency': 'f', 'id': 771, 'synset': 'paper_towel.n.01', 'synonyms': ['paper_towel'], 'def': 'a disposable towel made of absorbent paper', 'name': 'paper_towel'}, {'frequency': 'r', 'id': 772, 'synset': 'paperback_book.n.01', 'synonyms': ['paperback_book', 'paper-back_book', 'softback_book', 'soft-cover_book'], 'def': 'a book with paper covers', 'name': 'paperback_book'}, {'frequency': 'r', 'id': 773, 'synset': 'paperweight.n.01', 'synonyms': ['paperweight'], 'def': 'a weight used to hold down a stack of papers', 'name': 'paperweight'}, {'frequency': 'c', 'id': 774, 'synset': 'parachute.n.01', 'synonyms': ['parachute'], 'def': 'rescue equipment consisting of a device that fills with air and retards your fall', 'name': 'parachute'}, {'frequency': 'r', 'id': 775, 'synset': 'parakeet.n.01', 'synonyms': ['parakeet', 'parrakeet', 'parroket', 'paraquet', 'paroquet', 'parroquet'], 'def': 'any of numerous small slender long-tailed parrots', 'name': 'parakeet'}, {'frequency': 'c', 'id': 776, 'synset': 'parasail.n.01', 'synonyms': ['parasail_(sports)'], 'def': 'parachute that will lift a person up into the air when it is towed by a motorboat or a car', 'name': 'parasail_(sports)'}, {'frequency': 'r', 'id': 777, 'synset': 'parchment.n.01', 'synonyms': ['parchment'], 'def': 'a superior paper resembling sheepskin', 'name': 'parchment'}, {'frequency': 'r', 'id': 778, 'synset': 'parka.n.01', 'synonyms': ['parka', 'anorak'], 'def': "a kind of heavy jacket (`windcheater' is a British term)", 'name': 'parka'}, {'frequency': 'f', 'id': 779, 'synset': 'parking_meter.n.01', 'synonyms': ['parking_meter'], 'def': 'a coin-operated timer located next to a parking space', 'name': 'parking_meter'}, {'frequency': 'c', 'id': 780, 'synset': 'parrot.n.01', 'synonyms': ['parrot'], 'def': 'usually brightly colored tropical birds with short hooked beaks and the ability to mimic sounds', 'name': 'parrot'}, {'frequency': 'c', 'id': 781, 'synset': 'passenger_car.n.01', 'synonyms': ['passenger_car_(part_of_a_train)', 'coach_(part_of_a_train)'], 'def': 'a railcar where passengers ride', 'name': 'passenger_car_(part_of_a_train)'}, {'frequency': 'r', 'id': 782, 'synset': 'passenger_ship.n.01', 'synonyms': ['passenger_ship'], 'def': 'a ship built to carry passengers', 'name': 'passenger_ship'}, {'frequency': 'r', 'id': 783, 'synset': 'passport.n.02', 'synonyms': ['passport'], 'def': 'a document issued by a country to a citizen allowing that person to travel abroad and re-enter the home country', 'name': 'passport'}, {'frequency': 'f', 'id': 784, 'synset': 'pastry.n.02', 'synonyms': ['pastry'], 'def': 'any of various baked foods made of dough or batter', 'name': 'pastry'}, {'frequency': 'r', 'id': 785, 'synset': 'patty.n.01', 'synonyms': ['patty_(food)'], 'def': 'small flat mass of chopped food', 'name': 'patty_(food)'}, {'frequency': 'c', 'id': 786, 'synset': 'pea.n.01', 'synonyms': ['pea_(food)'], 'def': 'seed of a pea plant used for food', 'name': 'pea_(food)'}, {'frequency': 'c', 'id': 787, 'synset': 'peach.n.03', 'synonyms': ['peach'], 'def': 'downy juicy fruit with sweet yellowish or whitish flesh', 'name': 'peach'}, {'frequency': 'c', 'id': 788, 'synset': 'peanut_butter.n.01', 'synonyms': ['peanut_butter'], 'def': 'a spread made from ground peanuts', 'name': 'peanut_butter'}, {'frequency': 'c', 'id': 789, 'synset': 'pear.n.01', 'synonyms': ['pear'], 'def': 'sweet juicy gritty-textured fruit available in many varieties', 'name': 'pear'}, {'frequency': 'r', 'id': 790, 'synset': 'peeler.n.03', 'synonyms': ['peeler_(tool_for_fruit_and_vegetables)'], 'def': 'a device for peeling vegetables or fruits', 'name': 'peeler_(tool_for_fruit_and_vegetables)'}, {'frequency': 'r', 'id': 791, 'synset': 'pegboard.n.01', 'synonyms': ['pegboard'], 'def': 'a board perforated with regularly spaced holes into which pegs can be fitted', 'name': 'pegboard'}, {'frequency': 'c', 'id': 792, 'synset': 'pelican.n.01', 'synonyms': ['pelican'], 'def': 'large long-winged warm-water seabird having a large bill with a distensible pouch for fish', 'name': 'pelican'}, {'frequency': 'f', 'id': 793, 'synset': 'pen.n.01', 'synonyms': ['pen'], 'def': 'a writing implement with a point from which ink flows', 'name': 'pen'}, {'frequency': 'c', 'id': 794, 'synset': 'pencil.n.01', 'synonyms': ['pencil'], 'def': 'a thin cylindrical pointed writing implement made of wood and graphite', 'name': 'pencil'}, {'frequency': 'r', 'id': 795, 'synset': 'pencil_box.n.01', 'synonyms': ['pencil_box', 'pencil_case'], 'def': 'a box for holding pencils', 'name': 'pencil_box'}, {'frequency': 'r', 'id': 796, 'synset': 'pencil_sharpener.n.01', 'synonyms': ['pencil_sharpener'], 'def': 'a rotary implement for sharpening the point on pencils', 'name': 'pencil_sharpener'}, {'frequency': 'r', 'id': 797, 'synset': 'pendulum.n.01', 'synonyms': ['pendulum'], 'def': 'an apparatus consisting of an object mounted so that it swings freely under the influence of gravity', 'name': 'pendulum'}, {'frequency': 'c', 'id': 798, 'synset': 'penguin.n.01', 'synonyms': ['penguin'], 'def': 'short-legged flightless birds of cold southern regions having webbed feet and wings modified as flippers', 'name': 'penguin'}, {'frequency': 'r', 'id': 799, 'synset': 'pennant.n.02', 'synonyms': ['pennant'], 'def': 'a flag longer than it is wide (and often tapering)', 'name': 'pennant'}, {'frequency': 'r', 'id': 800, 'synset': 'penny.n.02', 'synonyms': ['penny_(coin)'], 'def': 'a coin worth one-hundredth of the value of the basic unit', 'name': 'penny_(coin)'}, {'frequency': 'c', 'id': 801, 'synset': 'pepper.n.03', 'synonyms': ['pepper', 'peppercorn'], 'def': 'pungent seasoning from the berry of the common pepper plant; whole or ground', 'name': 'pepper'}, {'frequency': 'c', 'id': 802, 'synset': 'pepper_mill.n.01', 'synonyms': ['pepper_mill', 'pepper_grinder'], 'def': 'a mill for grinding pepper', 'name': 'pepper_mill'}, {'frequency': 'c', 'id': 803, 'synset': 'perfume.n.02', 'synonyms': ['perfume'], 'def': 'a toiletry that emits and diffuses a fragrant odor', 'name': 'perfume'}, {'frequency': 'r', 'id': 804, 'synset': 'persimmon.n.02', 'synonyms': ['persimmon'], 'def': 'orange fruit resembling a plum; edible when fully ripe', 'name': 'persimmon'}, {'frequency': 'f', 'id': 805, 'synset': 'person.n.01', 'synonyms': ['baby', 'child', 'boy', 'girl', 'man', 'woman', 'person', 'human'], 'def': 'a human being', 'name': 'baby'}, {'frequency': 'r', 'id': 806, 'synset': 'pet.n.01', 'synonyms': ['pet'], 'def': 'a domesticated animal kept for companionship or amusement', 'name': 'pet'}, {'frequency': 'r', 'id': 807, 'synset': 'petfood.n.01', 'synonyms': ['petfood', 'pet-food'], 'def': 'food prepared for animal pets', 'name': 'petfood'}, {'frequency': 'r', 'id': 808, 'synset': 'pew.n.01', 'synonyms': ['pew_(church_bench)', 'church_bench'], 'def': 'long bench with backs; used in church by the congregation', 'name': 'pew_(church_bench)'}, {'frequency': 'r', 'id': 809, 'synset': 'phonebook.n.01', 'synonyms': ['phonebook', 'telephone_book', 'telephone_directory'], 'def': 'a directory containing an alphabetical list of telephone subscribers and their telephone numbers', 'name': 'phonebook'}, {'frequency': 'c', 'id': 810, 'synset': 'phonograph_record.n.01', 'synonyms': ['phonograph_record', 'phonograph_recording', 'record_(phonograph_recording)'], 'def': 'sound recording consisting of a typically black disk with a continuous groove', 'name': 'phonograph_record'}, {'frequency': 'c', 'id': 811, 'synset': 'piano.n.01', 'synonyms': ['piano'], 'def': 'a keyboard instrument that is played by depressing keys that cause hammers to strike tuned strings and produce sounds', 'name': 'piano'}, {'frequency': 'f', 'id': 812, 'synset': 'pickle.n.01', 'synonyms': ['pickle'], 'def': 'vegetables (especially cucumbers) preserved in brine or vinegar', 'name': 'pickle'}, {'frequency': 'f', 'id': 813, 'synset': 'pickup.n.01', 'synonyms': ['pickup_truck'], 'def': 'a light truck with an open body and low sides and a tailboard', 'name': 'pickup_truck'}, {'frequency': 'c', 'id': 814, 'synset': 'pie.n.01', 'synonyms': ['pie'], 'def': 'dish baked in pastry-lined pan often with a pastry top', 'name': 'pie'}, {'frequency': 'c', 'id': 815, 'synset': 'pigeon.n.01', 'synonyms': ['pigeon'], 'def': 'wild and domesticated birds having a heavy body and short legs', 'name': 'pigeon'}, {'frequency': 'r', 'id': 816, 'synset': 'piggy_bank.n.01', 'synonyms': ['piggy_bank', 'penny_bank'], 'def': "a child's coin bank (often shaped like a pig)", 'name': 'piggy_bank'}, {'frequency': 'f', 'id': 817, 'synset': 'pillow.n.01', 'synonyms': ['pillow'], 'def': 'a cushion to support the head of a sleeping person', 'name': 'pillow'}, {'frequency': 'r', 'id': 818, 'synset': 'pin.n.09', 'synonyms': ['pin_(non_jewelry)'], 'def': 'a small slender (often pointed) piece of wood or metal used to support or fasten or attach things', 'name': 'pin_(non_jewelry)'}, {'frequency': 'f', 'id': 819, 'synset': 'pineapple.n.02', 'synonyms': ['pineapple'], 'def': 'large sweet fleshy tropical fruit with a tuft of stiff leaves', 'name': 'pineapple'}, {'frequency': 'c', 'id': 820, 'synset': 'pinecone.n.01', 'synonyms': ['pinecone'], 'def': 'the seed-producing cone of a pine tree', 'name': 'pinecone'}, {'frequency': 'r', 'id': 821, 'synset': 'ping-pong_ball.n.01', 'synonyms': ['ping-pong_ball'], 'def': 'light hollow ball used in playing table tennis', 'name': 'ping-pong_ball'}, {'frequency': 'r', 'id': 822, 'synset': 'pinwheel.n.03', 'synonyms': ['pinwheel'], 'def': 'a toy consisting of vanes of colored paper or plastic that is pinned to a stick and spins when it is pointed into the wind', 'name': 'pinwheel'}, {'frequency': 'r', 'id': 823, 'synset': 'pipe.n.01', 'synonyms': ['tobacco_pipe'], 'def': 'a tube with a small bowl at one end; used for smoking tobacco', 'name': 'tobacco_pipe'}, {'frequency': 'f', 'id': 824, 'synset': 'pipe.n.02', 'synonyms': ['pipe', 'piping'], 'def': 'a long tube made of metal or plastic that is used to carry water or oil or gas etc.', 'name': 'pipe'}, {'frequency': 'r', 'id': 825, 'synset': 'pistol.n.01', 'synonyms': ['pistol', 'handgun'], 'def': 'a firearm that is held and fired with one hand', 'name': 'pistol'}, {'frequency': 'r', 'id': 826, 'synset': 'pita.n.01', 'synonyms': ['pita_(bread)', 'pocket_bread'], 'def': 'usually small round bread that can open into a pocket for filling', 'name': 'pita_(bread)'}, {'frequency': 'f', 'id': 827, 'synset': 'pitcher.n.02', 'synonyms': ['pitcher_(vessel_for_liquid)', 'ewer'], 'def': 'an open vessel with a handle and a spout for pouring', 'name': 'pitcher_(vessel_for_liquid)'}, {'frequency': 'r', 'id': 828, 'synset': 'pitchfork.n.01', 'synonyms': ['pitchfork'], 'def': 'a long-handled hand tool with sharp widely spaced prongs for lifting and pitching hay', 'name': 'pitchfork'}, {'frequency': 'f', 'id': 829, 'synset': 'pizza.n.01', 'synonyms': ['pizza'], 'def': 'Italian open pie made of thin bread dough spread with a spiced mixture of e.g. tomato sauce and cheese', 'name': 'pizza'}, {'frequency': 'f', 'id': 830, 'synset': 'place_mat.n.01', 'synonyms': ['place_mat'], 'def': 'a mat placed on a table for an individual place setting', 'name': 'place_mat'}, {'frequency': 'f', 'id': 831, 'synset': 'plate.n.04', 'synonyms': ['plate'], 'def': 'dish on which food is served or from which food is eaten', 'name': 'plate'}, {'frequency': 'c', 'id': 832, 'synset': 'platter.n.01', 'synonyms': ['platter'], 'def': 'a large shallow dish used for serving food', 'name': 'platter'}, {'frequency': 'r', 'id': 833, 'synset': 'playing_card.n.01', 'synonyms': ['playing_card'], 'def': 'one of a pack of cards that are used to play card games', 'name': 'playing_card'}, {'frequency': 'r', 'id': 834, 'synset': 'playpen.n.01', 'synonyms': ['playpen'], 'def': 'a portable enclosure in which babies may be left to play', 'name': 'playpen'}, {'frequency': 'c', 'id': 835, 'synset': 'pliers.n.01', 'synonyms': ['pliers', 'plyers'], 'def': 'a gripping hand tool with two hinged arms and (usually) serrated jaws', 'name': 'pliers'}, {'frequency': 'r', 'id': 836, 'synset': 'plow.n.01', 'synonyms': ['plow_(farm_equipment)', 'plough_(farm_equipment)'], 'def': 'a farm tool having one or more heavy blades to break the soil and cut a furrow prior to sowing', 'name': 'plow_(farm_equipment)'}, {'frequency': 'r', 'id': 837, 'synset': 'pocket_watch.n.01', 'synonyms': ['pocket_watch'], 'def': 'a watch that is carried in a small watch pocket', 'name': 'pocket_watch'}, {'frequency': 'c', 'id': 838, 'synset': 'pocketknife.n.01', 'synonyms': ['pocketknife'], 'def': 'a knife with a blade that folds into the handle; suitable for carrying in the pocket', 'name': 'pocketknife'}, {'frequency': 'c', 'id': 839, 'synset': 'poker.n.01', 'synonyms': ['poker_(fire_stirring_tool)', 'stove_poker', 'fire_hook'], 'def': 'fire iron consisting of a metal rod with a handle; used to stir a fire', 'name': 'poker_(fire_stirring_tool)'}, {'frequency': 'f', 'id': 840, 'synset': 'pole.n.01', 'synonyms': ['pole', 'post'], 'def': 'a long (usually round) rod of wood or metal or plastic', 'name': 'pole'}, {'frequency': 'r', 'id': 841, 'synset': 'police_van.n.01', 'synonyms': ['police_van', 'police_wagon', 'paddy_wagon', 'patrol_wagon'], 'def': 'van used by police to transport prisoners', 'name': 'police_van'}, {'frequency': 'f', 'id': 842, 'synset': 'polo_shirt.n.01', 'synonyms': ['polo_shirt', 'sport_shirt'], 'def': 'a shirt with short sleeves designed for comfort and casual wear', 'name': 'polo_shirt'}, {'frequency': 'r', 'id': 843, 'synset': 'poncho.n.01', 'synonyms': ['poncho'], 'def': 'a blanket-like cloak with a hole in the center for the head', 'name': 'poncho'}, {'frequency': 'c', 'id': 844, 'synset': 'pony.n.05', 'synonyms': ['pony'], 'def': 'any of various breeds of small gentle horses usually less than five feet high at the shoulder', 'name': 'pony'}, {'frequency': 'r', 'id': 845, 'synset': 'pool_table.n.01', 'synonyms': ['pool_table', 'billiard_table', 'snooker_table'], 'def': 'game equipment consisting of a heavy table on which pool is played', 'name': 'pool_table'}, {'frequency': 'f', 'id': 846, 'synset': 'pop.n.02', 'synonyms': ['pop_(soda)', 'soda_(pop)', 'tonic', 'soft_drink'], 'def': 'a sweet drink containing carbonated water and flavoring', 'name': 'pop_(soda)'}, {'frequency': 'r', 'id': 847, 'synset': 'portrait.n.02', 'synonyms': ['portrait', 'portrayal'], 'def': 'any likeness of a person, in any medium', 'name': 'portrait'}, {'frequency': 'c', 'id': 848, 'synset': 'postbox.n.01', 'synonyms': ['postbox_(public)', 'mailbox_(public)'], 'def': 'public box for deposit of mail', 'name': 'postbox_(public)'}, {'frequency': 'c', 'id': 849, 'synset': 'postcard.n.01', 'synonyms': ['postcard', 'postal_card', 'mailing-card'], 'def': 'a card for sending messages by post without an envelope', 'name': 'postcard'}, {'frequency': 'f', 'id': 850, 'synset': 'poster.n.01', 'synonyms': ['poster', 'placard'], 'def': 'a sign posted in a public place as an advertisement', 'name': 'poster'}, {'frequency': 'f', 'id': 851, 'synset': 'pot.n.01', 'synonyms': ['pot'], 'def': 'metal or earthenware cooking vessel that is usually round and deep; often has a handle and lid', 'name': 'pot'}, {'frequency': 'f', 'id': 852, 'synset': 'pot.n.04', 'synonyms': ['flowerpot'], 'def': 'a container in which plants are cultivated', 'name': 'flowerpot'}, {'frequency': 'f', 'id': 853, 'synset': 'potato.n.01', 'synonyms': ['potato'], 'def': 'an edible tuber native to South America', 'name': 'potato'}, {'frequency': 'c', 'id': 854, 'synset': 'potholder.n.01', 'synonyms': ['potholder'], 'def': 'an insulated pad for holding hot pots', 'name': 'potholder'}, {'frequency': 'c', 'id': 855, 'synset': 'pottery.n.01', 'synonyms': ['pottery', 'clayware'], 'def': 'ceramic ware made from clay and baked in a kiln', 'name': 'pottery'}, {'frequency': 'c', 'id': 856, 'synset': 'pouch.n.01', 'synonyms': ['pouch'], 'def': 'a small or medium size container for holding or carrying things', 'name': 'pouch'}, {'frequency': 'r', 'id': 857, 'synset': 'power_shovel.n.01', 'synonyms': ['power_shovel', 'excavator', 'digger'], 'def': 'a machine for excavating', 'name': 'power_shovel'}, {'frequency': 'c', 'id': 858, 'synset': 'prawn.n.01', 'synonyms': ['prawn', 'shrimp'], 'def': 'any of various edible decapod crustaceans', 'name': 'prawn'}, {'frequency': 'f', 'id': 859, 'synset': 'printer.n.03', 'synonyms': ['printer', 'printing_machine'], 'def': 'a machine that prints', 'name': 'printer'}, {'frequency': 'c', 'id': 860, 'synset': 'projectile.n.01', 'synonyms': ['projectile_(weapon)', 'missile'], 'def': 'a weapon that is forcibly thrown or projected at a targets', 'name': 'projectile_(weapon)'}, {'frequency': 'c', 'id': 861, 'synset': 'projector.n.02', 'synonyms': ['projector'], 'def': 'an optical instrument that projects an enlarged image onto a screen', 'name': 'projector'}, {'frequency': 'f', 'id': 862, 'synset': 'propeller.n.01', 'synonyms': ['propeller', 'propellor'], 'def': 'a mechanical device that rotates to push against air or water', 'name': 'propeller'}, {'frequency': 'r', 'id': 863, 'synset': 'prune.n.01', 'synonyms': ['prune'], 'def': 'dried plum', 'name': 'prune'}, {'frequency': 'r', 'id': 864, 'synset': 'pudding.n.01', 'synonyms': ['pudding'], 'def': 'any of various soft thick unsweetened baked dishes', 'name': 'pudding'}, {'frequency': 'r', 'id': 865, 'synset': 'puffer.n.02', 'synonyms': ['puffer_(fish)', 'pufferfish', 'blowfish', 'globefish'], 'def': 'fishes whose elongated spiny body can inflate itself with water or air to form a globe', 'name': 'puffer_(fish)'}, {'frequency': 'r', 'id': 866, 'synset': 'puffin.n.01', 'synonyms': ['puffin'], 'def': 'seabirds having short necks and brightly colored compressed bills', 'name': 'puffin'}, {'frequency': 'r', 'id': 867, 'synset': 'pug.n.01', 'synonyms': ['pug-dog'], 'def': 'small compact smooth-coated breed of Asiatic origin having a tightly curled tail and broad flat wrinkled muzzle', 'name': 'pug-dog'}, {'frequency': 'c', 'id': 868, 'synset': 'pumpkin.n.02', 'synonyms': ['pumpkin'], 'def': 'usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn', 'name': 'pumpkin'}, {'frequency': 'r', 'id': 869, 'synset': 'punch.n.03', 'synonyms': ['puncher'], 'def': 'a tool for making holes or indentations', 'name': 'puncher'}, {'frequency': 'r', 'id': 870, 'synset': 'puppet.n.01', 'synonyms': ['puppet', 'marionette'], 'def': 'a small figure of a person operated from above with strings by a puppeteer', 'name': 'puppet'}, {'frequency': 'r', 'id': 871, 'synset': 'puppy.n.01', 'synonyms': ['puppy'], 'def': 'a young dog', 'name': 'puppy'}, {'frequency': 'r', 'id': 872, 'synset': 'quesadilla.n.01', 'synonyms': ['quesadilla'], 'def': 'a tortilla that is filled with cheese and heated', 'name': 'quesadilla'}, {'frequency': 'r', 'id': 873, 'synset': 'quiche.n.02', 'synonyms': ['quiche'], 'def': 'a tart filled with rich unsweetened custard; often contains other ingredients (as cheese or ham or seafood or vegetables)', 'name': 'quiche'}, {'frequency': 'f', 'id': 874, 'synset': 'quilt.n.01', 'synonyms': ['quilt', 'comforter'], 'def': 'bedding made of two layers of cloth filled with stuffing and stitched together', 'name': 'quilt'}, {'frequency': 'c', 'id': 875, 'synset': 'rabbit.n.01', 'synonyms': ['rabbit'], 'def': 'any of various burrowing animals of the family Leporidae having long ears and short tails', 'name': 'rabbit'}, {'frequency': 'r', 'id': 876, 'synset': 'racer.n.02', 'synonyms': ['race_car', 'racing_car'], 'def': 'a fast car that competes in races', 'name': 'race_car'}, {'frequency': 'c', 'id': 877, 'synset': 'racket.n.04', 'synonyms': ['racket', 'racquet'], 'def': 'a sports implement used to strike a ball in various games', 'name': 'racket'}, {'frequency': 'r', 'id': 878, 'synset': 'radar.n.01', 'synonyms': ['radar'], 'def': 'measuring instrument in which the echo of a pulse of microwave radiation is used to detect and locate distant objects', 'name': 'radar'}, {'frequency': 'c', 'id': 879, 'synset': 'radiator.n.03', 'synonyms': ['radiator'], 'def': 'a mechanism consisting of a metal honeycomb through which hot fluids circulate', 'name': 'radiator'}, {'frequency': 'c', 'id': 880, 'synset': 'radio_receiver.n.01', 'synonyms': ['radio_receiver', 'radio_set', 'radio', 'tuner_(radio)'], 'def': 'an electronic receiver that detects and demodulates and amplifies transmitted radio signals', 'name': 'radio_receiver'}, {'frequency': 'c', 'id': 881, 'synset': 'radish.n.03', 'synonyms': ['radish', 'daikon'], 'def': 'pungent edible root of any of various cultivated radish plants', 'name': 'radish'}, {'frequency': 'c', 'id': 882, 'synset': 'raft.n.01', 'synonyms': ['raft'], 'def': 'a flat float (usually made of logs or planks) that can be used for transport or as a platform for swimmers', 'name': 'raft'}, {'frequency': 'r', 'id': 883, 'synset': 'rag_doll.n.01', 'synonyms': ['rag_doll'], 'def': 'a cloth doll that is stuffed and (usually) painted', 'name': 'rag_doll'}, {'frequency': 'c', 'id': 884, 'synset': 'raincoat.n.01', 'synonyms': ['raincoat', 'waterproof_jacket'], 'def': 'a water-resistant coat', 'name': 'raincoat'}, {'frequency': 'c', 'id': 885, 'synset': 'ram.n.05', 'synonyms': ['ram_(animal)'], 'def': 'uncastrated adult male sheep', 'name': 'ram_(animal)'}, {'frequency': 'c', 'id': 886, 'synset': 'raspberry.n.02', 'synonyms': ['raspberry'], 'def': 'red or black edible aggregate berries usually smaller than the related blackberries', 'name': 'raspberry'}, {'frequency': 'r', 'id': 887, 'synset': 'rat.n.01', 'synonyms': ['rat'], 'def': 'any of various long-tailed rodents similar to but larger than a mouse', 'name': 'rat'}, {'frequency': 'c', 'id': 888, 'synset': 'razorblade.n.01', 'synonyms': ['razorblade'], 'def': 'a blade that has very sharp edge', 'name': 'razorblade'}, {'frequency': 'c', 'id': 889, 'synset': 'reamer.n.01', 'synonyms': ['reamer_(juicer)', 'juicer', 'juice_reamer'], 'def': 'a squeezer with a conical ridged center that is used for squeezing juice from citrus fruit', 'name': 'reamer_(juicer)'}, {'frequency': 'f', 'id': 890, 'synset': 'rearview_mirror.n.01', 'synonyms': ['rearview_mirror'], 'def': 'car mirror that reflects the view out of the rear window', 'name': 'rearview_mirror'}, {'frequency': 'c', 'id': 891, 'synset': 'receipt.n.02', 'synonyms': ['receipt'], 'def': 'an acknowledgment (usually tangible) that payment has been made', 'name': 'receipt'}, {'frequency': 'c', 'id': 892, 'synset': 'recliner.n.01', 'synonyms': ['recliner', 'reclining_chair', 'lounger_(chair)'], 'def': 'an armchair whose back can be lowered and foot can be raised to allow the sitter to recline in it', 'name': 'recliner'}, {'frequency': 'r', 'id': 893, 'synset': 'record_player.n.01', 'synonyms': ['record_player', 'phonograph_(record_player)', 'turntable'], 'def': 'machine in which rotating records cause a stylus to vibrate and the vibrations are amplified acoustically or electronically', 'name': 'record_player'}, {'frequency': 'r', 'id': 894, 'synset': 'red_cabbage.n.02', 'synonyms': ['red_cabbage'], 'def': 'compact head of purplish-red leaves', 'name': 'red_cabbage'}, {'frequency': 'f', 'id': 895, 'synset': 'reflector.n.01', 'synonyms': ['reflector'], 'def': 'device that reflects light, radiation, etc.', 'name': 'reflector'}, {'frequency': 'f', 'id': 896, 'synset': 'remote_control.n.01', 'synonyms': ['remote_control'], 'def': 'a device that can be used to control a machine or apparatus from a distance', 'name': 'remote_control'}, {'frequency': 'c', 'id': 897, 'synset': 'rhinoceros.n.01', 'synonyms': ['rhinoceros'], 'def': 'massive powerful herbivorous odd-toed ungulate of southeast Asia and Africa having very thick skin and one or two horns on the snout', 'name': 'rhinoceros'}, {'frequency': 'r', 'id': 898, 'synset': 'rib.n.03', 'synonyms': ['rib_(food)'], 'def': 'cut of meat including one or more ribs', 'name': 'rib_(food)'}, {'frequency': 'r', 'id': 899, 'synset': 'rifle.n.01', 'synonyms': ['rifle'], 'def': 'a shoulder firearm with a long barrel', 'name': 'rifle'}, {'frequency': 'f', 'id': 900, 'synset': 'ring.n.08', 'synonyms': ['ring'], 'def': 'jewelry consisting of a circlet of precious metal (often set with jewels) worn on the finger', 'name': 'ring'}, {'frequency': 'r', 'id': 901, 'synset': 'river_boat.n.01', 'synonyms': ['river_boat'], 'def': 'a boat used on rivers or to ply a river', 'name': 'river_boat'}, {'frequency': 'r', 'id': 902, 'synset': 'road_map.n.02', 'synonyms': ['road_map'], 'def': '(NOT A ROAD) a MAP showing roads (for automobile travel)', 'name': 'road_map'}, {'frequency': 'c', 'id': 903, 'synset': 'robe.n.01', 'synonyms': ['robe'], 'def': 'any loose flowing garment', 'name': 'robe'}, {'frequency': 'c', 'id': 904, 'synset': 'rocking_chair.n.01', 'synonyms': ['rocking_chair'], 'def': 'a chair mounted on rockers', 'name': 'rocking_chair'}, {'frequency': 'r', 'id': 905, 'synset': 'roller_skate.n.01', 'synonyms': ['roller_skate'], 'def': 'a shoe with pairs of rollers (small hard wheels) fixed to the sole', 'name': 'roller_skate'}, {'frequency': 'r', 'id': 906, 'synset': 'rollerblade.n.01', 'synonyms': ['Rollerblade'], 'def': 'an in-line variant of a roller skate', 'name': 'Rollerblade'}, {'frequency': 'c', 'id': 907, 'synset': 'rolling_pin.n.01', 'synonyms': ['rolling_pin'], 'def': 'utensil consisting of a cylinder (usually of wood) with a handle at each end; used to roll out dough', 'name': 'rolling_pin'}, {'frequency': 'r', 'id': 908, 'synset': 'root_beer.n.01', 'synonyms': ['root_beer'], 'def': 'carbonated drink containing extracts of roots and herbs', 'name': 'root_beer'}, {'frequency': 'c', 'id': 909, 'synset': 'router.n.02', 'synonyms': ['router_(computer_equipment)'], 'def': 'a device that forwards data packets between computer networks', 'name': 'router_(computer_equipment)'}, {'frequency': 'f', 'id': 910, 'synset': 'rubber_band.n.01', 'synonyms': ['rubber_band', 'elastic_band'], 'def': 'a narrow band of elastic rubber used to hold things (such as papers) together', 'name': 'rubber_band'}, {'frequency': 'c', 'id': 911, 'synset': 'runner.n.08', 'synonyms': ['runner_(carpet)'], 'def': 'a long narrow carpet', 'name': 'runner_(carpet)'}, {'frequency': 'f', 'id': 912, 'synset': 'sack.n.01', 'synonyms': ['plastic_bag', 'paper_bag'], 'def': "a bag made of paper or plastic for holding customer's purchases", 'name': 'plastic_bag'}, {'frequency': 'f', 'id': 913, 'synset': 'saddle.n.01', 'synonyms': ['saddle_(on_an_animal)'], 'def': 'a seat for the rider of a horse or camel', 'name': 'saddle_(on_an_animal)'}, {'frequency': 'f', 'id': 914, 'synset': 'saddle_blanket.n.01', 'synonyms': ['saddle_blanket', 'saddlecloth', 'horse_blanket'], 'def': 'stable gear consisting of a blanket placed under the saddle', 'name': 'saddle_blanket'}, {'frequency': 'c', 'id': 915, 'synset': 'saddlebag.n.01', 'synonyms': ['saddlebag'], 'def': 'a large bag (or pair of bags) hung over a saddle', 'name': 'saddlebag'}, {'frequency': 'r', 'id': 916, 'synset': 'safety_pin.n.01', 'synonyms': ['safety_pin'], 'def': 'a pin in the form of a clasp; has a guard so the point of the pin will not stick the user', 'name': 'safety_pin'}, {'frequency': 'c', 'id': 917, 'synset': 'sail.n.01', 'synonyms': ['sail'], 'def': 'a large piece of fabric by means of which wind is used to propel a sailing vessel', 'name': 'sail'}, {'frequency': 'c', 'id': 918, 'synset': 'salad.n.01', 'synonyms': ['salad'], 'def': 'food mixtures either arranged on a plate or tossed and served with a moist dressing; usually consisting of or including greens', 'name': 'salad'}, {'frequency': 'r', 'id': 919, 'synset': 'salad_plate.n.01', 'synonyms': ['salad_plate', 'salad_bowl'], 'def': 'a plate or bowl for individual servings of salad', 'name': 'salad_plate'}, {'frequency': 'r', 'id': 920, 'synset': 'salami.n.01', 'synonyms': ['salami'], 'def': 'highly seasoned fatty sausage of pork and beef usually dried', 'name': 'salami'}, {'frequency': 'r', 'id': 921, 'synset': 'salmon.n.01', 'synonyms': ['salmon_(fish)'], 'def': 'any of various large food and game fishes of northern waters', 'name': 'salmon_(fish)'}, {'frequency': 'r', 'id': 922, 'synset': 'salmon.n.03', 'synonyms': ['salmon_(food)'], 'def': 'flesh of any of various marine or freshwater fish of the family Salmonidae', 'name': 'salmon_(food)'}, {'frequency': 'r', 'id': 923, 'synset': 'salsa.n.01', 'synonyms': ['salsa'], 'def': 'spicy sauce of tomatoes and onions and chili peppers to accompany Mexican foods', 'name': 'salsa'}, {'frequency': 'f', 'id': 924, 'synset': 'saltshaker.n.01', 'synonyms': ['saltshaker'], 'def': 'a shaker with a perforated top for sprinkling salt', 'name': 'saltshaker'}, {'frequency': 'f', 'id': 925, 'synset': 'sandal.n.01', 'synonyms': ['sandal_(type_of_shoe)'], 'def': 'a shoe consisting of a sole fastened by straps to the foot', 'name': 'sandal_(type_of_shoe)'}, {'frequency': 'f', 'id': 926, 'synset': 'sandwich.n.01', 'synonyms': ['sandwich'], 'def': 'two (or more) slices of bread with a filling between them', 'name': 'sandwich'}, {'frequency': 'r', 'id': 927, 'synset': 'satchel.n.01', 'synonyms': ['satchel'], 'def': 'luggage consisting of a small case with a flat bottom and (usually) a shoulder strap', 'name': 'satchel'}, {'frequency': 'r', 'id': 928, 'synset': 'saucepan.n.01', 'synonyms': ['saucepan'], 'def': 'a deep pan with a handle; used for stewing or boiling', 'name': 'saucepan'}, {'frequency': 'f', 'id': 929, 'synset': 'saucer.n.02', 'synonyms': ['saucer'], 'def': 'a small shallow dish for holding a cup at the table', 'name': 'saucer'}, {'frequency': 'f', 'id': 930, 'synset': 'sausage.n.01', 'synonyms': ['sausage'], 'def': 'highly seasoned minced meat stuffed in casings', 'name': 'sausage'}, {'frequency': 'r', 'id': 931, 'synset': 'sawhorse.n.01', 'synonyms': ['sawhorse', 'sawbuck'], 'def': 'a framework for holding wood that is being sawed', 'name': 'sawhorse'}, {'frequency': 'r', 'id': 932, 'synset': 'sax.n.02', 'synonyms': ['saxophone'], 'def': "a wind instrument with a `J'-shaped form typically made of brass", 'name': 'saxophone'}, {'frequency': 'f', 'id': 933, 'synset': 'scale.n.07', 'synonyms': ['scale_(measuring_instrument)'], 'def': 'a measuring instrument for weighing; shows amount of mass', 'name': 'scale_(measuring_instrument)'}, {'frequency': 'r', 'id': 934, 'synset': 'scarecrow.n.01', 'synonyms': ['scarecrow', 'strawman'], 'def': 'an effigy in the shape of a man to frighten birds away from seeds', 'name': 'scarecrow'}, {'frequency': 'f', 'id': 935, 'synset': 'scarf.n.01', 'synonyms': ['scarf'], 'def': 'a garment worn around the head or neck or shoulders for warmth or decoration', 'name': 'scarf'}, {'frequency': 'c', 'id': 936, 'synset': 'school_bus.n.01', 'synonyms': ['school_bus'], 'def': 'a bus used to transport children to or from school', 'name': 'school_bus'}, {'frequency': 'f', 'id': 937, 'synset': 'scissors.n.01', 'synonyms': ['scissors'], 'def': 'a tool having two crossed pivoting blades with looped handles', 'name': 'scissors'}, {'frequency': 'c', 'id': 938, 'synset': 'scoreboard.n.01', 'synonyms': ['scoreboard'], 'def': 'a large board for displaying the score of a contest (and some other information)', 'name': 'scoreboard'}, {'frequency': 'c', 'id': 939, 'synset': 'scrambled_eggs.n.01', 'synonyms': ['scrambled_eggs'], 'def': 'eggs beaten and cooked to a soft firm consistency while stirring', 'name': 'scrambled_eggs'}, {'frequency': 'r', 'id': 940, 'synset': 'scraper.n.01', 'synonyms': ['scraper'], 'def': 'any of various hand tools for scraping', 'name': 'scraper'}, {'frequency': 'r', 'id': 941, 'synset': 'scratcher.n.03', 'synonyms': ['scratcher'], 'def': 'a device used for scratching', 'name': 'scratcher'}, {'frequency': 'c', 'id': 942, 'synset': 'screwdriver.n.01', 'synonyms': ['screwdriver'], 'def': 'a hand tool for driving screws; has a tip that fits into the head of a screw', 'name': 'screwdriver'}, {'frequency': 'c', 'id': 943, 'synset': 'scrub_brush.n.01', 'synonyms': ['scrubbing_brush'], 'def': 'a brush with short stiff bristles for heavy cleaning', 'name': 'scrubbing_brush'}, {'frequency': 'c', 'id': 944, 'synset': 'sculpture.n.01', 'synonyms': ['sculpture'], 'def': 'a three-dimensional work of art', 'name': 'sculpture'}, {'frequency': 'r', 'id': 945, 'synset': 'seabird.n.01', 'synonyms': ['seabird', 'seafowl'], 'def': 'a bird that frequents coastal waters and the open ocean: gulls; pelicans; gannets; cormorants; albatrosses; petrels; etc.', 'name': 'seabird'}, {'frequency': 'r', 'id': 946, 'synset': 'seahorse.n.02', 'synonyms': ['seahorse'], 'def': 'small fish with horse-like heads bent sharply downward and curled tails', 'name': 'seahorse'}, {'frequency': 'r', 'id': 947, 'synset': 'seaplane.n.01', 'synonyms': ['seaplane', 'hydroplane'], 'def': 'an airplane that can land on or take off from water', 'name': 'seaplane'}, {'frequency': 'c', 'id': 948, 'synset': 'seashell.n.01', 'synonyms': ['seashell'], 'def': 'the shell of a marine organism', 'name': 'seashell'}, {'frequency': 'r', 'id': 949, 'synset': 'seedling.n.01', 'synonyms': ['seedling'], 'def': 'young plant or tree grown from a seed', 'name': 'seedling'}, {'frequency': 'c', 'id': 950, 'synset': 'serving_dish.n.01', 'synonyms': ['serving_dish'], 'def': 'a dish used for serving food', 'name': 'serving_dish'}, {'frequency': 'r', 'id': 951, 'synset': 'sewing_machine.n.01', 'synonyms': ['sewing_machine'], 'def': 'a textile machine used as a home appliance for sewing', 'name': 'sewing_machine'}, {'frequency': 'r', 'id': 952, 'synset': 'shaker.n.03', 'synonyms': ['shaker'], 'def': 'a container in which something can be shaken', 'name': 'shaker'}, {'frequency': 'c', 'id': 953, 'synset': 'shampoo.n.01', 'synonyms': ['shampoo'], 'def': 'cleansing agent consisting of soaps or detergents used for washing the hair', 'name': 'shampoo'}, {'frequency': 'r', 'id': 954, 'synset': 'shark.n.01', 'synonyms': ['shark'], 'def': 'typically large carnivorous fishes with sharpe teeth', 'name': 'shark'}, {'frequency': 'r', 'id': 955, 'synset': 'sharpener.n.01', 'synonyms': ['sharpener'], 'def': 'any implement that is used to make something (an edge or a point) sharper', 'name': 'sharpener'}, {'frequency': 'r', 'id': 956, 'synset': 'sharpie.n.03', 'synonyms': ['Sharpie'], 'def': 'a pen with indelible ink that will write on any surface', 'name': 'Sharpie'}, {'frequency': 'r', 'id': 957, 'synset': 'shaver.n.03', 'synonyms': ['shaver_(electric)', 'electric_shaver', 'electric_razor'], 'def': 'a razor powered by an electric motor', 'name': 'shaver_(electric)'}, {'frequency': 'c', 'id': 958, 'synset': 'shaving_cream.n.01', 'synonyms': ['shaving_cream', 'shaving_soap'], 'def': 'toiletry consisting that forms a rich lather for softening the beard before shaving', 'name': 'shaving_cream'}, {'frequency': 'r', 'id': 959, 'synset': 'shawl.n.01', 'synonyms': ['shawl'], 'def': 'cloak consisting of an oblong piece of cloth used to cover the head and shoulders', 'name': 'shawl'}, {'frequency': 'r', 'id': 960, 'synset': 'shears.n.01', 'synonyms': ['shears'], 'def': 'large scissors with strong blades', 'name': 'shears'}, {'frequency': 'f', 'id': 961, 'synset': 'sheep.n.01', 'synonyms': ['sheep'], 'def': 'woolly usually horned ruminant mammal related to the goat', 'name': 'sheep'}, {'frequency': 'r', 'id': 962, 'synset': 'shepherd_dog.n.01', 'synonyms': ['shepherd_dog', 'sheepdog'], 'def': 'any of various usually long-haired breeds of dog reared to herd and guard sheep', 'name': 'shepherd_dog'}, {'frequency': 'r', 'id': 963, 'synset': 'sherbert.n.01', 'synonyms': ['sherbert', 'sherbet'], 'def': 'a frozen dessert made primarily of fruit juice and sugar', 'name': 'sherbert'}, {'frequency': 'r', 'id': 964, 'synset': 'shield.n.02', 'synonyms': ['shield'], 'def': 'armor carried on the arm to intercept blows', 'name': 'shield'}, {'frequency': 'f', 'id': 965, 'synset': 'shirt.n.01', 'synonyms': ['shirt'], 'def': 'a garment worn on the upper half of the body', 'name': 'shirt'}, {'frequency': 'f', 'id': 966, 'synset': 'shoe.n.01', 'synonyms': ['shoe', 'sneaker_(type_of_shoe)', 'tennis_shoe'], 'def': 'common footwear covering the foot', 'name': 'shoe'}, {'frequency': 'c', 'id': 967, 'synset': 'shopping_bag.n.01', 'synonyms': ['shopping_bag'], 'def': 'a bag made of plastic or strong paper (often with handles); used to transport goods after shopping', 'name': 'shopping_bag'}, {'frequency': 'c', 'id': 968, 'synset': 'shopping_cart.n.01', 'synonyms': ['shopping_cart'], 'def': 'a handcart that holds groceries or other goods while shopping', 'name': 'shopping_cart'}, {'frequency': 'f', 'id': 969, 'synset': 'short_pants.n.01', 'synonyms': ['short_pants', 'shorts_(clothing)', 'trunks_(clothing)'], 'def': 'trousers that end at or above the knee', 'name': 'short_pants'}, {'frequency': 'r', 'id': 970, 'synset': 'shot_glass.n.01', 'synonyms': ['shot_glass'], 'def': 'a small glass adequate to hold a single swallow of whiskey', 'name': 'shot_glass'}, {'frequency': 'c', 'id': 971, 'synset': 'shoulder_bag.n.01', 'synonyms': ['shoulder_bag'], 'def': 'a large handbag that can be carried by a strap looped over the shoulder', 'name': 'shoulder_bag'}, {'frequency': 'c', 'id': 972, 'synset': 'shovel.n.01', 'synonyms': ['shovel'], 'def': 'a hand tool for lifting loose material such as snow, dirt, etc.', 'name': 'shovel'}, {'frequency': 'f', 'id': 973, 'synset': 'shower.n.01', 'synonyms': ['shower_head'], 'def': 'a plumbing fixture that sprays water over you', 'name': 'shower_head'}, {'frequency': 'f', 'id': 974, 'synset': 'shower_curtain.n.01', 'synonyms': ['shower_curtain'], 'def': 'a curtain that keeps water from splashing out of the shower area', 'name': 'shower_curtain'}, {'frequency': 'r', 'id': 975, 'synset': 'shredder.n.01', 'synonyms': ['shredder_(for_paper)'], 'def': 'a device that shreds documents', 'name': 'shredder_(for_paper)'}, {'frequency': 'r', 'id': 976, 'synset': 'sieve.n.01', 'synonyms': ['sieve', 'screen_(sieve)'], 'def': 'a strainer for separating lumps from powdered material or grading particles', 'name': 'sieve'}, {'frequency': 'f', 'id': 977, 'synset': 'signboard.n.01', 'synonyms': ['signboard'], 'def': 'structure displaying a board on which advertisements can be posted', 'name': 'signboard'}, {'frequency': 'c', 'id': 978, 'synset': 'silo.n.01', 'synonyms': ['silo'], 'def': 'a cylindrical tower used for storing goods', 'name': 'silo'}, {'frequency': 'f', 'id': 979, 'synset': 'sink.n.01', 'synonyms': ['sink'], 'def': 'plumbing fixture consisting of a water basin fixed to a wall or floor and having a drainpipe', 'name': 'sink'}, {'frequency': 'f', 'id': 980, 'synset': 'skateboard.n.01', 'synonyms': ['skateboard'], 'def': 'a board with wheels that is ridden in a standing or crouching position and propelled by foot', 'name': 'skateboard'}, {'frequency': 'c', 'id': 981, 'synset': 'skewer.n.01', 'synonyms': ['skewer'], 'def': 'a long pin for holding meat in position while it is being roasted', 'name': 'skewer'}, {'frequency': 'f', 'id': 982, 'synset': 'ski.n.01', 'synonyms': ['ski'], 'def': 'sports equipment for skiing on snow', 'name': 'ski'}, {'frequency': 'f', 'id': 983, 'synset': 'ski_boot.n.01', 'synonyms': ['ski_boot'], 'def': 'a stiff boot that is fastened to a ski with a ski binding', 'name': 'ski_boot'}, {'frequency': 'f', 'id': 984, 'synset': 'ski_parka.n.01', 'synonyms': ['ski_parka', 'ski_jacket'], 'def': 'a parka to be worn while skiing', 'name': 'ski_parka'}, {'frequency': 'f', 'id': 985, 'synset': 'ski_pole.n.01', 'synonyms': ['ski_pole'], 'def': 'a pole with metal points used as an aid in skiing', 'name': 'ski_pole'}, {'frequency': 'f', 'id': 986, 'synset': 'skirt.n.02', 'synonyms': ['skirt'], 'def': 'a garment hanging from the waist; worn mainly by girls and women', 'name': 'skirt'}, {'frequency': 'c', 'id': 987, 'synset': 'sled.n.01', 'synonyms': ['sled', 'sledge', 'sleigh'], 'def': 'a vehicle or flat object for transportation over snow by sliding or pulled by dogs, etc.', 'name': 'sled'}, {'frequency': 'c', 'id': 988, 'synset': 'sleeping_bag.n.01', 'synonyms': ['sleeping_bag'], 'def': 'large padded bag designed to be slept in outdoors', 'name': 'sleeping_bag'}, {'frequency': 'r', 'id': 989, 'synset': 'sling.n.05', 'synonyms': ['sling_(bandage)', 'triangular_bandage'], 'def': 'bandage to support an injured forearm; slung over the shoulder or neck', 'name': 'sling_(bandage)'}, {'frequency': 'c', 'id': 990, 'synset': 'slipper.n.01', 'synonyms': ['slipper_(footwear)', 'carpet_slipper_(footwear)'], 'def': 'low footwear that can be slipped on and off easily; usually worn indoors', 'name': 'slipper_(footwear)'}, {'frequency': 'r', 'id': 991, 'synset': 'smoothie.n.02', 'synonyms': ['smoothie'], 'def': 'a thick smooth drink consisting of fresh fruit pureed with ice cream or yoghurt or milk', 'name': 'smoothie'}, {'frequency': 'r', 'id': 992, 'synset': 'snake.n.01', 'synonyms': ['snake', 'serpent'], 'def': 'limbless scaly elongate reptile; some are venomous', 'name': 'snake'}, {'frequency': 'f', 'id': 993, 'synset': 'snowboard.n.01', 'synonyms': ['snowboard'], 'def': 'a board that resembles a broad ski or a small surfboard; used in a standing position to slide down snow-covered slopes', 'name': 'snowboard'}, {'frequency': 'c', 'id': 994, 'synset': 'snowman.n.01', 'synonyms': ['snowman'], 'def': 'a figure of a person made of packed snow', 'name': 'snowman'}, {'frequency': 'c', 'id': 995, 'synset': 'snowmobile.n.01', 'synonyms': ['snowmobile'], 'def': 'tracked vehicle for travel on snow having skis in front', 'name': 'snowmobile'}, {'frequency': 'f', 'id': 996, 'synset': 'soap.n.01', 'synonyms': ['soap'], 'def': 'a cleansing agent made from the salts of vegetable or animal fats', 'name': 'soap'}, {'frequency': 'f', 'id': 997, 'synset': 'soccer_ball.n.01', 'synonyms': ['soccer_ball'], 'def': "an inflated ball used in playing soccer (called `football' outside of the United States)", 'name': 'soccer_ball'}, {'frequency': 'f', 'id': 998, 'synset': 'sock.n.01', 'synonyms': ['sock'], 'def': 'cloth covering for the foot; worn inside the shoe; reaches to between the ankle and the knee', 'name': 'sock'}, {'frequency': 'r', 'id': 999, 'synset': 'soda_fountain.n.02', 'synonyms': ['soda_fountain'], 'def': 'an apparatus for dispensing soda water', 'name': 'soda_fountain'}, {'frequency': 'r', 'id': 1000, 'synset': 'soda_water.n.01', 'synonyms': ['carbonated_water', 'club_soda', 'seltzer', 'sparkling_water'], 'def': 'effervescent beverage artificially charged with carbon dioxide', 'name': 'carbonated_water'}, {'frequency': 'f', 'id': 1001, 'synset': 'sofa.n.01', 'synonyms': ['sofa', 'couch', 'lounge'], 'def': 'an upholstered seat for more than one person', 'name': 'sofa'}, {'frequency': 'r', 'id': 1002, 'synset': 'softball.n.01', 'synonyms': ['softball'], 'def': 'ball used in playing softball', 'name': 'softball'}, {'frequency': 'c', 'id': 1003, 'synset': 'solar_array.n.01', 'synonyms': ['solar_array', 'solar_battery', 'solar_panel'], 'def': 'electrical device consisting of a large array of connected solar cells', 'name': 'solar_array'}, {'frequency': 'r', 'id': 1004, 'synset': 'sombrero.n.02', 'synonyms': ['sombrero'], 'def': 'a straw hat with a tall crown and broad brim; worn in American southwest and in Mexico', 'name': 'sombrero'}, {'frequency': 'c', 'id': 1005, 'synset': 'soup.n.01', 'synonyms': ['soup'], 'def': 'liquid food especially of meat or fish or vegetable stock often containing pieces of solid food', 'name': 'soup'}, {'frequency': 'r', 'id': 1006, 'synset': 'soup_bowl.n.01', 'synonyms': ['soup_bowl'], 'def': 'a bowl for serving soup', 'name': 'soup_bowl'}, {'frequency': 'c', 'id': 1007, 'synset': 'soupspoon.n.01', 'synonyms': ['soupspoon'], 'def': 'a spoon with a rounded bowl for eating soup', 'name': 'soupspoon'}, {'frequency': 'c', 'id': 1008, 'synset': 'sour_cream.n.01', 'synonyms': ['sour_cream', 'soured_cream'], 'def': 'soured light cream', 'name': 'sour_cream'}, {'frequency': 'r', 'id': 1009, 'synset': 'soya_milk.n.01', 'synonyms': ['soya_milk', 'soybean_milk', 'soymilk'], 'def': 'a milk substitute containing soybean flour and water; used in some infant formulas and in making tofu', 'name': 'soya_milk'}, {'frequency': 'r', 'id': 1010, 'synset': 'space_shuttle.n.01', 'synonyms': ['space_shuttle'], 'def': "a reusable spacecraft with wings for a controlled descent through the Earth's atmosphere", 'name': 'space_shuttle'}, {'frequency': 'r', 'id': 1011, 'synset': 'sparkler.n.02', 'synonyms': ['sparkler_(fireworks)'], 'def': 'a firework that burns slowly and throws out a shower of sparks', 'name': 'sparkler_(fireworks)'}, {'frequency': 'f', 'id': 1012, 'synset': 'spatula.n.02', 'synonyms': ['spatula'], 'def': 'a hand tool with a thin flexible blade used to mix or spread soft substances', 'name': 'spatula'}, {'frequency': 'r', 'id': 1013, 'synset': 'spear.n.01', 'synonyms': ['spear', 'lance'], 'def': 'a long pointed rod used as a tool or weapon', 'name': 'spear'}, {'frequency': 'f', 'id': 1014, 'synset': 'spectacles.n.01', 'synonyms': ['spectacles', 'specs', 'eyeglasses', 'glasses'], 'def': 'optical instrument consisting of a frame that holds a pair of lenses for correcting defective vision', 'name': 'spectacles'}, {'frequency': 'c', 'id': 1015, 'synset': 'spice_rack.n.01', 'synonyms': ['spice_rack'], 'def': 'a rack for displaying containers filled with spices', 'name': 'spice_rack'}, {'frequency': 'r', 'id': 1016, 'synset': 'spider.n.01', 'synonyms': ['spider'], 'def': 'predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'name': 'spider'}, {'frequency': 'c', 'id': 1017, 'synset': 'sponge.n.01', 'synonyms': ['sponge'], 'def': 'a porous mass usable to absorb water typically used for cleaning', 'name': 'sponge'}, {'frequency': 'f', 'id': 1018, 'synset': 'spoon.n.01', 'synonyms': ['spoon'], 'def': 'a piece of cutlery with a shallow bowl-shaped container and a handle', 'name': 'spoon'}, {'frequency': 'c', 'id': 1019, 'synset': 'sportswear.n.01', 'synonyms': ['sportswear', 'athletic_wear', 'activewear'], 'def': 'attire worn for sport or for casual wear', 'name': 'sportswear'}, {'frequency': 'c', 'id': 1020, 'synset': 'spotlight.n.02', 'synonyms': ['spotlight'], 'def': 'a lamp that produces a strong beam of light to illuminate a restricted area; used to focus attention of a stage performer', 'name': 'spotlight'}, {'frequency': 'r', 'id': 1021, 'synset': 'squirrel.n.01', 'synonyms': ['squirrel'], 'def': 'a kind of arboreal rodent having a long bushy tail', 'name': 'squirrel'}, {'frequency': 'c', 'id': 1022, 'synset': 'stapler.n.01', 'synonyms': ['stapler_(stapling_machine)'], 'def': 'a machine that inserts staples into sheets of paper in order to fasten them together', 'name': 'stapler_(stapling_machine)'}, {'frequency': 'r', 'id': 1023, 'synset': 'starfish.n.01', 'synonyms': ['starfish', 'sea_star'], 'def': 'echinoderms characterized by five arms extending from a central disk', 'name': 'starfish'}, {'frequency': 'f', 'id': 1024, 'synset': 'statue.n.01', 'synonyms': ['statue_(sculpture)'], 'def': 'a sculpture representing a human or animal', 'name': 'statue_(sculpture)'}, {'frequency': 'c', 'id': 1025, 'synset': 'steak.n.01', 'synonyms': ['steak_(food)'], 'def': 'a slice of meat cut from the fleshy part of an animal or large fish', 'name': 'steak_(food)'}, {'frequency': 'r', 'id': 1026, 'synset': 'steak_knife.n.01', 'synonyms': ['steak_knife'], 'def': 'a sharp table knife used in eating steak', 'name': 'steak_knife'}, {'frequency': 'r', 'id': 1027, 'synset': 'steamer.n.02', 'synonyms': ['steamer_(kitchen_appliance)'], 'def': 'a cooking utensil that can be used to cook food by steaming it', 'name': 'steamer_(kitchen_appliance)'}, {'frequency': 'f', 'id': 1028, 'synset': 'steering_wheel.n.01', 'synonyms': ['steering_wheel'], 'def': 'a handwheel that is used for steering', 'name': 'steering_wheel'}, {'frequency': 'r', 'id': 1029, 'synset': 'stencil.n.01', 'synonyms': ['stencil'], 'def': 'a sheet of material (metal, plastic, etc.) that has been perforated with a pattern; ink or paint can pass through the perforations to create the printed pattern on the surface below', 'name': 'stencil'}, {'frequency': 'r', 'id': 1030, 'synset': 'step_ladder.n.01', 'synonyms': ['stepladder'], 'def': 'a folding portable ladder hinged at the top', 'name': 'stepladder'}, {'frequency': 'c', 'id': 1031, 'synset': 'step_stool.n.01', 'synonyms': ['step_stool'], 'def': 'a stool that has one or two steps that fold under the seat', 'name': 'step_stool'}, {'frequency': 'c', 'id': 1032, 'synset': 'stereo.n.01', 'synonyms': ['stereo_(sound_system)'], 'def': 'electronic device for playing audio', 'name': 'stereo_(sound_system)'}, {'frequency': 'r', 'id': 1033, 'synset': 'stew.n.02', 'synonyms': ['stew'], 'def': 'food prepared by stewing especially meat or fish with vegetables', 'name': 'stew'}, {'frequency': 'r', 'id': 1034, 'synset': 'stirrer.n.02', 'synonyms': ['stirrer'], 'def': 'an implement used for stirring', 'name': 'stirrer'}, {'frequency': 'f', 'id': 1035, 'synset': 'stirrup.n.01', 'synonyms': ['stirrup'], 'def': "support consisting of metal loops into which rider's feet go", 'name': 'stirrup'}, {'frequency': 'c', 'id': 1036, 'synset': 'stocking.n.01', 'synonyms': ['stockings_(leg_wear)'], 'def': 'close-fitting hosiery to cover the foot and leg; come in matched pairs', 'name': 'stockings_(leg_wear)'}, {'frequency': 'f', 'id': 1037, 'synset': 'stool.n.01', 'synonyms': ['stool'], 'def': 'a simple seat without a back or arms', 'name': 'stool'}, {'frequency': 'f', 'id': 1038, 'synset': 'stop_sign.n.01', 'synonyms': ['stop_sign'], 'def': 'a traffic sign to notify drivers that they must come to a complete stop', 'name': 'stop_sign'}, {'frequency': 'f', 'id': 1039, 'synset': 'stoplight.n.01', 'synonyms': ['brake_light'], 'def': 'a red light on the rear of a motor vehicle that signals when the brakes are applied', 'name': 'brake_light'}, {'frequency': 'f', 'id': 1040, 'synset': 'stove.n.01', 'synonyms': ['stove', 'kitchen_stove', 'range_(kitchen_appliance)', 'kitchen_range', 'cooking_stove'], 'def': 'a kitchen appliance used for cooking food', 'name': 'stove'}, {'frequency': 'c', 'id': 1041, 'synset': 'strainer.n.01', 'synonyms': ['strainer'], 'def': 'a filter to retain larger pieces while smaller pieces and liquids pass through', 'name': 'strainer'}, {'frequency': 'f', 'id': 1042, 'synset': 'strap.n.01', 'synonyms': ['strap'], 'def': 'an elongated strip of material for binding things together or holding', 'name': 'strap'}, {'frequency': 'f', 'id': 1043, 'synset': 'straw.n.04', 'synonyms': ['straw_(for_drinking)', 'drinking_straw'], 'def': 'a thin paper or plastic tube used to suck liquids into the mouth', 'name': 'straw_(for_drinking)'}, {'frequency': 'f', 'id': 1044, 'synset': 'strawberry.n.01', 'synonyms': ['strawberry'], 'def': 'sweet fleshy red fruit', 'name': 'strawberry'}, {'frequency': 'f', 'id': 1045, 'synset': 'street_sign.n.01', 'synonyms': ['street_sign'], 'def': 'a sign visible from the street', 'name': 'street_sign'}, {'frequency': 'f', 'id': 1046, 'synset': 'streetlight.n.01', 'synonyms': ['streetlight', 'street_lamp'], 'def': 'a lamp supported on a lamppost; for illuminating a street', 'name': 'streetlight'}, {'frequency': 'r', 'id': 1047, 'synset': 'string_cheese.n.01', 'synonyms': ['string_cheese'], 'def': 'cheese formed in long strings twisted together', 'name': 'string_cheese'}, {'frequency': 'r', 'id': 1048, 'synset': 'stylus.n.02', 'synonyms': ['stylus'], 'def': 'a pointed tool for writing or drawing or engraving', 'name': 'stylus'}, {'frequency': 'r', 'id': 1049, 'synset': 'subwoofer.n.01', 'synonyms': ['subwoofer'], 'def': 'a loudspeaker that is designed to reproduce very low bass frequencies', 'name': 'subwoofer'}, {'frequency': 'r', 'id': 1050, 'synset': 'sugar_bowl.n.01', 'synonyms': ['sugar_bowl'], 'def': 'a dish in which sugar is served', 'name': 'sugar_bowl'}, {'frequency': 'r', 'id': 1051, 'synset': 'sugarcane.n.01', 'synonyms': ['sugarcane_(plant)'], 'def': 'juicy canes whose sap is a source of molasses and commercial sugar; fresh canes are sometimes chewed for the juice', 'name': 'sugarcane_(plant)'}, {'frequency': 'c', 'id': 1052, 'synset': 'suit.n.01', 'synonyms': ['suit_(clothing)'], 'def': 'a set of garments (usually including a jacket and trousers or skirt) for outerwear all of the same fabric and color', 'name': 'suit_(clothing)'}, {'frequency': 'c', 'id': 1053, 'synset': 'sunflower.n.01', 'synonyms': ['sunflower'], 'def': 'any plant of the genus Helianthus having large flower heads with dark disk florets and showy yellow rays', 'name': 'sunflower'}, {'frequency': 'f', 'id': 1054, 'synset': 'sunglasses.n.01', 'synonyms': ['sunglasses'], 'def': 'spectacles that are darkened or polarized to protect the eyes from the glare of the sun', 'name': 'sunglasses'}, {'frequency': 'c', 'id': 1055, 'synset': 'sunhat.n.01', 'synonyms': ['sunhat'], 'def': 'a hat with a broad brim that protects the face from direct exposure to the sun', 'name': 'sunhat'}, {'frequency': 'r', 'id': 1056, 'synset': 'sunscreen.n.01', 'synonyms': ['sunscreen', 'sunblock'], 'def': 'a cream spread on the skin; contains a chemical to filter out ultraviolet light and so protect from sunburn', 'name': 'sunscreen'}, {'frequency': 'f', 'id': 1057, 'synset': 'surfboard.n.01', 'synonyms': ['surfboard'], 'def': 'a narrow buoyant board for riding surf', 'name': 'surfboard'}, {'frequency': 'c', 'id': 1058, 'synset': 'sushi.n.01', 'synonyms': ['sushi'], 'def': 'rice (with raw fish) wrapped in seaweed', 'name': 'sushi'}, {'frequency': 'c', 'id': 1059, 'synset': 'swab.n.02', 'synonyms': ['mop'], 'def': 'cleaning implement consisting of absorbent material fastened to a handle; for cleaning floors', 'name': 'mop'}, {'frequency': 'c', 'id': 1060, 'synset': 'sweat_pants.n.01', 'synonyms': ['sweat_pants'], 'def': 'loose-fitting trousers with elastic cuffs; worn by athletes', 'name': 'sweat_pants'}, {'frequency': 'c', 'id': 1061, 'synset': 'sweatband.n.02', 'synonyms': ['sweatband'], 'def': 'a band of material tied around the forehead or wrist to absorb sweat', 'name': 'sweatband'}, {'frequency': 'f', 'id': 1062, 'synset': 'sweater.n.01', 'synonyms': ['sweater'], 'def': 'a crocheted or knitted garment covering the upper part of the body', 'name': 'sweater'}, {'frequency': 'f', 'id': 1063, 'synset': 'sweatshirt.n.01', 'synonyms': ['sweatshirt'], 'def': 'cotton knit pullover with long sleeves worn during athletic activity', 'name': 'sweatshirt'}, {'frequency': 'c', 'id': 1064, 'synset': 'sweet_potato.n.02', 'synonyms': ['sweet_potato'], 'def': 'the edible tuberous root of the sweet potato vine', 'name': 'sweet_potato'}, {'frequency': 'f', 'id': 1065, 'synset': 'swimsuit.n.01', 'synonyms': ['swimsuit', 'swimwear', 'bathing_suit', 'swimming_costume', 'bathing_costume', 'swimming_trunks', 'bathing_trunks'], 'def': 'garment worn for swimming', 'name': 'swimsuit'}, {'frequency': 'c', 'id': 1066, 'synset': 'sword.n.01', 'synonyms': ['sword'], 'def': 'a cutting or thrusting weapon that has a long metal blade', 'name': 'sword'}, {'frequency': 'r', 'id': 1067, 'synset': 'syringe.n.01', 'synonyms': ['syringe'], 'def': 'a medical instrument used to inject or withdraw fluids', 'name': 'syringe'}, {'frequency': 'r', 'id': 1068, 'synset': 'tabasco.n.02', 'synonyms': ['Tabasco_sauce'], 'def': 'very spicy sauce (trade name Tabasco) made from fully-aged red peppers', 'name': 'Tabasco_sauce'}, {'frequency': 'r', 'id': 1069, 'synset': 'table-tennis_table.n.01', 'synonyms': ['table-tennis_table', 'ping-pong_table'], 'def': 'a table used for playing table tennis', 'name': 'table-tennis_table'}, {'frequency': 'f', 'id': 1070, 'synset': 'table.n.02', 'synonyms': ['table'], 'def': 'a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs', 'name': 'table'}, {'frequency': 'c', 'id': 1071, 'synset': 'table_lamp.n.01', 'synonyms': ['table_lamp'], 'def': 'a lamp that sits on a table', 'name': 'table_lamp'}, {'frequency': 'f', 'id': 1072, 'synset': 'tablecloth.n.01', 'synonyms': ['tablecloth'], 'def': 'a covering spread over a dining table', 'name': 'tablecloth'}, {'frequency': 'r', 'id': 1073, 'synset': 'tachometer.n.01', 'synonyms': ['tachometer'], 'def': 'measuring instrument for indicating speed of rotation', 'name': 'tachometer'}, {'frequency': 'r', 'id': 1074, 'synset': 'taco.n.02', 'synonyms': ['taco'], 'def': 'a small tortilla cupped around a filling', 'name': 'taco'}, {'frequency': 'f', 'id': 1075, 'synset': 'tag.n.02', 'synonyms': ['tag'], 'def': 'a label associated with something for the purpose of identification or information', 'name': 'tag'}, {'frequency': 'f', 'id': 1076, 'synset': 'taillight.n.01', 'synonyms': ['taillight', 'rear_light'], 'def': 'lamp (usually red) mounted at the rear of a motor vehicle', 'name': 'taillight'}, {'frequency': 'r', 'id': 1077, 'synset': 'tambourine.n.01', 'synonyms': ['tambourine'], 'def': 'a shallow drum with a single drumhead and with metallic disks in the sides', 'name': 'tambourine'}, {'frequency': 'r', 'id': 1078, 'synset': 'tank.n.01', 'synonyms': ['army_tank', 'armored_combat_vehicle', 'armoured_combat_vehicle'], 'def': 'an enclosed armored military vehicle; has a cannon and moves on caterpillar treads', 'name': 'army_tank'}, {'frequency': 'c', 'id': 1079, 'synset': 'tank.n.02', 'synonyms': ['tank_(storage_vessel)', 'storage_tank'], 'def': 'a large (usually metallic) vessel for holding gases or liquids', 'name': 'tank_(storage_vessel)'}, {'frequency': 'f', 'id': 1080, 'synset': 'tank_top.n.01', 'synonyms': ['tank_top_(clothing)'], 'def': 'a tight-fitting sleeveless shirt with wide shoulder straps and low neck and no front opening', 'name': 'tank_top_(clothing)'}, {'frequency': 'c', 'id': 1081, 'synset': 'tape.n.01', 'synonyms': ['tape_(sticky_cloth_or_paper)'], 'def': 'a long thin piece of cloth or paper as used for binding or fastening', 'name': 'tape_(sticky_cloth_or_paper)'}, {'frequency': 'c', 'id': 1082, 'synset': 'tape.n.04', 'synonyms': ['tape_measure', 'measuring_tape'], 'def': 'measuring instrument consisting of a narrow strip (cloth or metal) marked in inches or centimeters and used for measuring lengths', 'name': 'tape_measure'}, {'frequency': 'c', 'id': 1083, 'synset': 'tapestry.n.02', 'synonyms': ['tapestry'], 'def': 'a heavy textile with a woven design; used for curtains and upholstery', 'name': 'tapestry'}, {'frequency': 'f', 'id': 1084, 'synset': 'tarpaulin.n.01', 'synonyms': ['tarp'], 'def': 'waterproofed canvas', 'name': 'tarp'}, {'frequency': 'c', 'id': 1085, 'synset': 'tartan.n.01', 'synonyms': ['tartan', 'plaid'], 'def': 'a cloth having a crisscross design', 'name': 'tartan'}, {'frequency': 'c', 'id': 1086, 'synset': 'tassel.n.01', 'synonyms': ['tassel'], 'def': 'adornment consisting of a bunch of cords fastened at one end', 'name': 'tassel'}, {'frequency': 'r', 'id': 1087, 'synset': 'tea_bag.n.01', 'synonyms': ['tea_bag'], 'def': 'a measured amount of tea in a bag for an individual serving of tea', 'name': 'tea_bag'}, {'frequency': 'c', 'id': 1088, 'synset': 'teacup.n.02', 'synonyms': ['teacup'], 'def': 'a cup from which tea is drunk', 'name': 'teacup'}, {'frequency': 'c', 'id': 1089, 'synset': 'teakettle.n.01', 'synonyms': ['teakettle'], 'def': 'kettle for boiling water to make tea', 'name': 'teakettle'}, {'frequency': 'c', 'id': 1090, 'synset': 'teapot.n.01', 'synonyms': ['teapot'], 'def': 'pot for brewing tea; usually has a spout and handle', 'name': 'teapot'}, {'frequency': 'f', 'id': 1091, 'synset': 'teddy.n.01', 'synonyms': ['teddy_bear'], 'def': "plaything consisting of a child's toy bear (usually plush and stuffed with soft materials)", 'name': 'teddy_bear'}, {'frequency': 'f', 'id': 1092, 'synset': 'telephone.n.01', 'synonyms': ['telephone', 'phone', 'telephone_set'], 'def': 'electronic device for communicating by voice over long distances', 'name': 'telephone'}, {'frequency': 'c', 'id': 1093, 'synset': 'telephone_booth.n.01', 'synonyms': ['telephone_booth', 'phone_booth', 'call_box', 'telephone_box', 'telephone_kiosk'], 'def': 'booth for using a telephone', 'name': 'telephone_booth'}, {'frequency': 'f', 'id': 1094, 'synset': 'telephone_pole.n.01', 'synonyms': ['telephone_pole', 'telegraph_pole', 'telegraph_post'], 'def': 'tall pole supporting telephone wires', 'name': 'telephone_pole'}, {'frequency': 'r', 'id': 1095, 'synset': 'telephoto_lens.n.01', 'synonyms': ['telephoto_lens', 'zoom_lens'], 'def': 'a camera lens that magnifies the image', 'name': 'telephoto_lens'}, {'frequency': 'c', 'id': 1096, 'synset': 'television_camera.n.01', 'synonyms': ['television_camera', 'tv_camera'], 'def': 'television equipment for capturing and recording video', 'name': 'television_camera'}, {'frequency': 'f', 'id': 1097, 'synset': 'television_receiver.n.01', 'synonyms': ['television_set', 'tv', 'tv_set'], 'def': 'an electronic device that receives television signals and displays them on a screen', 'name': 'television_set'}, {'frequency': 'f', 'id': 1098, 'synset': 'tennis_ball.n.01', 'synonyms': ['tennis_ball'], 'def': 'ball about the size of a fist used in playing tennis', 'name': 'tennis_ball'}, {'frequency': 'f', 'id': 1099, 'synset': 'tennis_racket.n.01', 'synonyms': ['tennis_racket'], 'def': 'a racket used to play tennis', 'name': 'tennis_racket'}, {'frequency': 'r', 'id': 1100, 'synset': 'tequila.n.01', 'synonyms': ['tequila'], 'def': 'Mexican liquor made from fermented juices of an agave plant', 'name': 'tequila'}, {'frequency': 'c', 'id': 1101, 'synset': 'thermometer.n.01', 'synonyms': ['thermometer'], 'def': 'measuring instrument for measuring temperature', 'name': 'thermometer'}, {'frequency': 'c', 'id': 1102, 'synset': 'thermos.n.01', 'synonyms': ['thermos_bottle'], 'def': 'vacuum flask that preserves temperature of hot or cold drinks', 'name': 'thermos_bottle'}, {'frequency': 'c', 'id': 1103, 'synset': 'thermostat.n.01', 'synonyms': ['thermostat'], 'def': 'a regulator for automatically regulating temperature by starting or stopping the supply of heat', 'name': 'thermostat'}, {'frequency': 'r', 'id': 1104, 'synset': 'thimble.n.02', 'synonyms': ['thimble'], 'def': 'a small metal cap to protect the finger while sewing; can be used as a small container', 'name': 'thimble'}, {'frequency': 'c', 'id': 1105, 'synset': 'thread.n.01', 'synonyms': ['thread', 'yarn'], 'def': 'a fine cord of twisted fibers (of cotton or silk or wool or nylon etc.) used in sewing and weaving', 'name': 'thread'}, {'frequency': 'c', 'id': 1106, 'synset': 'thumbtack.n.01', 'synonyms': ['thumbtack', 'drawing_pin', 'pushpin'], 'def': 'a tack for attaching papers to a bulletin board or drawing board', 'name': 'thumbtack'}, {'frequency': 'c', 'id': 1107, 'synset': 'tiara.n.01', 'synonyms': ['tiara'], 'def': 'a jeweled headdress worn by women on formal occasions', 'name': 'tiara'}, {'frequency': 'c', 'id': 1108, 'synset': 'tiger.n.02', 'synonyms': ['tiger'], 'def': 'large feline of forests in most of Asia having a tawny coat with black stripes', 'name': 'tiger'}, {'frequency': 'c', 'id': 1109, 'synset': 'tights.n.01', 'synonyms': ['tights_(clothing)', 'leotards'], 'def': 'skintight knit hose covering the body from the waist to the feet worn by acrobats and dancers and as stockings by women and girls', 'name': 'tights_(clothing)'}, {'frequency': 'c', 'id': 1110, 'synset': 'timer.n.01', 'synonyms': ['timer', 'stopwatch'], 'def': 'a timepiece that measures a time interval and signals its end', 'name': 'timer'}, {'frequency': 'f', 'id': 1111, 'synset': 'tinfoil.n.01', 'synonyms': ['tinfoil'], 'def': 'foil made of tin or an alloy of tin and lead', 'name': 'tinfoil'}, {'frequency': 'r', 'id': 1112, 'synset': 'tinsel.n.01', 'synonyms': ['tinsel'], 'def': 'a showy decoration that is basically valueless', 'name': 'tinsel'}, {'frequency': 'f', 'id': 1113, 'synset': 'tissue.n.02', 'synonyms': ['tissue_paper'], 'def': 'a soft thin (usually translucent) paper', 'name': 'tissue_paper'}, {'frequency': 'c', 'id': 1114, 'synset': 'toast.n.01', 'synonyms': ['toast_(food)'], 'def': 'slice of bread that has been toasted', 'name': 'toast_(food)'}, {'frequency': 'f', 'id': 1115, 'synset': 'toaster.n.02', 'synonyms': ['toaster'], 'def': 'a kitchen appliance (usually electric) for toasting bread', 'name': 'toaster'}, {'frequency': 'c', 'id': 1116, 'synset': 'toaster_oven.n.01', 'synonyms': ['toaster_oven'], 'def': 'kitchen appliance consisting of a small electric oven for toasting or warming food', 'name': 'toaster_oven'}, {'frequency': 'f', 'id': 1117, 'synset': 'toilet.n.02', 'synonyms': ['toilet'], 'def': 'a plumbing fixture for defecation and urination', 'name': 'toilet'}, {'frequency': 'f', 'id': 1118, 'synset': 'toilet_tissue.n.01', 'synonyms': ['toilet_tissue', 'toilet_paper', 'bathroom_tissue'], 'def': 'a soft thin absorbent paper for use in toilets', 'name': 'toilet_tissue'}, {'frequency': 'f', 'id': 1119, 'synset': 'tomato.n.01', 'synonyms': ['tomato'], 'def': 'mildly acid red or yellow pulpy fruit eaten as a vegetable', 'name': 'tomato'}, {'frequency': 'c', 'id': 1120, 'synset': 'tongs.n.01', 'synonyms': ['tongs'], 'def': 'any of various devices for taking hold of objects; usually have two hinged legs with handles above and pointed hooks below', 'name': 'tongs'}, {'frequency': 'c', 'id': 1121, 'synset': 'toolbox.n.01', 'synonyms': ['toolbox'], 'def': 'a box or chest or cabinet for holding hand tools', 'name': 'toolbox'}, {'frequency': 'f', 'id': 1122, 'synset': 'toothbrush.n.01', 'synonyms': ['toothbrush'], 'def': 'small brush; has long handle; used to clean teeth', 'name': 'toothbrush'}, {'frequency': 'f', 'id': 1123, 'synset': 'toothpaste.n.01', 'synonyms': ['toothpaste'], 'def': 'a dentifrice in the form of a paste', 'name': 'toothpaste'}, {'frequency': 'c', 'id': 1124, 'synset': 'toothpick.n.01', 'synonyms': ['toothpick'], 'def': 'pick consisting of a small strip of wood or plastic; used to pick food from between the teeth', 'name': 'toothpick'}, {'frequency': 'c', 'id': 1125, 'synset': 'top.n.09', 'synonyms': ['cover'], 'def': 'covering for a hole (especially a hole in the top of a container)', 'name': 'cover'}, {'frequency': 'c', 'id': 1126, 'synset': 'tortilla.n.01', 'synonyms': ['tortilla'], 'def': 'thin unleavened pancake made from cornmeal or wheat flour', 'name': 'tortilla'}, {'frequency': 'c', 'id': 1127, 'synset': 'tow_truck.n.01', 'synonyms': ['tow_truck'], 'def': 'a truck equipped to hoist and pull wrecked cars (or to remove cars from no-parking zones)', 'name': 'tow_truck'}, {'frequency': 'f', 'id': 1128, 'synset': 'towel.n.01', 'synonyms': ['towel'], 'def': 'a rectangular piece of absorbent cloth (or paper) for drying or wiping', 'name': 'towel'}, {'frequency': 'f', 'id': 1129, 'synset': 'towel_rack.n.01', 'synonyms': ['towel_rack', 'towel_rail', 'towel_bar'], 'def': 'a rack consisting of one or more bars on which towels can be hung', 'name': 'towel_rack'}, {'frequency': 'f', 'id': 1130, 'synset': 'toy.n.03', 'synonyms': ['toy'], 'def': 'a device regarded as providing amusement', 'name': 'toy'}, {'frequency': 'c', 'id': 1131, 'synset': 'tractor.n.01', 'synonyms': ['tractor_(farm_equipment)'], 'def': 'a wheeled vehicle with large wheels; used in farming and other applications', 'name': 'tractor_(farm_equipment)'}, {'frequency': 'f', 'id': 1132, 'synset': 'traffic_light.n.01', 'synonyms': ['traffic_light'], 'def': 'a device to control vehicle traffic often consisting of three or more lights', 'name': 'traffic_light'}, {'frequency': 'r', 'id': 1133, 'synset': 'trail_bike.n.01', 'synonyms': ['dirt_bike'], 'def': 'a lightweight motorcycle equipped with rugged tires and suspension for off-road use', 'name': 'dirt_bike'}, {'frequency': 'c', 'id': 1134, 'synset': 'trailer_truck.n.01', 'synonyms': ['trailer_truck', 'tractor_trailer', 'trucking_rig', 'articulated_lorry', 'semi_truck'], 'def': 'a truck consisting of a tractor and trailer together', 'name': 'trailer_truck'}, {'frequency': 'f', 'id': 1135, 'synset': 'train.n.01', 'synonyms': ['train_(railroad_vehicle)', 'railroad_train'], 'def': 'public or private transport provided by a line of railway cars coupled together and drawn by a locomotive', 'name': 'train_(railroad_vehicle)'}, {'frequency': 'r', 'id': 1136, 'synset': 'trampoline.n.01', 'synonyms': ['trampoline'], 'def': 'gymnastic apparatus consisting of a strong canvas sheet attached with springs to a metal frame', 'name': 'trampoline'}, {'frequency': 'f', 'id': 1137, 'synset': 'tray.n.01', 'synonyms': ['tray'], 'def': 'an open receptacle for holding or displaying or serving articles or food', 'name': 'tray'}, {'frequency': 'r', 'id': 1138, 'synset': 'tree_house.n.01', 'synonyms': ['tree_house'], 'def': '(NOT A TREE) a PLAYHOUSE built in the branches of a tree', 'name': 'tree_house'}, {'frequency': 'r', 'id': 1139, 'synset': 'trench_coat.n.01', 'synonyms': ['trench_coat'], 'def': 'a military style raincoat; belted with deep pockets', 'name': 'trench_coat'}, {'frequency': 'r', 'id': 1140, 'synset': 'triangle.n.05', 'synonyms': ['triangle_(musical_instrument)'], 'def': 'a percussion instrument consisting of a metal bar bent in the shape of an open triangle', 'name': 'triangle_(musical_instrument)'}, {'frequency': 'r', 'id': 1141, 'synset': 'tricycle.n.01', 'synonyms': ['tricycle'], 'def': 'a vehicle with three wheels that is moved by foot pedals', 'name': 'tricycle'}, {'frequency': 'c', 'id': 1142, 'synset': 'tripod.n.01', 'synonyms': ['tripod'], 'def': 'a three-legged rack used for support', 'name': 'tripod'}, {'frequency': 'f', 'id': 1143, 'synset': 'trouser.n.01', 'synonyms': ['trousers', 'pants_(clothing)'], 'def': 'a garment extending from the waist to the knee or ankle, covering each leg separately', 'name': 'trousers'}, {'frequency': 'f', 'id': 1144, 'synset': 'truck.n.01', 'synonyms': ['truck'], 'def': 'an automotive vehicle suitable for hauling', 'name': 'truck'}, {'frequency': 'r', 'id': 1145, 'synset': 'truffle.n.03', 'synonyms': ['truffle_(chocolate)', 'chocolate_truffle'], 'def': 'creamy chocolate candy', 'name': 'truffle_(chocolate)'}, {'frequency': 'c', 'id': 1146, 'synset': 'trunk.n.02', 'synonyms': ['trunk'], 'def': 'luggage consisting of a large strong case used when traveling or for storage', 'name': 'trunk'}, {'frequency': 'r', 'id': 1147, 'synset': 'tub.n.02', 'synonyms': ['vat'], 'def': 'a large open vessel for holding or storing liquids', 'name': 'vat'}, {'frequency': 'c', 'id': 1148, 'synset': 'turban.n.01', 'synonyms': ['turban'], 'def': 'a traditional headdress consisting of a long scarf wrapped around the head', 'name': 'turban'}, {'frequency': 'r', 'id': 1149, 'synset': 'turkey.n.01', 'synonyms': ['turkey_(bird)'], 'def': 'large gallinaceous bird with fan-shaped tail; widely domesticated for food', 'name': 'turkey_(bird)'}, {'frequency': 'c', 'id': 1150, 'synset': 'turkey.n.04', 'synonyms': ['turkey_(food)'], 'def': 'flesh of large domesticated fowl usually roasted', 'name': 'turkey_(food)'}, {'frequency': 'r', 'id': 1151, 'synset': 'turnip.n.01', 'synonyms': ['turnip'], 'def': 'widely cultivated plant having a large fleshy edible white or yellow root', 'name': 'turnip'}, {'frequency': 'c', 'id': 1152, 'synset': 'turtle.n.02', 'synonyms': ['turtle'], 'def': 'any of various aquatic and land reptiles having a bony shell and flipper-like limbs for swimming', 'name': 'turtle'}, {'frequency': 'r', 'id': 1153, 'synset': 'turtleneck.n.01', 'synonyms': ['turtleneck_(clothing)', 'polo-neck'], 'def': 'a sweater or jersey with a high close-fitting collar', 'name': 'turtleneck_(clothing)'}, {'frequency': 'r', 'id': 1154, 'synset': 'typewriter.n.01', 'synonyms': ['typewriter'], 'def': 'hand-operated character printer for printing written messages one character at a time', 'name': 'typewriter'}, {'frequency': 'f', 'id': 1155, 'synset': 'umbrella.n.01', 'synonyms': ['umbrella'], 'def': 'a lightweight handheld collapsible canopy', 'name': 'umbrella'}, {'frequency': 'c', 'id': 1156, 'synset': 'underwear.n.01', 'synonyms': ['underwear', 'underclothes', 'underclothing', 'underpants'], 'def': 'undergarment worn next to the skin and under the outer garments', 'name': 'underwear'}, {'frequency': 'r', 'id': 1157, 'synset': 'unicycle.n.01', 'synonyms': ['unicycle'], 'def': 'a vehicle with a single wheel that is driven by pedals', 'name': 'unicycle'}, {'frequency': 'c', 'id': 1158, 'synset': 'urinal.n.01', 'synonyms': ['urinal'], 'def': 'a plumbing fixture (usually attached to the wall) used by men to urinate', 'name': 'urinal'}, {'frequency': 'r', 'id': 1159, 'synset': 'urn.n.01', 'synonyms': ['urn'], 'def': 'a large vase that usually has a pedestal or feet', 'name': 'urn'}, {'frequency': 'c', 'id': 1160, 'synset': 'vacuum.n.04', 'synonyms': ['vacuum_cleaner'], 'def': 'an electrical home appliance that cleans by suction', 'name': 'vacuum_cleaner'}, {'frequency': 'c', 'id': 1161, 'synset': 'valve.n.03', 'synonyms': ['valve'], 'def': 'control consisting of a mechanical device for controlling the flow of a fluid', 'name': 'valve'}, {'frequency': 'f', 'id': 1162, 'synset': 'vase.n.01', 'synonyms': ['vase'], 'def': 'an open jar of glass or porcelain used as an ornament or to hold flowers', 'name': 'vase'}, {'frequency': 'c', 'id': 1163, 'synset': 'vending_machine.n.01', 'synonyms': ['vending_machine'], 'def': 'a slot machine for selling goods', 'name': 'vending_machine'}, {'frequency': 'f', 'id': 1164, 'synset': 'vent.n.01', 'synonyms': ['vent', 'blowhole', 'air_vent'], 'def': 'a hole for the escape of gas or air', 'name': 'vent'}, {'frequency': 'c', 'id': 1165, 'synset': 'videotape.n.01', 'synonyms': ['videotape'], 'def': 'a video recording made on magnetic tape', 'name': 'videotape'}, {'frequency': 'r', 'id': 1166, 'synset': 'vinegar.n.01', 'synonyms': ['vinegar'], 'def': 'sour-tasting liquid produced usually by oxidation of the alcohol in wine or cider and used as a condiment or food preservative', 'name': 'vinegar'}, {'frequency': 'r', 'id': 1167, 'synset': 'violin.n.01', 'synonyms': ['violin', 'fiddle'], 'def': 'bowed stringed instrument that is the highest member of the violin family', 'name': 'violin'}, {'frequency': 'r', 'id': 1168, 'synset': 'vodka.n.01', 'synonyms': ['vodka'], 'def': 'unaged colorless liquor originating in Russia', 'name': 'vodka'}, {'frequency': 'r', 'id': 1169, 'synset': 'volleyball.n.02', 'synonyms': ['volleyball'], 'def': 'an inflated ball used in playing volleyball', 'name': 'volleyball'}, {'frequency': 'r', 'id': 1170, 'synset': 'vulture.n.01', 'synonyms': ['vulture'], 'def': 'any of various large birds of prey having naked heads and weak claws and feeding chiefly on carrion', 'name': 'vulture'}, {'frequency': 'c', 'id': 1171, 'synset': 'waffle.n.01', 'synonyms': ['waffle'], 'def': 'pancake batter baked in a waffle iron', 'name': 'waffle'}, {'frequency': 'r', 'id': 1172, 'synset': 'waffle_iron.n.01', 'synonyms': ['waffle_iron'], 'def': 'a kitchen appliance for baking waffles', 'name': 'waffle_iron'}, {'frequency': 'c', 'id': 1173, 'synset': 'wagon.n.01', 'synonyms': ['wagon'], 'def': 'any of various kinds of wheeled vehicles drawn by an animal or a tractor', 'name': 'wagon'}, {'frequency': 'c', 'id': 1174, 'synset': 'wagon_wheel.n.01', 'synonyms': ['wagon_wheel'], 'def': 'a wheel of a wagon', 'name': 'wagon_wheel'}, {'frequency': 'c', 'id': 1175, 'synset': 'walking_stick.n.01', 'synonyms': ['walking_stick'], 'def': 'a stick carried in the hand for support in walking', 'name': 'walking_stick'}, {'frequency': 'c', 'id': 1176, 'synset': 'wall_clock.n.01', 'synonyms': ['wall_clock'], 'def': 'a clock mounted on a wall', 'name': 'wall_clock'}, {'frequency': 'f', 'id': 1177, 'synset': 'wall_socket.n.01', 'synonyms': ['wall_socket', 'wall_plug', 'electric_outlet', 'electrical_outlet', 'outlet', 'electric_receptacle'], 'def': 'receptacle providing a place in a wiring system where current can be taken to run electrical devices', 'name': 'wall_socket'}, {'frequency': 'c', 'id': 1178, 'synset': 'wallet.n.01', 'synonyms': ['wallet', 'billfold'], 'def': 'a pocket-size case for holding papers and paper money', 'name': 'wallet'}, {'frequency': 'r', 'id': 1179, 'synset': 'walrus.n.01', 'synonyms': ['walrus'], 'def': 'either of two large northern marine mammals having ivory tusks and tough hide over thick blubber', 'name': 'walrus'}, {'frequency': 'r', 'id': 1180, 'synset': 'wardrobe.n.01', 'synonyms': ['wardrobe'], 'def': 'a tall piece of furniture that provides storage space for clothes; has a door and rails or hooks for hanging clothes', 'name': 'wardrobe'}, {'frequency': 'r', 'id': 1181, 'synset': 'wasabi.n.02', 'synonyms': ['wasabi'], 'def': 'the thick green root of the wasabi plant that the Japanese use in cooking and that tastes like strong horseradish', 'name': 'wasabi'}, {'frequency': 'c', 'id': 1182, 'synset': 'washer.n.03', 'synonyms': ['automatic_washer', 'washing_machine'], 'def': 'a home appliance for washing clothes and linens automatically', 'name': 'automatic_washer'}, {'frequency': 'f', 'id': 1183, 'synset': 'watch.n.01', 'synonyms': ['watch', 'wristwatch'], 'def': 'a small, portable timepiece', 'name': 'watch'}, {'frequency': 'f', 'id': 1184, 'synset': 'water_bottle.n.01', 'synonyms': ['water_bottle'], 'def': 'a bottle for holding water', 'name': 'water_bottle'}, {'frequency': 'c', 'id': 1185, 'synset': 'water_cooler.n.01', 'synonyms': ['water_cooler'], 'def': 'a device for cooling and dispensing drinking water', 'name': 'water_cooler'}, {'frequency': 'c', 'id': 1186, 'synset': 'water_faucet.n.01', 'synonyms': ['water_faucet', 'water_tap', 'tap_(water_faucet)'], 'def': 'a faucet for drawing water from a pipe or cask', 'name': 'water_faucet'}, {'frequency': 'r', 'id': 1187, 'synset': 'water_filter.n.01', 'synonyms': ['water_filter'], 'def': 'a filter to remove impurities from the water supply', 'name': 'water_filter'}, {'frequency': 'r', 'id': 1188, 'synset': 'water_heater.n.01', 'synonyms': ['water_heater', 'hot-water_heater'], 'def': 'a heater and storage tank to supply heated water', 'name': 'water_heater'}, {'frequency': 'r', 'id': 1189, 'synset': 'water_jug.n.01', 'synonyms': ['water_jug'], 'def': 'a jug that holds water', 'name': 'water_jug'}, {'frequency': 'r', 'id': 1190, 'synset': 'water_pistol.n.01', 'synonyms': ['water_gun', 'squirt_gun'], 'def': 'plaything consisting of a toy pistol that squirts water', 'name': 'water_gun'}, {'frequency': 'c', 'id': 1191, 'synset': 'water_scooter.n.01', 'synonyms': ['water_scooter', 'sea_scooter', 'jet_ski'], 'def': 'a motorboat resembling a motor scooter (NOT A SURFBOARD OR WATER SKI)', 'name': 'water_scooter'}, {'frequency': 'c', 'id': 1192, 'synset': 'water_ski.n.01', 'synonyms': ['water_ski'], 'def': 'broad ski for skimming over water towed by a speedboat (DO NOT MARK WATER)', 'name': 'water_ski'}, {'frequency': 'c', 'id': 1193, 'synset': 'water_tower.n.01', 'synonyms': ['water_tower'], 'def': 'a large reservoir for water', 'name': 'water_tower'}, {'frequency': 'c', 'id': 1194, 'synset': 'watering_can.n.01', 'synonyms': ['watering_can'], 'def': 'a container with a handle and a spout with a perforated nozzle; used to sprinkle water over plants', 'name': 'watering_can'}, {'frequency': 'c', 'id': 1195, 'synset': 'watermelon.n.02', 'synonyms': ['watermelon'], 'def': 'large oblong or roundish melon with a hard green rind and sweet watery red or occasionally yellowish pulp', 'name': 'watermelon'}, {'frequency': 'f', 'id': 1196, 'synset': 'weathervane.n.01', 'synonyms': ['weathervane', 'vane_(weathervane)', 'wind_vane'], 'def': 'mechanical device attached to an elevated structure; rotates freely to show the direction of the wind', 'name': 'weathervane'}, {'frequency': 'c', 'id': 1197, 'synset': 'webcam.n.01', 'synonyms': ['webcam'], 'def': 'a digital camera designed to take digital photographs and transmit them over the internet', 'name': 'webcam'}, {'frequency': 'c', 'id': 1198, 'synset': 'wedding_cake.n.01', 'synonyms': ['wedding_cake', 'bridecake'], 'def': 'a rich cake with two or more tiers and covered with frosting and decorations; served at a wedding reception', 'name': 'wedding_cake'}, {'frequency': 'c', 'id': 1199, 'synset': 'wedding_ring.n.01', 'synonyms': ['wedding_ring', 'wedding_band'], 'def': 'a ring given to the bride and/or groom at the wedding', 'name': 'wedding_ring'}, {'frequency': 'f', 'id': 1200, 'synset': 'wet_suit.n.01', 'synonyms': ['wet_suit'], 'def': 'a close-fitting garment made of a permeable material; worn in cold water to retain body heat', 'name': 'wet_suit'}, {'frequency': 'f', 'id': 1201, 'synset': 'wheel.n.01', 'synonyms': ['wheel'], 'def': 'a circular frame with spokes (or a solid disc) that can rotate on a shaft or axle', 'name': 'wheel'}, {'frequency': 'c', 'id': 1202, 'synset': 'wheelchair.n.01', 'synonyms': ['wheelchair'], 'def': 'a movable chair mounted on large wheels', 'name': 'wheelchair'}, {'frequency': 'c', 'id': 1203, 'synset': 'whipped_cream.n.01', 'synonyms': ['whipped_cream'], 'def': 'cream that has been beaten until light and fluffy', 'name': 'whipped_cream'}, {'frequency': 'r', 'id': 1204, 'synset': 'whiskey.n.01', 'synonyms': ['whiskey'], 'def': 'a liquor made from fermented mash of grain', 'name': 'whiskey'}, {'frequency': 'r', 'id': 1205, 'synset': 'whistle.n.03', 'synonyms': ['whistle'], 'def': 'a small wind instrument that produces a whistling sound by blowing into it', 'name': 'whistle'}, {'frequency': 'r', 'id': 1206, 'synset': 'wick.n.02', 'synonyms': ['wick'], 'def': 'a loosely woven cord in a candle or oil lamp that is lit on fire', 'name': 'wick'}, {'frequency': 'c', 'id': 1207, 'synset': 'wig.n.01', 'synonyms': ['wig'], 'def': 'hairpiece covering the head and made of real or synthetic hair', 'name': 'wig'}, {'frequency': 'c', 'id': 1208, 'synset': 'wind_chime.n.01', 'synonyms': ['wind_chime'], 'def': 'a decorative arrangement of pieces of metal or glass or pottery that hang together loosely so the wind can cause them to tinkle', 'name': 'wind_chime'}, {'frequency': 'c', 'id': 1209, 'synset': 'windmill.n.01', 'synonyms': ['windmill'], 'def': 'a mill that is powered by the wind', 'name': 'windmill'}, {'frequency': 'c', 'id': 1210, 'synset': 'window_box.n.01', 'synonyms': ['window_box_(for_plants)'], 'def': 'a container for growing plants on a windowsill', 'name': 'window_box_(for_plants)'}, {'frequency': 'f', 'id': 1211, 'synset': 'windshield_wiper.n.01', 'synonyms': ['windshield_wiper', 'windscreen_wiper', 'wiper_(for_windshield/screen)'], 'def': 'a mechanical device that cleans the windshield', 'name': 'windshield_wiper'}, {'frequency': 'c', 'id': 1212, 'synset': 'windsock.n.01', 'synonyms': ['windsock', 'air_sock', 'air-sleeve', 'wind_sleeve', 'wind_cone'], 'def': 'a truncated cloth cone mounted on a mast/pole; shows wind direction', 'name': 'windsock'}, {'frequency': 'f', 'id': 1213, 'synset': 'wine_bottle.n.01', 'synonyms': ['wine_bottle'], 'def': 'a bottle for holding wine', 'name': 'wine_bottle'}, {'frequency': 'r', 'id': 1214, 'synset': 'wine_bucket.n.01', 'synonyms': ['wine_bucket', 'wine_cooler'], 'def': 'a bucket of ice used to chill a bottle of wine', 'name': 'wine_bucket'}, {'frequency': 'f', 'id': 1215, 'synset': 'wineglass.n.01', 'synonyms': ['wineglass'], 'def': 'a glass that has a stem and in which wine is served', 'name': 'wineglass'}, {'frequency': 'r', 'id': 1216, 'synset': 'wing_chair.n.01', 'synonyms': ['wing_chair'], 'def': 'easy chair having wings on each side of a high back', 'name': 'wing_chair'}, {'frequency': 'c', 'id': 1217, 'synset': 'winker.n.02', 'synonyms': ['blinder_(for_horses)'], 'def': 'blinds that prevent a horse from seeing something on either side', 'name': 'blinder_(for_horses)'}, {'frequency': 'c', 'id': 1218, 'synset': 'wok.n.01', 'synonyms': ['wok'], 'def': 'pan with a convex bottom; used for frying in Chinese cooking', 'name': 'wok'}, {'frequency': 'r', 'id': 1219, 'synset': 'wolf.n.01', 'synonyms': ['wolf'], 'def': 'a wild carnivorous mammal of the dog family, living and hunting in packs', 'name': 'wolf'}, {'frequency': 'c', 'id': 1220, 'synset': 'wooden_spoon.n.02', 'synonyms': ['wooden_spoon'], 'def': 'a spoon made of wood', 'name': 'wooden_spoon'}, {'frequency': 'c', 'id': 1221, 'synset': 'wreath.n.01', 'synonyms': ['wreath'], 'def': 'an arrangement of flowers, leaves, or stems fastened in a ring', 'name': 'wreath'}, {'frequency': 'c', 'id': 1222, 'synset': 'wrench.n.03', 'synonyms': ['wrench', 'spanner'], 'def': 'a hand tool that is used to hold or twist a nut or bolt', 'name': 'wrench'}, {'frequency': 'c', 'id': 1223, 'synset': 'wristband.n.01', 'synonyms': ['wristband'], 'def': 'band consisting of a part of a sleeve that covers the wrist', 'name': 'wristband'}, {'frequency': 'f', 'id': 1224, 'synset': 'wristlet.n.01', 'synonyms': ['wristlet', 'wrist_band'], 'def': 'a band or bracelet worn around the wrist', 'name': 'wristlet'}, {'frequency': 'r', 'id': 1225, 'synset': 'yacht.n.01', 'synonyms': ['yacht'], 'def': 'an expensive vessel propelled by sail or power and used for cruising or racing', 'name': 'yacht'}, {'frequency': 'r', 'id': 1226, 'synset': 'yak.n.02', 'synonyms': ['yak'], 'def': 'large long-haired wild ox of Tibet often domesticated', 'name': 'yak'}, {'frequency': 'c', 'id': 1227, 'synset': 'yogurt.n.01', 'synonyms': ['yogurt', 'yoghurt', 'yoghourt'], 'def': 'a custard-like food made from curdled milk', 'name': 'yogurt'}, {'frequency': 'r', 'id': 1228, 'synset': 'yoke.n.07', 'synonyms': ['yoke_(animal_equipment)'], 'def': 'gear joining two animals at the neck; NOT egg yolk', 'name': 'yoke_(animal_equipment)'}, {'frequency': 'f', 'id': 1229, 'synset': 'zebra.n.01', 'synonyms': ['zebra'], 'def': 'any of several fleet black-and-white striped African equines', 'name': 'zebra'}, {'frequency': 'c', 'id': 1230, 'synset': 'zucchini.n.02', 'synonyms': ['zucchini', 'courgette'], 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}]  # noqa
+# fmt: on
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/pascal_voc.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/pascal_voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..5872d96575b428e90b29a7759a2f7b32dcc15d25
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/pascal_voc.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import numpy as np
+import os
+import xml.etree.ElementTree as ET
+from fvcore.common.file_io import PathManager
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+
+__all__ = ["register_pascal_voc"]
+
+
+# fmt: off
+CLASS_NAMES = [
+    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
+    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
+    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
+]
+# fmt: on
+
+
+def load_voc_instances(dirname: str, split: str):
+    """
+    Load Pascal VOC detection annotations to Detectron2 format.
+
+    Args:
+        dirname: Contain "Annotations", "ImageSets", "JPEGImages"
+        split (str): one of "train", "test", "val", "trainval"
+    """
+    with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f:
+        fileids = np.loadtxt(f, dtype=np.str)
+
+    # Needs to read many small annotation files. Makes sense at local
+    annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/"))
+    dicts = []
+    for fileid in fileids:
+        anno_file = os.path.join(annotation_dirname, fileid + ".xml")
+        jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg")
+
+        with PathManager.open(anno_file) as f:
+            tree = ET.parse(f)
+
+        r = {
+            "file_name": jpeg_file,
+            "image_id": fileid,
+            "height": int(tree.findall("./size/height")[0].text),
+            "width": int(tree.findall("./size/width")[0].text),
+        }
+        instances = []
+
+        for obj in tree.findall("object"):
+            cls = obj.find("name").text
+            # We include "difficult" samples in training.
+            # Based on limited experiments, they don't hurt accuracy.
+            # difficult = int(obj.find("difficult").text)
+            # if difficult == 1:
+            # continue
+            bbox = obj.find("bndbox")
+            bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]]
+            # Original annotations are integers in the range [1, W or H]
+            # Assuming they mean 1-based pixel indices (inclusive),
+            # a box with annotation (xmin=1, xmax=W) covers the whole image.
+            # In coordinate space this is represented by (xmin=0, xmax=W)
+            bbox[0] -= 1.0
+            bbox[1] -= 1.0
+            instances.append(
+                {"category_id": CLASS_NAMES.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS}
+            )
+        r["annotations"] = instances
+        dicts.append(r)
+    return dicts
+
+
+def register_pascal_voc(name, dirname, split, year):
+    DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split))
+    MetadataCatalog.get(name).set(
+        thing_classes=CLASS_NAMES, dirname=dirname, year=year, split=split
+    )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/register_coco.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/register_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0a4db66f23ffbf42f551bf56e18c7acbfe3f71e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/datasets/register_coco.py
@@ -0,0 +1,129 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import os
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+
+from .coco import load_coco_json, load_sem_seg
+
+"""
+This file contains functions to register a COCO-format dataset to the DatasetCatalog.
+"""
+
+__all__ = ["register_coco_instances", "register_coco_panoptic_separated"]
+
+
+def register_coco_instances(name, metadata, json_file, image_root):
+    """
+    Register a dataset in COCO's json annotation format for
+    instance detection, instance segmentation and keypoint detection.
+    (i.e., Type 1 and 2 in http://cocodataset.org/#format-data.
+    `instances*.json` and `person_keypoints*.json` in the dataset).
+
+    This is an example of how to register a new dataset.
+    You can do something similar to this function, to register new data.
+
+    Args:
+        name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+        metadata (dict): extra metadata associated with this dataset.  You can
+            leave it as an empty dict.
+        json_file (str): path to the json instance annotation file.
+        image_root (str or path-like): directory which contains all the images.
+    """
+    assert isinstance(name, str), name
+    assert isinstance(json_file, (str, os.PathLike)), json_file
+    assert isinstance(image_root, (str, os.PathLike)), image_root
+    # 1. register a function which returns dicts
+    DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name))
+
+    # 2. Optionally, add metadata about this dataset,
+    # since they might be useful in evaluation, visualization or logging
+    MetadataCatalog.get(name).set(
+        json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata
+    )
+
+
+def register_coco_panoptic_separated(
+    name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
+):
+    """
+    Register a COCO panoptic segmentation dataset named `name`.
+    The annotations in this registered dataset will contain both instance annotations and
+    semantic annotations, each with its own contiguous ids. Hence it's called "separated".
+
+    It follows the setting used by the PanopticFPN paper:
+
+    1. The instance annotations directly come from polygons in the COCO
+       instances annotation task, rather than from the masks in the COCO panoptic annotations.
+
+       The two format have small differences:
+       Polygons in the instance annotations may have overlaps.
+       The mask annotations are produced by labeling the overlapped polygons
+       with depth ordering.
+
+    2. The semantic annotations are converted from panoptic annotations, where
+       all "things" are assigned a semantic id of 0.
+       All semantic categories will therefore have ids in contiguous
+       range [1, #stuff_categories].
+
+    This function will also register a pure semantic segmentation dataset
+    named ``name + '_stuffonly'``.
+
+    Args:
+        name (str): the name that identifies a dataset,
+            e.g. "coco_2017_train_panoptic"
+        metadata (dict): extra metadata associated with this dataset.
+        image_root (str): directory which contains all the images
+        panoptic_root (str): directory which contains panoptic annotation images
+        panoptic_json (str): path to the json panoptic annotation file
+        sem_seg_root (str): directory which contains all the ground truth segmentation annotations.
+        instances_json (str): path to the json instance annotation file
+    """
+    panoptic_name = name + "_separated"
+    DatasetCatalog.register(
+        panoptic_name,
+        lambda: merge_to_panoptic(
+            load_coco_json(instances_json, image_root, panoptic_name),
+            load_sem_seg(sem_seg_root, image_root),
+        ),
+    )
+    MetadataCatalog.get(panoptic_name).set(
+        panoptic_root=panoptic_root,
+        image_root=image_root,
+        panoptic_json=panoptic_json,
+        sem_seg_root=sem_seg_root,
+        json_file=instances_json,  # TODO rename
+        evaluator_type="coco_panoptic_seg",
+        **metadata
+    )
+
+    semantic_name = name + "_stuffonly"
+    DatasetCatalog.register(semantic_name, lambda: load_sem_seg(sem_seg_root, image_root))
+    MetadataCatalog.get(semantic_name).set(
+        sem_seg_root=sem_seg_root, image_root=image_root, evaluator_type="sem_seg", **metadata
+    )
+
+
+def merge_to_panoptic(detection_dicts, sem_seg_dicts):
+    """
+    Create dataset dicts for panoptic segmentation, by
+    merging two dicts using "file_name" field to match their entries.
+
+    Args:
+        detection_dicts (list[dict]): lists of dicts for object detection or instance segmentation.
+        sem_seg_dicts (list[dict]): lists of dicts for semantic segmentation.
+
+    Returns:
+        list[dict] (one per input image): Each dict contains all (key, value) pairs from dicts in
+            both detection_dicts and sem_seg_dicts that correspond to the same image.
+            The function assumes that the same key in different dicts has the same value.
+    """
+    results = []
+    sem_seg_file_to_entry = {x["file_name"]: x for x in sem_seg_dicts}
+    assert len(sem_seg_file_to_entry) > 0
+
+    for det_dict in detection_dicts:
+        dic = copy.copy(det_dict)
+        dic.update(sem_seg_file_to_entry[dic["file_name"]])
+        results.append(dic)
+    return results
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/detection_utils.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/detection_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e19c7e2f2b4600b77923141ccd04693d4086562f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/detection_utils.py
@@ -0,0 +1,516 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+"""
+Common data processing utilities that are used in a
+typical object detection data pipeline.
+"""
+import logging
+import numpy as np
+import pycocotools.mask as mask_util
+import torch
+from fvcore.common.file_io import PathManager
+from PIL import Image, ImageOps
+
+from detectron2.structures import (
+    BitMasks,
+    Boxes,
+    BoxMode,
+    Instances,
+    Keypoints,
+    PolygonMasks,
+    RotatedBoxes,
+    polygons_to_bitmask,
+)
+
+from . import transforms as T
+from .catalog import MetadataCatalog
+
+
+class SizeMismatchError(ValueError):
+    """
+    When loaded image has difference width/height compared with annotation.
+    """
+
+
+# https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601
+_M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]]
+_M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]]
+
+
+def convert_PIL_to_numpy(image, format):
+    """
+    Convert PIL image to numpy array of target format.
+
+    Args:
+        image (PIL.Image): a PIL image
+        format (str): the format of output image
+
+    Returns:
+        (np.ndarray): also see `read_image`
+    """
+    if format is not None:
+        # PIL only supports RGB, so convert to RGB and flip channels over below
+        conversion_format = format
+        if format in ["BGR", "YUV-BT.601"]:
+            conversion_format = "RGB"
+        image = image.convert(conversion_format)
+    image = np.asarray(image)
+    # PIL squeezes out the channel dimension for "L", so make it HWC
+    if format == "L":
+        image = np.expand_dims(image, -1)
+
+    # handle formats not supported by PIL
+    elif format == "BGR":
+        # flip channels if needed
+        image = image[:, :, ::-1]
+    elif format == "YUV-BT.601":
+        image = image / 255.0
+        image = np.dot(image, np.array(_M_RGB2YUV).T)
+
+    return image
+
+
+def convert_image_to_rgb(image, format):
+    """
+    Convert numpy image from given format to RGB.
+
+    Args:
+        image (np.ndarray): a numpy image
+        format (str): the format of input image, also see `read_image`
+
+    Returns:
+        (np.ndarray): HWC RGB image in 0-255 range, can be either float or uint8
+    """
+    if format == "BGR":
+        image = image[:, :, [2, 1, 0]]
+    elif format == "YUV-BT.601":
+        image = np.dot(image, np.array(_M_YUV2RGB).T)
+        image = image * 255.0
+    else:
+        if format == "L":
+            image = image[:, :, 0]
+        image = image.astype(np.uint8)
+        image = np.asarray(Image.fromarray(image, mode=format).convert("RGB"))
+    return image
+
+
+def read_image(file_name, format=None):
+    """
+    Read an image into the given format.
+    Will apply rotation and flipping if the image has such exif information.
+
+    Args:
+        file_name (str): image file path
+        format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601"
+
+    Returns:
+        image (np.ndarray): an HWC image in the given format, which is 0-255, uint8 for
+            supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601.
+    """
+    with PathManager.open(file_name, "rb") as f:
+        image = Image.open(f)
+
+        # capture and ignore this bug: https://github.com/python-pillow/Pillow/issues/3973
+        try:
+            image = ImageOps.exif_transpose(image)
+        except Exception:
+            pass
+
+        return convert_PIL_to_numpy(image, format)
+
+
+def check_image_size(dataset_dict, image):
+    """
+    Raise an error if the image does not match the size specified in the dict.
+    """
+    if "width" in dataset_dict or "height" in dataset_dict:
+        image_wh = (image.shape[1], image.shape[0])
+        expected_wh = (dataset_dict["width"], dataset_dict["height"])
+        if not image_wh == expected_wh:
+            raise SizeMismatchError(
+                "Mismatched (W,H){}, got {}, expect {}".format(
+                    " for image " + dataset_dict["file_name"]
+                    if "file_name" in dataset_dict
+                    else "",
+                    image_wh,
+                    expected_wh,
+                )
+            )
+
+    # To ensure bbox always remap to original image size
+    if "width" not in dataset_dict:
+        dataset_dict["width"] = image.shape[1]
+    if "height" not in dataset_dict:
+        dataset_dict["height"] = image.shape[0]
+
+
+def transform_proposals(dataset_dict, image_shape, transforms, min_box_side_len, proposal_topk):
+    """
+    Apply transformations to the proposals in dataset_dict, if any.
+
+    Args:
+        dataset_dict (dict): a dict read from the dataset, possibly
+            contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode"
+        image_shape (tuple): height, width
+        transforms (TransformList):
+        min_box_side_len (int): keep proposals with at least this size
+        proposal_topk (int): only keep top-K scoring proposals
+
+    The input dict is modified in-place, with abovementioned keys removed. A new
+    key "proposals" will be added. Its value is an `Instances`
+    object which contains the transformed proposals in its field
+    "proposal_boxes" and "objectness_logits".
+    """
+    if "proposal_boxes" in dataset_dict:
+        # Transform proposal boxes
+        boxes = transforms.apply_box(
+            BoxMode.convert(
+                dataset_dict.pop("proposal_boxes"),
+                dataset_dict.pop("proposal_bbox_mode"),
+                BoxMode.XYXY_ABS,
+            )
+        )
+        boxes = Boxes(boxes)
+        objectness_logits = torch.as_tensor(
+            dataset_dict.pop("proposal_objectness_logits").astype("float32")
+        )
+
+        boxes.clip(image_shape)
+        keep = boxes.nonempty(threshold=min_box_side_len)
+        boxes = boxes[keep]
+        objectness_logits = objectness_logits[keep]
+
+        proposals = Instances(image_shape)
+        proposals.proposal_boxes = boxes[:proposal_topk]
+        proposals.objectness_logits = objectness_logits[:proposal_topk]
+        dataset_dict["proposals"] = proposals
+
+
+def transform_instance_annotations(
+    annotation, transforms, image_size, *, keypoint_hflip_indices=None
+):
+    """
+    Apply transforms to box, segmentation and keypoints annotations of a single instance.
+
+    It will use `transforms.apply_box` for the box, and
+    `transforms.apply_coords` for segmentation polygons & keypoints.
+    If you need anything more specially designed for each data structure,
+    you'll need to implement your own version of this function or the transforms.
+
+    Args:
+        annotation (dict): dict of instance annotations for a single instance.
+            It will be modified in-place.
+        transforms (TransformList):
+        image_size (tuple): the height, width of the transformed image
+        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
+
+    Returns:
+        dict:
+            the same input dict with fields "bbox", "segmentation", "keypoints"
+            transformed according to `transforms`.
+            The "bbox_mode" field will be set to XYXY_ABS.
+    """
+    bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
+    # Note that bbox is 1d (per-instance bounding box)
+    annotation["bbox"] = transforms.apply_box([bbox])[0]
+    annotation["bbox_mode"] = BoxMode.XYXY_ABS
+
+    if "segmentation" in annotation:
+        # each instance contains 1 or more polygons
+        segm = annotation["segmentation"]
+        if isinstance(segm, list):
+            # polygons
+            polygons = [np.asarray(p).reshape(-1, 2) for p in segm]
+            annotation["segmentation"] = [
+                p.reshape(-1) for p in transforms.apply_polygons(polygons)
+            ]
+        elif isinstance(segm, dict):
+            # RLE
+            mask = mask_util.decode(segm)
+            mask = transforms.apply_segmentation(mask)
+            assert tuple(mask.shape[:2]) == image_size
+            annotation["segmentation"] = mask
+        else:
+            raise ValueError(
+                "Cannot transform segmentation of type '{}'!"
+                "Supported types are: polygons as list[list[float] or ndarray],"
+                " COCO-style RLE as a dict.".format(type(segm))
+            )
+
+    if "keypoints" in annotation:
+        keypoints = transform_keypoint_annotations(
+            annotation["keypoints"], transforms, image_size, keypoint_hflip_indices
+        )
+        annotation["keypoints"] = keypoints
+
+    return annotation
+
+
+def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None):
+    """
+    Transform keypoint annotations of an image.
+
+    Args:
+        keypoints (list[float]): Nx3 float in Detectron2 Dataset format.
+        transforms (TransformList):
+        image_size (tuple): the height, width of the transformed image
+        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
+    """
+    # (N*3,) -> (N, 3)
+    keypoints = np.asarray(keypoints, dtype="float64").reshape(-1, 3)
+    keypoints[:, :2] = transforms.apply_coords(keypoints[:, :2])
+
+    # This assumes that HorizFlipTransform is the only one that does flip
+    do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+
+    # Alternative way: check if probe points was horizontally flipped.
+    # probe = np.asarray([[0.0, 0.0], [image_width, 0.0]])
+    # probe_aug = transforms.apply_coords(probe.copy())
+    # do_hflip = np.sign(probe[1][0] - probe[0][0]) != np.sign(probe_aug[1][0] - probe_aug[0][0])  # noqa
+
+    # If flipped, swap each keypoint with its opposite-handed equivalent
+    if do_hflip:
+        assert keypoint_hflip_indices is not None
+        keypoints = keypoints[keypoint_hflip_indices, :]
+
+    # Maintain COCO convention that if visibility == 0, then x, y = 0
+    # TODO may need to reset visibility for cropped keypoints,
+    # but it does not matter for our existing algorithms
+    keypoints[keypoints[:, 2] == 0] = 0
+    return keypoints
+
+
+def annotations_to_instances(annos, image_size, mask_format="polygon"):
+    """
+    Create an :class:`Instances` object used by the models,
+    from instance annotations in the dataset dict.
+
+    Args:
+        annos (list[dict]): a list of instance annotations in one image, each
+            element for one instance.
+        image_size (tuple): height, width
+
+    Returns:
+        Instances:
+            It will contain fields "gt_boxes", "gt_classes",
+            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
+            This is the format that builtin models expect.
+    """
+    boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
+    target = Instances(image_size)
+    boxes = target.gt_boxes = Boxes(boxes)
+    boxes.clip(image_size)
+
+    classes = [obj["category_id"] for obj in annos]
+    classes = torch.tensor(classes, dtype=torch.int64)
+    target.gt_classes = classes
+
+    if len(annos) and "segmentation" in annos[0]:
+        segms = [obj["segmentation"] for obj in annos]
+        if mask_format == "polygon":
+            masks = PolygonMasks(segms)
+        else:
+            assert mask_format == "bitmask", mask_format
+            masks = []
+            for segm in segms:
+                if isinstance(segm, list):
+                    # polygon
+                    masks.append(polygons_to_bitmask(segm, *image_size))
+                elif isinstance(segm, dict):
+                    # COCO RLE
+                    masks.append(mask_util.decode(segm))
+                elif isinstance(segm, np.ndarray):
+                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
+                        segm.ndim
+                    )
+                    # mask array
+                    masks.append(segm)
+                else:
+                    raise ValueError(
+                        "Cannot convert segmentation of type '{}' to BitMasks!"
+                        "Supported types are: polygons as list[list[float] or ndarray],"
+                        " COCO-style RLE as a dict, or a full-image segmentation mask "
+                        "as a 2D ndarray.".format(type(segm))
+                    )
+            # torch.from_numpy does not support array with negative stride.
+            masks = BitMasks(
+                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
+            )
+        target.gt_masks = masks
+
+    if len(annos) and "keypoints" in annos[0]:
+        kpts = [obj.get("keypoints", []) for obj in annos]
+        target.gt_keypoints = Keypoints(kpts)
+
+    return target
+
+
+def annotations_to_instances_rotated(annos, image_size):
+    """
+    Create an :class:`Instances` object used by the models,
+    from instance annotations in the dataset dict.
+    Compared to `annotations_to_instances`, this function is for rotated boxes only
+
+    Args:
+        annos (list[dict]): a list of instance annotations in one image, each
+            element for one instance.
+        image_size (tuple): height, width
+
+    Returns:
+        Instances:
+            Containing fields "gt_boxes", "gt_classes",
+            if they can be obtained from `annos`.
+            This is the format that builtin models expect.
+    """
+    boxes = [obj["bbox"] for obj in annos]
+    target = Instances(image_size)
+    boxes = target.gt_boxes = RotatedBoxes(boxes)
+    boxes.clip(image_size)
+
+    classes = [obj["category_id"] for obj in annos]
+    classes = torch.tensor(classes, dtype=torch.int64)
+    target.gt_classes = classes
+
+    return target
+
+
+def filter_empty_instances(instances, by_box=True, by_mask=True, box_threshold=1e-5):
+    """
+    Filter out empty instances in an `Instances` object.
+
+    Args:
+        instances (Instances):
+        by_box (bool): whether to filter out instances with empty boxes
+        by_mask (bool): whether to filter out instances with empty masks
+        box_threshold (float): minimum width and height to be considered non-empty
+
+    Returns:
+        Instances: the filtered instances.
+    """
+    assert by_box or by_mask
+    r = []
+    if by_box:
+        r.append(instances.gt_boxes.nonempty(threshold=box_threshold))
+    if instances.has("gt_masks") and by_mask:
+        r.append(instances.gt_masks.nonempty())
+
+    # TODO: can also filter visible keypoints
+
+    if not r:
+        return instances
+    m = r[0]
+    for x in r[1:]:
+        m = m & x
+    return instances[m]
+
+
+def create_keypoint_hflip_indices(dataset_names):
+    """
+    Args:
+        dataset_names (list[str]): list of dataset names
+    Returns:
+        ndarray[int]: a vector of size=#keypoints, storing the
+        horizontally-flipped keypoint indices.
+    """
+
+    check_metadata_consistency("keypoint_names", dataset_names)
+    check_metadata_consistency("keypoint_flip_map", dataset_names)
+
+    meta = MetadataCatalog.get(dataset_names[0])
+    names = meta.keypoint_names
+    # TODO flip -> hflip
+    flip_map = dict(meta.keypoint_flip_map)
+    flip_map.update({v: k for k, v in flip_map.items()})
+    flipped_names = [i if i not in flip_map else flip_map[i] for i in names]
+    flip_indices = [names.index(i) for i in flipped_names]
+    return np.asarray(flip_indices)
+
+
+def gen_crop_transform_with_instance(crop_size, image_size, instance):
+    """
+    Generate a CropTransform so that the cropping region contains
+    the center of the given instance.
+
+    Args:
+        crop_size (tuple): h, w in pixels
+        image_size (tuple): h, w
+        instance (dict): an annotation dict of one instance, in Detectron2's
+            dataset format.
+    """
+    crop_size = np.asarray(crop_size, dtype=np.int32)
+    bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS)
+    center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5
+    assert (
+        image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1]
+    ), "The annotation bounding box is outside of the image!"
+    assert (
+        image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1]
+    ), "Crop size is larger than image size!"
+
+    min_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0)
+    max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0)
+    max_yx = np.minimum(max_yx, np.ceil(center_yx).astype(np.int32))
+
+    y0 = np.random.randint(min_yx[0], max_yx[0] + 1)
+    x0 = np.random.randint(min_yx[1], max_yx[1] + 1)
+    return T.CropTransform(x0, y0, crop_size[1], crop_size[0])
+
+
+def check_metadata_consistency(key, dataset_names):
+    """
+    Check that the data have consistent metadata.
+
+    Args:
+        key (str): a metadata key
+        dataset_names (list[str]): a list of dataset names
+
+    Raises:
+        AttributeError: if the key does not exist in the metadata
+        ValueError: if the given data do not have the same metadata values defined by key
+    """
+    if len(dataset_names) == 0:
+        return
+    logger = logging.getLogger(__name__)
+    entries_per_dataset = [getattr(MetadataCatalog.get(d), key) for d in dataset_names]
+    for idx, entry in enumerate(entries_per_dataset):
+        if entry != entries_per_dataset[0]:
+            logger.error(
+                "Metadata '{}' for dataset '{}' is '{}'".format(key, dataset_names[idx], str(entry))
+            )
+            logger.error(
+                "Metadata '{}' for dataset '{}' is '{}'".format(
+                    key, dataset_names[0], str(entries_per_dataset[0])
+                )
+            )
+            raise ValueError("Datasets have different metadata '{}'!".format(key))
+
+
+def build_transform_gen(cfg, is_train):
+    """
+    Create a list of :class:`TransformGen` from config.
+    Now it includes resizing and flipping.
+
+    Returns:
+        list[TransformGen]
+    """
+    if is_train:
+        min_size = cfg.INPUT.MIN_SIZE_TRAIN
+        max_size = cfg.INPUT.MAX_SIZE_TRAIN
+        sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
+    else:
+        min_size = cfg.INPUT.MIN_SIZE_TEST
+        max_size = cfg.INPUT.MAX_SIZE_TEST
+        sample_style = "choice"
+    if sample_style == "range":
+        assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(
+            len(min_size)
+        )
+
+    logger = logging.getLogger(__name__)
+    tfm_gens = []
+    tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style))
+    if is_train:
+        tfm_gens.append(T.RandomFlip())
+        logger.info("TransformGens used in training: " + str(tfm_gens))
+    return tfm_gens
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/samplers/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cfa8a65259a850b8259016d482a0eac1bbafb38
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/samplers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .distributed_sampler import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler
+from .grouped_batch_sampler import GroupedBatchSampler
+
+__all__ = [
+    "GroupedBatchSampler",
+    "TrainingSampler",
+    "InferenceSampler",
+    "RepeatFactorTrainingSampler",
+]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/samplers/distributed_sampler.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/samplers/distributed_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ac57bbd10519be99114155d717802deac53e8fb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/samplers/distributed_sampler.py
@@ -0,0 +1,199 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+import math
+from collections import defaultdict
+from typing import Optional
+import torch
+from torch.utils.data.sampler import Sampler
+
+from detectron2.utils import comm
+
+
+class TrainingSampler(Sampler):
+    """
+    In training, we only care about the "infinite stream" of training data.
+    So this sampler produces an infinite stream of indices and
+    all workers cooperate to correctly shuffle the indices and sample different indices.
+
+    The samplers in each worker effectively produces `indices[worker_id::num_workers]`
+    where `indices` is an infinite stream of indices consisting of
+    `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True)
+    or `range(size) + range(size) + ...` (if shuffle is False)
+    """
+
+    def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+            shuffle (bool): whether to shuffle the indices or not
+            seed (int): the initial seed of the shuffle. Must be the same
+                across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+        """
+        self._size = size
+        assert size > 0
+        self._shuffle = shuffle
+        if seed is None:
+            seed = comm.shared_random_seed()
+        self._seed = int(seed)
+
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+
+    def __iter__(self):
+        start = self._rank
+        yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
+
+    def _infinite_indices(self):
+        g = torch.Generator()
+        g.manual_seed(self._seed)
+        while True:
+            if self._shuffle:
+                yield from torch.randperm(self._size, generator=g)
+            else:
+                yield from torch.arange(self._size)
+
+
+class RepeatFactorTrainingSampler(Sampler):
+    """
+    Similar to TrainingSampler, but suitable for training on class imbalanced data
+    like LVIS. In each epoch, an image may appear multiple times based on its "repeat
+    factor". The repeat factor for an image is a function of the frequency the rarest
+    category labeled in that image. The "frequency of category c" in [0, 1] is defined
+    as the fraction of images in the training set (without repeats) in which category c
+    appears.
+
+    See :paper:`lvis` (>= v2) Appendix B.2.
+    """
+
+    def __init__(self, dataset_dicts, repeat_thresh, shuffle=True, seed=None):
+        """
+        Args:
+            dataset_dicts (list[dict]): annotations in Detectron2 dataset format.
+            repeat_thresh (float): frequency threshold below which data is repeated.
+            shuffle (bool): whether to shuffle the indices or not
+            seed (int): the initial seed of the shuffle. Must be the same
+                across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+        """
+        self._shuffle = shuffle
+        if seed is None:
+            seed = comm.shared_random_seed()
+        self._seed = int(seed)
+
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+
+        # Get fractional repeat factors and split into whole number (_int_part)
+        # and fractional (_frac_part) parts.
+        rep_factors = self._get_repeat_factors(dataset_dicts, repeat_thresh)
+        self._int_part = torch.trunc(rep_factors)
+        self._frac_part = rep_factors - self._int_part
+
+    def _get_repeat_factors(self, dataset_dicts, repeat_thresh):
+        """
+        Compute (fractional) per-image repeat factors.
+
+        Args:
+            See __init__.
+
+        Returns:
+            torch.Tensor: the i-th element is the repeat factor for the dataset image
+                at index i.
+        """
+        # 1. For each category c, compute the fraction of images that contain it: f(c)
+        category_freq = defaultdict(int)
+        for dataset_dict in dataset_dicts:  # For each image (without repeats)
+            cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+            for cat_id in cat_ids:
+                category_freq[cat_id] += 1
+        num_images = len(dataset_dicts)
+        for k, v in category_freq.items():
+            category_freq[k] = v / num_images
+
+        # 2. For each category c, compute the category-level repeat factor:
+        #    r(c) = max(1, sqrt(t / f(c)))
+        category_rep = {
+            cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq))
+            for cat_id, cat_freq in category_freq.items()
+        }
+
+        # 3. For each image I, compute the image-level repeat factor:
+        #    r(I) = max_{c in I} r(c)
+        rep_factors = []
+        for dataset_dict in dataset_dicts:
+            cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+            rep_factor = max({category_rep[cat_id] for cat_id in cat_ids})
+            rep_factors.append(rep_factor)
+
+        return torch.tensor(rep_factors, dtype=torch.float32)
+
+    def _get_epoch_indices(self, generator):
+        """
+        Create a list of dataset indices (with repeats) to use for one epoch.
+
+        Args:
+            generator (torch.Generator): pseudo random number generator used for
+                stochastic rounding.
+
+        Returns:
+            torch.Tensor: list of dataset indices to use in one epoch. Each index
+                is repeated based on its calculated repeat factor.
+        """
+        # Since repeat factors are fractional, we use stochastic rounding so
+        # that the target repeat factor is achieved in expectation over the
+        # course of training
+        rands = torch.rand(len(self._frac_part), generator=generator)
+        rep_factors = self._int_part + (rands < self._frac_part).float()
+        # Construct a list of indices in which we repeat images as specified
+        indices = []
+        for dataset_index, rep_factor in enumerate(rep_factors):
+            indices.extend([dataset_index] * int(rep_factor.item()))
+        return torch.tensor(indices, dtype=torch.int64)
+
+    def __iter__(self):
+        start = self._rank
+        yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
+
+    def _infinite_indices(self):
+        g = torch.Generator()
+        g.manual_seed(self._seed)
+        while True:
+            # Sample indices with repeats determined by stochastic rounding; each
+            # "epoch" may have a slightly different size due to the rounding.
+            indices = self._get_epoch_indices(g)
+            if self._shuffle:
+                randperm = torch.randperm(len(indices), generator=g)
+                yield from indices[randperm]
+            else:
+                yield from indices
+
+
+class InferenceSampler(Sampler):
+    """
+    Produce indices for inference.
+    Inference needs to run on the __exact__ set of samples,
+    therefore when the total number of samples is not divisible by the number of workers,
+    this sampler produces different number of samples on different workers.
+    """
+
+    def __init__(self, size: int):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+        """
+        self._size = size
+        assert size > 0
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+
+        shard_size = (self._size - 1) // self._world_size + 1
+        begin = shard_size * self._rank
+        end = min(shard_size * (self._rank + 1), self._size)
+        self._local_indices = range(begin, end)
+
+    def __iter__(self):
+        yield from self._local_indices
+
+    def __len__(self):
+        return len(self._local_indices)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/samplers/grouped_batch_sampler.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/samplers/grouped_batch_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..138e106136083383d9f8729f1da930804463b297
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/samplers/grouped_batch_sampler.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+from torch.utils.data.sampler import BatchSampler, Sampler
+
+
+class GroupedBatchSampler(BatchSampler):
+    """
+    Wraps another sampler to yield a mini-batch of indices.
+    It enforces that the batch only contain elements from the same group.
+    It also tries to provide mini-batches which follows an ordering which is
+    as close as possible to the ordering from the original sampler.
+    """
+
+    def __init__(self, sampler, group_ids, batch_size):
+        """
+        Args:
+            sampler (Sampler): Base sampler.
+            group_ids (list[int]): If the sampler produces indices in range [0, N),
+                `group_ids` must be a list of `N` ints which contains the group id of each sample.
+                The group ids must be a set of integers in the range [0, num_groups).
+            batch_size (int): Size of mini-batch.
+        """
+        if not isinstance(sampler, Sampler):
+            raise ValueError(
+                "sampler should be an instance of "
+                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+            )
+        self.sampler = sampler
+        self.group_ids = np.asarray(group_ids)
+        assert self.group_ids.ndim == 1
+        self.batch_size = batch_size
+        groups = np.unique(self.group_ids).tolist()
+
+        # buffer the indices of each group until batch size is reached
+        self.buffer_per_group = {k: [] for k in groups}
+
+    def __iter__(self):
+        for idx in self.sampler:
+            group_id = self.group_ids[idx]
+            group_buffer = self.buffer_per_group[group_id]
+            group_buffer.append(idx)
+            if len(group_buffer) == self.batch_size:
+                yield group_buffer[:]  # yield a copy of the list
+                del group_buffer[:]
+
+    def __len__(self):
+        raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.")
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/transforms/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/transforms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7638bb58009ff3e00eb1373f2faa5dc2f30100d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/transforms/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .transform import *
+from fvcore.transforms.transform import *
+from .transform_gen import *
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/transforms/transform.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/transforms/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd937538da4bed77ccb6a7ee45d7f15dc0281384
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/transforms/transform.py
@@ -0,0 +1,241 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# File: transform.py
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from fvcore.transforms.transform import HFlipTransform, NoOpTransform, Transform
+from PIL import Image
+
+try:
+    import cv2  # noqa
+except ImportError:
+    # OpenCV is an optional dependency at the moment
+    pass
+
+__all__ = ["ExtentTransform", "ResizeTransform", "RotationTransform"]
+
+
+class ExtentTransform(Transform):
+    """
+    Extracts a subregion from the source image and scales it to the output size.
+
+    The fill color is used to map pixels from the source rect that fall outside
+    the source image.
+
+    See: https://pillow.readthedocs.io/en/latest/PIL.html#PIL.ImageTransform.ExtentTransform
+    """
+
+    def __init__(self, src_rect, output_size, interp=Image.LINEAR, fill=0):
+        """
+        Args:
+            src_rect (x0, y0, x1, y1): src coordinates
+            output_size (h, w): dst image size
+            interp: PIL interpolation methods
+            fill: Fill color used when src_rect extends outside image
+        """
+        super().__init__()
+        self._set_attributes(locals())
+
+    def apply_image(self, img, interp=None):
+        h, w = self.output_size
+        ret = Image.fromarray(img).transform(
+            size=(w, h),
+            method=Image.EXTENT,
+            data=self.src_rect,
+            resample=interp if interp else self.interp,
+            fill=self.fill,
+        )
+        return np.asarray(ret)
+
+    def apply_coords(self, coords):
+        # Transform image center from source coordinates into output coordinates
+        # and then map the new origin to the corner of the output image.
+        h, w = self.output_size
+        x0, y0, x1, y1 = self.src_rect
+        new_coords = coords.astype(np.float32)
+        new_coords[:, 0] -= 0.5 * (x0 + x1)
+        new_coords[:, 1] -= 0.5 * (y0 + y1)
+        new_coords[:, 0] *= w / (x1 - x0)
+        new_coords[:, 1] *= h / (y1 - y0)
+        new_coords[:, 0] += 0.5 * w
+        new_coords[:, 1] += 0.5 * h
+        return new_coords
+
+    def apply_segmentation(self, segmentation):
+        segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
+        return segmentation
+
+
+class ResizeTransform(Transform):
+    """
+    Resize the image to a target size.
+    """
+
+    def __init__(self, h, w, new_h, new_w, interp=None):
+        """
+        Args:
+            h, w (int): original image size
+            new_h, new_w (int): new image size
+            interp: PIL interpolation methods, defaults to bilinear.
+        """
+        # TODO decide on PIL vs opencv
+        super().__init__()
+        if interp is None:
+            interp = Image.BILINEAR
+        self._set_attributes(locals())
+
+    def apply_image(self, img, interp=None):
+        assert img.shape[:2] == (self.h, self.w)
+        assert len(img.shape) <= 4
+
+        if img.dtype == np.uint8:
+            pil_image = Image.fromarray(img)
+            interp_method = interp if interp is not None else self.interp
+            pil_image = pil_image.resize((self.new_w, self.new_h), interp_method)
+            ret = np.asarray(pil_image)
+        else:
+            # PIL only supports uint8
+            img = torch.from_numpy(img)
+            shape = list(img.shape)
+            shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:]
+            img = img.view(shape_4d).permute(2, 3, 0, 1)  # hw(c) -> nchw
+            _PIL_RESIZE_TO_INTERPOLATE_MODE = {Image.BILINEAR: "bilinear", Image.BICUBIC: "bicubic"}
+            mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[self.interp]
+            img = F.interpolate(img, (self.new_h, self.new_w), mode=mode, align_corners=False)
+            shape[:2] = (self.new_h, self.new_w)
+            ret = img.permute(2, 3, 0, 1).view(shape).numpy()  # nchw -> hw(c)
+
+        return ret
+
+    def apply_coords(self, coords):
+        coords[:, 0] = coords[:, 0] * (self.new_w * 1.0 / self.w)
+        coords[:, 1] = coords[:, 1] * (self.new_h * 1.0 / self.h)
+        return coords
+
+    def apply_segmentation(self, segmentation):
+        segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
+        return segmentation
+
+    def inverse(self):
+        return ResizeTransform(self.new_h, self.new_w, self.h, self.w, self.interp)
+
+
+class RotationTransform(Transform):
+    """
+    This method returns a copy of this image, rotated the given
+    number of degrees counter clockwise around its center.
+    """
+
+    def __init__(self, h, w, angle, expand=True, center=None, interp=None):
+        """
+        Args:
+            h, w (int): original image size
+            angle (float): degrees for rotation
+            expand (bool): choose if the image should be resized to fit the whole
+                rotated image (default), or simply cropped
+            center (tuple (width, height)): coordinates of the rotation center
+                if left to None, the center will be fit to the center of each image
+                center has no effect if expand=True because it only affects shifting
+            interp: cv2 interpolation method, default cv2.INTER_LINEAR
+        """
+        super().__init__()
+        image_center = np.array((w / 2, h / 2))
+        if center is None:
+            center = image_center
+        if interp is None:
+            interp = cv2.INTER_LINEAR
+        abs_cos, abs_sin = abs(np.cos(np.deg2rad(angle))), abs(np.sin(np.deg2rad(angle)))
+        if expand:
+            # find the new width and height bounds
+            bound_w, bound_h = np.rint(
+                [h * abs_sin + w * abs_cos, h * abs_cos + w * abs_sin]
+            ).astype(int)
+        else:
+            bound_w, bound_h = w, h
+
+        self._set_attributes(locals())
+        self.rm_coords = self.create_rotation_matrix()
+        # Needed because of this problem https://github.com/opencv/opencv/issues/11784
+        self.rm_image = self.create_rotation_matrix(offset=-0.5)
+
+    def apply_image(self, img, interp=None):
+        """
+        demo should be a numpy array, formatted as Height * Width * Nchannels
+        """
+        if len(img) == 0 or self.angle % 360 == 0:
+            return img
+        assert img.shape[:2] == (self.h, self.w)
+        interp = interp if interp is not None else self.interp
+        return cv2.warpAffine(img, self.rm_image, (self.bound_w, self.bound_h), flags=interp)
+
+    def apply_coords(self, coords):
+        """
+        coords should be a N * 2 array-like, containing N couples of (x, y) points
+        """
+        coords = np.asarray(coords, dtype=float)
+        if len(coords) == 0 or self.angle % 360 == 0:
+            return coords
+        return cv2.transform(coords[:, np.newaxis, :], self.rm_coords)[:, 0, :]
+
+    def apply_segmentation(self, segmentation):
+        segmentation = self.apply_image(segmentation, interp=cv2.INTER_NEAREST)
+        return segmentation
+
+    def create_rotation_matrix(self, offset=0):
+        center = (self.center[0] + offset, self.center[1] + offset)
+        rm = cv2.getRotationMatrix2D(tuple(center), self.angle, 1)
+        if self.expand:
+            # Find the coordinates of the center of rotation in the new image
+            # The only point for which we know the future coordinates is the center of the image
+            rot_im_center = cv2.transform(self.image_center[None, None, :] + offset, rm)[0, 0, :]
+            new_center = np.array([self.bound_w / 2, self.bound_h / 2]) + offset - rot_im_center
+            # shift the rotation center to the new coordinates
+            rm[:, 2] += new_center
+        return rm
+
+
+def HFlip_rotated_box(transform, rotated_boxes):
+    """
+    Apply the horizontal flip transform on rotated boxes.
+
+    Args:
+        rotated_boxes (ndarray): Nx5 floating point array of
+            (x_center, y_center, width, height, angle_degrees) format
+            in absolute coordinates.
+    """
+    # Transform x_center
+    rotated_boxes[:, 0] = transform.width - rotated_boxes[:, 0]
+    # Transform angle
+    rotated_boxes[:, 4] = -rotated_boxes[:, 4]
+    return rotated_boxes
+
+
+def Resize_rotated_box(transform, rotated_boxes):
+    """
+    Apply the resizing transform on rotated boxes. For details of how these (approximation)
+    formulas are derived, please refer to :meth:`RotatedBoxes.scale`.
+
+    Args:
+        rotated_boxes (ndarray): Nx5 floating point array of
+            (x_center, y_center, width, height, angle_degrees) format
+            in absolute coordinates.
+    """
+    scale_factor_x = transform.new_w * 1.0 / transform.w
+    scale_factor_y = transform.new_h * 1.0 / transform.h
+    rotated_boxes[:, 0] *= scale_factor_x
+    rotated_boxes[:, 1] *= scale_factor_y
+    theta = rotated_boxes[:, 4] * np.pi / 180.0
+    c = np.cos(theta)
+    s = np.sin(theta)
+    rotated_boxes[:, 2] *= np.sqrt(np.square(scale_factor_x * c) + np.square(scale_factor_y * s))
+    rotated_boxes[:, 3] *= np.sqrt(np.square(scale_factor_x * s) + np.square(scale_factor_y * c))
+    rotated_boxes[:, 4] = np.arctan2(scale_factor_x * s, scale_factor_y * c) * 180 / np.pi
+
+    return rotated_boxes
+
+
+HFlipTransform.register_type("rotated_box", HFlip_rotated_box)
+NoOpTransform.register_type("rotated_box", lambda t, x: x)
+ResizeTransform.register_type("rotated_box", Resize_rotated_box)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/transforms/transform_gen.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/transforms/transform_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..197a0ebf6750a7ea459aa7e14413b4a41adcd42e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/data/transforms/transform_gen.py
@@ -0,0 +1,534 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# File: transformer.py
+
+import inspect
+import numpy as np
+import pprint
+import sys
+from abc import ABCMeta, abstractmethod
+from fvcore.transforms.transform import (
+    BlendTransform,
+    CropTransform,
+    HFlipTransform,
+    NoOpTransform,
+    Transform,
+    TransformList,
+    VFlipTransform,
+)
+from PIL import Image
+
+from .transform import ExtentTransform, ResizeTransform, RotationTransform
+
+__all__ = [
+    "RandomApply",
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomCrop",
+    "RandomExtent",
+    "RandomFlip",
+    "RandomSaturation",
+    "RandomLighting",
+    "RandomRotation",
+    "Resize",
+    "ResizeShortestEdge",
+    "TransformGen",
+    "apply_transform_gens",
+]
+
+
+def check_dtype(img):
+    assert isinstance(img, np.ndarray), "[TransformGen] Needs an numpy array, but got a {}!".format(
+        type(img)
+    )
+    assert not isinstance(img.dtype, np.integer) or (
+        img.dtype == np.uint8
+    ), "[TransformGen] Got image of type {}, use uint8 or floating points instead!".format(
+        img.dtype
+    )
+    assert img.ndim in [2, 3], img.ndim
+
+
+class TransformGen(metaclass=ABCMeta):
+    """
+    TransformGen takes an image of type uint8 in range [0, 255], or
+    floating point in range [0, 1] or [0, 255] as input.
+
+    It creates a :class:`Transform` based on the given image, sometimes with randomness.
+    The transform can then be used to transform images
+    or other data (boxes, points, annotations, etc.) associated with it.
+
+    The assumption made in this class
+    is that the image itself is sufficient to instantiate a transform.
+    When this assumption is not true, you need to create the transforms by your own.
+
+    A list of `TransformGen` can be applied with :func:`apply_transform_gens`.
+    """
+
+    def _init(self, params=None):
+        if params:
+            for k, v in params.items():
+                if k != "self" and not k.startswith("_"):
+                    setattr(self, k, v)
+
+    @abstractmethod
+    def get_transform(self, img):
+        pass
+
+    def _rand_range(self, low=1.0, high=None, size=None):
+        """
+        Uniform float random number between low and high.
+        """
+        if high is None:
+            low, high = 0, low
+        if size is None:
+            size = []
+        return np.random.uniform(low, high, size)
+
+    def __repr__(self):
+        """
+        Produce something like:
+        "MyTransformGen(field1={self.field1}, field2={self.field2})"
+        """
+        try:
+            sig = inspect.signature(self.__init__)
+            classname = type(self).__name__
+            argstr = []
+            for name, param in sig.parameters.items():
+                assert (
+                    param.kind != param.VAR_POSITIONAL and param.kind != param.VAR_KEYWORD
+                ), "The default __repr__ doesn't support *args or **kwargs"
+                assert hasattr(self, name), (
+                    "Attribute {} not found! "
+                    "Default __repr__ only works if attributes match the constructor.".format(name)
+                )
+                attr = getattr(self, name)
+                default = param.default
+                if default is attr:
+                    continue
+                argstr.append("{}={}".format(name, pprint.pformat(attr)))
+            return "{}({})".format(classname, ", ".join(argstr))
+        except AssertionError:
+            return super().__repr__()
+
+    __str__ = __repr__
+
+
+class RandomApply(TransformGen):
+    """
+    Randomly apply the wrapper transformation with a given probability.
+    """
+
+    def __init__(self, transform, prob=0.5):
+        """
+        Args:
+            transform (Transform, TransformGen): the transform to be wrapped
+                by the `RandomApply`. The `transform` can either be a
+                `Transform` or `TransformGen` instance.
+            prob (float): probability between 0.0 and 1.0 that
+                the wrapper transformation is applied
+        """
+        super().__init__()
+        assert isinstance(transform, (Transform, TransformGen)), (
+            f"The given transform must either be a Transform or TransformGen instance. "
+            f"Not {type(transform)}"
+        )
+        assert 0.0 <= prob <= 1.0, f"Probablity must be between 0.0 and 1.0 (given: {prob})"
+        self.prob = prob
+        self.transform = transform
+
+    def get_transform(self, img):
+        do = self._rand_range() < self.prob
+        if do:
+            if isinstance(self.transform, TransformGen):
+                return self.transform.get_transform(img)
+            else:
+                return self.transform
+        else:
+            return NoOpTransform()
+
+
+class RandomFlip(TransformGen):
+    """
+    Flip the image horizontally or vertically with the given probability.
+    """
+
+    def __init__(self, prob=0.5, *, horizontal=True, vertical=False):
+        """
+        Args:
+            prob (float): probability of flip.
+            horizontal (boolean): whether to apply horizontal flipping
+            vertical (boolean): whether to apply vertical flipping
+        """
+        super().__init__()
+
+        if horizontal and vertical:
+            raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.")
+        if not horizontal and not vertical:
+            raise ValueError("At least one of horiz or vert has to be True!")
+        self._init(locals())
+
+    def get_transform(self, img):
+        h, w = img.shape[:2]
+        do = self._rand_range() < self.prob
+        if do:
+            if self.horizontal:
+                return HFlipTransform(w)
+            elif self.vertical:
+                return VFlipTransform(h)
+        else:
+            return NoOpTransform()
+
+
+class Resize(TransformGen):
+    """ Resize image to a target size"""
+
+    def __init__(self, shape, interp=Image.BILINEAR):
+        """
+        Args:
+            shape: (h, w) tuple or a int
+            interp: PIL interpolation method
+        """
+        if isinstance(shape, int):
+            shape = (shape, shape)
+        shape = tuple(shape)
+        self._init(locals())
+
+    def get_transform(self, img):
+        return ResizeTransform(
+            img.shape[0], img.shape[1], self.shape[0], self.shape[1], self.interp
+        )
+
+
+class ResizeShortestEdge(TransformGen):
+    """
+    Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge.
+    If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
+    """
+
+    def __init__(
+        self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR
+    ):
+        """
+        Args:
+            short_edge_length (list[int]): If ``sample_style=="range"``,
+                a [min, max] interval from which to sample the shortest edge length.
+                If ``sample_style=="choice"``, a list of shortest edge lengths to sample from.
+            max_size (int): maximum allowed longest edge length.
+            sample_style (str): either "range" or "choice".
+        """
+        super().__init__()
+        assert sample_style in ["range", "choice"], sample_style
+
+        self.is_range = sample_style == "range"
+        if isinstance(short_edge_length, int):
+            short_edge_length = (short_edge_length, short_edge_length)
+        self._init(locals())
+
+    def get_transform(self, img):
+        h, w = img.shape[:2]
+
+        if self.is_range:
+            size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1)
+        else:
+            size = np.random.choice(self.short_edge_length)
+        if size == 0:
+            return NoOpTransform()
+
+        scale = size * 1.0 / min(h, w)
+        if h < w:
+            newh, neww = size, scale * w
+        else:
+            newh, neww = scale * h, size
+        if max(newh, neww) > self.max_size:
+            scale = self.max_size * 1.0 / max(newh, neww)
+            newh = newh * scale
+            neww = neww * scale
+        neww = int(neww + 0.5)
+        newh = int(newh + 0.5)
+        return ResizeTransform(h, w, newh, neww, self.interp)
+
+
+class RandomRotation(TransformGen):
+    """
+    This method returns a copy of this image, rotated the given
+    number of degrees counter clockwise around the given center.
+    """
+
+    def __init__(self, angle, expand=True, center=None, sample_style="range", interp=None):
+        """
+        Args:
+            angle (list[float]): If ``sample_style=="range"``,
+                a [min, max] interval from which to sample the angle (in degrees).
+                If ``sample_style=="choice"``, a list of angles to sample from
+            expand (bool): choose if the image should be resized to fit the whole
+                rotated image (default), or simply cropped
+            center (list[[float, float]]):  If ``sample_style=="range"``,
+                a [[minx, miny], [maxx, maxy]] relative interval from which to sample the center,
+                [0, 0] being the top left of the image and [1, 1] the bottom right.
+                If ``sample_style=="choice"``, a list of centers to sample from
+                Default: None, which means that the center of rotation is the center of the image
+                center has no effect if expand=True because it only affects shifting
+        """
+        super().__init__()
+        assert sample_style in ["range", "choice"], sample_style
+        self.is_range = sample_style == "range"
+        if isinstance(angle, (float, int)):
+            angle = (angle, angle)
+        if center is not None and isinstance(center[0], (float, int)):
+            center = (center, center)
+        self._init(locals())
+
+    def get_transform(self, img):
+        h, w = img.shape[:2]
+        center = None
+        if self.is_range:
+            angle = np.random.uniform(self.angle[0], self.angle[1])
+            if self.center is not None:
+                center = (
+                    np.random.uniform(self.center[0][0], self.center[1][0]),
+                    np.random.uniform(self.center[0][1], self.center[1][1]),
+                )
+        else:
+            angle = np.random.choice(self.angle)
+            if self.center is not None:
+                center = np.random.choice(self.center)
+
+        if center is not None:
+            center = (w * center[0], h * center[1])  # Convert to absolute coordinates
+
+        return RotationTransform(h, w, angle, expand=self.expand, center=center, interp=self.interp)
+
+
+class RandomCrop(TransformGen):
+    """
+    Randomly crop a subimage out of an image.
+    """
+
+    def __init__(self, crop_type: str, crop_size):
+        """
+        Args:
+            crop_type (str): one of "relative_range", "relative", "absolute".
+                See `config/defaults.py` for explanation.
+            crop_size (tuple[float]): the relative ratio or absolute pixels of
+                height and width
+        """
+        super().__init__()
+        assert crop_type in ["relative_range", "relative", "absolute"]
+        self._init(locals())
+
+    def get_transform(self, img):
+        h, w = img.shape[:2]
+        croph, cropw = self.get_crop_size((h, w))
+        assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self)
+        h0 = np.random.randint(h - croph + 1)
+        w0 = np.random.randint(w - cropw + 1)
+        return CropTransform(w0, h0, cropw, croph)
+
+    def get_crop_size(self, image_size):
+        """
+        Args:
+            image_size (tuple): height, width
+
+        Returns:
+            crop_size (tuple): height, width in absolute pixels
+        """
+        h, w = image_size
+        if self.crop_type == "relative":
+            ch, cw = self.crop_size
+            return int(h * ch + 0.5), int(w * cw + 0.5)
+        elif self.crop_type == "relative_range":
+            crop_size = np.asarray(self.crop_size, dtype=np.float32)
+            ch, cw = crop_size + np.random.rand(2) * (1 - crop_size)
+            return int(h * ch + 0.5), int(w * cw + 0.5)
+        elif self.crop_type == "absolute":
+            return (min(self.crop_size[0], h), min(self.crop_size[1], w))
+        else:
+            NotImplementedError("Unknown crop type {}".format(self.crop_type))
+
+
+class RandomExtent(TransformGen):
+    """
+    Outputs an image by cropping a random "subrect" of the source image.
+
+    The subrect can be parameterized to include pixels outside the source image,
+    in which case they will be set to zeros (i.e. black). The size of the output
+    image will vary with the size of the random subrect.
+    """
+
+    def __init__(self, scale_range, shift_range):
+        """
+        Args:
+            output_size (h, w): Dimensions of output image
+            scale_range (l, h): Range of input-to-output size scaling factor
+            shift_range (x, y): Range of shifts of the cropped subrect. The rect
+                is shifted by [w / 2 * Uniform(-x, x), h / 2 * Uniform(-y, y)],
+                where (w, h) is the (width, height) of the input image. Set each
+                component to zero to crop at the image's center.
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, img):
+        img_h, img_w = img.shape[:2]
+
+        # Initialize src_rect to fit the input image.
+        src_rect = np.array([-0.5 * img_w, -0.5 * img_h, 0.5 * img_w, 0.5 * img_h])
+
+        # Apply a random scaling to the src_rect.
+        src_rect *= np.random.uniform(self.scale_range[0], self.scale_range[1])
+
+        # Apply a random shift to the coordinates origin.
+        src_rect[0::2] += self.shift_range[0] * img_w * (np.random.rand() - 0.5)
+        src_rect[1::2] += self.shift_range[1] * img_h * (np.random.rand() - 0.5)
+
+        # Map src_rect coordinates into image coordinates (center at corner).
+        src_rect[0::2] += 0.5 * img_w
+        src_rect[1::2] += 0.5 * img_h
+
+        return ExtentTransform(
+            src_rect=(src_rect[0], src_rect[1], src_rect[2], src_rect[3]),
+            output_size=(int(src_rect[3] - src_rect[1]), int(src_rect[2] - src_rect[0])),
+        )
+
+
+class RandomContrast(TransformGen):
+    """
+    Randomly transforms image contrast.
+
+    Contrast intensity is uniformly sampled in (intensity_min, intensity_max).
+    - intensity < 1 will reduce contrast
+    - intensity = 1 will preserve the input image
+    - intensity > 1 will increase contrast
+
+    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+    """
+
+    def __init__(self, intensity_min, intensity_max):
+        """
+        Args:
+            intensity_min (float): Minimum augmentation
+            intensity_max (float): Maximum augmentation
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, img):
+        w = np.random.uniform(self.intensity_min, self.intensity_max)
+        return BlendTransform(src_image=img.mean(), src_weight=1 - w, dst_weight=w)
+
+
+class RandomBrightness(TransformGen):
+    """
+    Randomly transforms image brightness.
+
+    Brightness intensity is uniformly sampled in (intensity_min, intensity_max).
+    - intensity < 1 will reduce brightness
+    - intensity = 1 will preserve the input image
+    - intensity > 1 will increase brightness
+
+    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+    """
+
+    def __init__(self, intensity_min, intensity_max):
+        """
+        Args:
+            intensity_min (float): Minimum augmentation
+            intensity_max (float): Maximum augmentation
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, img):
+        w = np.random.uniform(self.intensity_min, self.intensity_max)
+        return BlendTransform(src_image=0, src_weight=1 - w, dst_weight=w)
+
+
+class RandomSaturation(TransformGen):
+    """
+    Randomly transforms image saturation.
+
+    Saturation intensity is uniformly sampled in (intensity_min, intensity_max).
+    - intensity < 1 will reduce saturation (make the image more grayscale)
+    - intensity = 1 will preserve the input image
+    - intensity > 1 will increase saturation
+
+    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+    """
+
+    def __init__(self, intensity_min, intensity_max):
+        """
+        Args:
+            intensity_min (float): Minimum augmentation (1 preserves input).
+            intensity_max (float): Maximum augmentation (1 preserves input).
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, img):
+        assert img.shape[-1] == 3, "Saturation only works on RGB images"
+        w = np.random.uniform(self.intensity_min, self.intensity_max)
+        grayscale = img.dot([0.299, 0.587, 0.114])[:, :, np.newaxis]
+        return BlendTransform(src_image=grayscale, src_weight=1 - w, dst_weight=w)
+
+
+class RandomLighting(TransformGen):
+    """
+    Randomly transforms image color using fixed PCA over ImageNet.
+
+    The degree of color jittering is randomly sampled via a normal distribution,
+    with standard deviation given by the scale parameter.
+    """
+
+    def __init__(self, scale):
+        """
+        Args:
+            scale (float): Standard deviation of principal component weighting.
+        """
+        super().__init__()
+        self._init(locals())
+        self.eigen_vecs = np.array(
+            [[-0.5675, 0.7192, 0.4009], [-0.5808, -0.0045, -0.8140], [-0.5836, -0.6948, 0.4203]]
+        )
+        self.eigen_vals = np.array([0.2175, 0.0188, 0.0045])
+
+    def get_transform(self, img):
+        assert img.shape[-1] == 3, "Saturation only works on RGB images"
+        weights = np.random.normal(scale=self.scale, size=3)
+        return BlendTransform(
+            src_image=self.eigen_vecs.dot(weights * self.eigen_vals), src_weight=1.0, dst_weight=1.0
+        )
+
+
+def apply_transform_gens(transform_gens, img):
+    """
+    Apply a list of :class:`TransformGen` or :class:`Transform` on the input image, and
+    returns the transformed image and a list of transforms.
+
+    We cannot simply create and return all transforms without
+    applying it to the image, because a subsequent transform may
+    need the output of the previous one.
+
+    Args:
+        transform_gens (list): list of :class:`TransformGen` or :class:`Transform` instance to
+            be applied.
+        img (ndarray): uint8 or floating point images with 1 or 3 channels.
+
+    Returns:
+        ndarray: the transformed image
+        TransformList: contain the transforms that's used.
+    """
+    for g in transform_gens:
+        assert isinstance(g, (Transform, TransformGen)), g
+
+    check_dtype(img)
+
+    tfms = []
+    for g in transform_gens:
+        tfm = g.get_transform(img) if isinstance(g, TransformGen) else g
+        assert isinstance(
+            tfm, Transform
+        ), "TransformGen {} must return an instance of Transform! Got {} instead".format(g, tfm)
+        img = tfm.apply_image(img)
+        tfms.append(tfm)
+    return img, TransformList(tfms)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a4538da3e66593e4ef8916cd9cbca3c83b8c14e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .launch import *
+from .train_loop import *
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
+
+
+# prefer to let hooks and defaults live in separate namespaces (therefore not in __all__)
+# but still make them available here
+from .hooks import *
+from .defaults import *
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/defaults.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..db9ab68f21d77b9e3be730c4784abe665df3d96a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/defaults.py
@@ -0,0 +1,531 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+"""
+This file contains components with some default boilerplate logic user may need
+in training / testing. They will not work for everyone, but many users may find them useful.
+
+The behavior of functions/classes in this file is subject to change,
+since they are meant to represent the "common default behavior" people need in their projects.
+"""
+
+import argparse
+import logging
+import os
+import sys
+from collections import OrderedDict
+import torch
+from fvcore.common.file_io import PathManager
+from fvcore.nn.precise_bn import get_bn_modules
+from torch.nn.parallel import DistributedDataParallel
+
+import detectron2.data.transforms as T
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.data import (
+    MetadataCatalog,
+    build_detection_test_loader,
+    build_detection_train_loader,
+)
+from detectron2.evaluation import (
+    DatasetEvaluator,
+    inference_on_dataset,
+    print_csv_format,
+    verify_results,
+)
+from detectron2.modeling import build_model
+from detectron2.solver import build_lr_scheduler, build_optimizer
+from detectron2.utils import comm
+from detectron2.utils.collect_env import collect_env_info
+from detectron2.utils.env import seed_all_rng
+from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
+from detectron2.utils.logger import setup_logger
+
+from . import hooks
+from .train_loop import SimpleTrainer
+
+__all__ = ["default_argument_parser", "default_setup", "DefaultPredictor", "DefaultTrainer"]
+
+
+def default_argument_parser(epilog=None):
+    """
+    Create a parser with some common arguments used by detectron2 users.
+
+    Args:
+        epilog (str): epilog passed to ArgumentParser describing the usage.
+
+    Returns:
+        argparse.ArgumentParser:
+    """
+    parser = argparse.ArgumentParser(
+        epilog=epilog
+        or f"""
+Examples:
+
+Run on single machine:
+    $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth
+
+Run on multiple machines:
+    (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url <URL> [--other-flags]
+    (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url <URL> [--other-flags]
+""",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
+    parser.add_argument(
+        "--resume",
+        action="store_true",
+        help="whether to attempt to resume from the checkpoint directory",
+    )
+    parser.add_argument("--eval-only", action="store_true", help="perform evaluation only")
+    parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*")
+    parser.add_argument("--num-machines", type=int, default=1, help="total number of machines")
+    parser.add_argument(
+        "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)"
+    )
+
+    # PyTorch still may leave orphan processes in multi-gpu training.
+    # Therefore we use a deterministic way to obtain port,
+    # so that users are aware of orphan processes by seeing the port occupied.
+    port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14
+    parser.add_argument(
+        "--dist-url",
+        default="tcp://127.0.0.1:{}".format(port),
+        help="initialization URL for pytorch distributed backend. See "
+        "https://pytorch.org/docs/stable/distributed.html for details.",
+    )
+    parser.add_argument(
+        "opts",
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+def default_setup(cfg, args):
+    """
+    Perform some basic common setups at the beginning of a job, including:
+
+    1. Set up the detectron2 logger
+    2. Log basic information about environment, cmdline arguments, and config
+    3. Backup the config to the output directory
+
+    Args:
+        cfg (CfgNode): the full config to be used
+        args (argparse.NameSpace): the command line arguments to be logged
+    """
+    output_dir = cfg.OUTPUT_DIR
+    if comm.is_main_process() and output_dir:
+        PathManager.mkdirs(output_dir)
+
+    rank = comm.get_rank()
+    setup_logger(output_dir, distributed_rank=rank, name="fvcore")
+    logger = setup_logger(output_dir, distributed_rank=rank)
+
+    logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
+    logger.info("Environment info:\n" + collect_env_info())
+
+    logger.info("Command line arguments: " + str(args))
+    if hasattr(args, "config_file") and args.config_file != "":
+        logger.info(
+            "Contents of args.config_file={}:\n{}".format(
+                args.config_file, PathManager.open(args.config_file, "r").read()
+            )
+        )
+
+    logger.info("Running with full config:\n{}".format(cfg))
+    if comm.is_main_process() and output_dir:
+        # Note: some of our scripts may expect the existence of
+        # config.yaml in output directory
+        path = os.path.join(output_dir, "config.yaml")
+        with PathManager.open(path, "w") as f:
+            f.write(cfg.dump())
+        logger.info("Full config saved to {}".format(path))
+
+    # make sure each worker has a different, yet deterministic seed if specified
+    seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)
+
+    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
+    # typical validation set.
+    if not (hasattr(args, "eval_only") and args.eval_only):
+        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
+
+
+class DefaultPredictor:
+    """
+    Create a simple end-to-end predictor with the given config that runs on
+    single device for a single input image.
+
+    Compared to using the model directly, this class does the following additions:
+
+    1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
+    2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`.
+    3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
+    4. Take one input image and produce a single output, instead of a batch.
+
+    If you'd like to do anything more fancy, please refer to its source code
+    as examples to build and use the model manually.
+
+    Attributes:
+        metadata (Metadata): the metadata of the underlying dataset, obtained from
+            cfg.DATASETS.TEST.
+
+    Examples:
+
+    .. code-block:: python
+
+        pred = DefaultPredictor(cfg)
+        inputs = cv2.imread("input.jpg")
+        outputs = pred(inputs)
+    """
+
+    def __init__(self, cfg):
+        self.cfg = cfg.clone()  # cfg can be modified by model
+        self.model = build_model(self.cfg)
+        self.model.eval()
+        self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
+
+        checkpointer = DetectionCheckpointer(self.model)
+        checkpointer.load(cfg.MODEL.WEIGHTS)
+
+        self.transform_gen = T.ResizeShortestEdge(
+            [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
+        )
+
+        self.input_format = cfg.INPUT.FORMAT
+        assert self.input_format in ["RGB", "BGR"], self.input_format
+
+    def __call__(self, original_image):
+        """
+        Args:
+            original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+
+        Returns:
+            predictions (dict):
+                the output of the model for one image only.
+                See :doc:`/tutorials/models` for details about the format.
+        """
+        with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
+            # Apply pre-processing to image.
+            if self.input_format == "RGB":
+                # whether the model expects BGR inputs or RGB
+                original_image = original_image[:, :, ::-1]
+            height, width = original_image.shape[:2]
+            image = self.transform_gen.get_transform(original_image).apply_image(original_image)
+            image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+
+            inputs = {"image": image, "height": height, "width": width}
+            predictions = self.model([inputs])[0]
+            return predictions
+
+
+class DefaultTrainer(SimpleTrainer):
+    """
+    A trainer with default training logic. Compared to `SimpleTrainer`, it
+    contains the following logic in addition:
+
+    1. Create model, optimizer, scheduler, dataloader from the given config.
+    2. Load a checkpoint or `cfg.MODEL.WEIGHTS`, if exists, when
+       `resume_or_load` is called.
+    3. Register a few common hooks.
+
+    It is created to simplify the **standard model training workflow** and reduce code boilerplate
+    for users who only need the standard training workflow, with standard features.
+    It means this class makes *many assumptions* about your training logic that
+    may easily become invalid in a new research. In fact, any assumptions beyond those made in the
+    :class:`SimpleTrainer` are too much for research.
+
+    The code of this class has been annotated about restrictive assumptions it mades.
+    When they do not work for you, you're encouraged to:
+
+    1. Overwrite methods of this class, OR:
+    2. Use :class:`SimpleTrainer`, which only does minimal SGD training and
+       nothing else. You can then add your own hooks if needed. OR:
+    3. Write your own training loop similar to `tools/plain_train_net.py`.
+
+    Also note that the behavior of this class, like other functions/classes in
+    this file, is not stable, since it is meant to represent the "common default behavior".
+    It is only guaranteed to work well with the standard models and training workflow in detectron2.
+    To obtain more stable behavior, write your own training logic with other public APIs.
+
+    Examples:
+
+    .. code-block:: python
+
+        trainer = DefaultTrainer(cfg)
+        trainer.resume_or_load()  # load last checkpoint or MODEL.WEIGHTS
+        trainer.train()
+
+    Attributes:
+        scheduler:
+        checkpointer (DetectionCheckpointer):
+        cfg (CfgNode):
+    """
+
+    def __init__(self, cfg):
+        """
+        Args:
+            cfg (CfgNode):
+        """
+        logger = logging.getLogger("detectron2")
+        if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
+            setup_logger()
+        # Assume these objects must be constructed in this order.
+        model = self.build_model(cfg)
+        optimizer = self.build_optimizer(cfg, model)
+        data_loader = self.build_train_loader(cfg)
+
+        # For training, wrap with DDP. But don't need this for inference.
+        if comm.get_world_size() > 1:
+            model = DistributedDataParallel(
+                model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
+            )
+        super().__init__(model, data_loader, optimizer)
+
+        self.scheduler = self.build_lr_scheduler(cfg, optimizer)
+        # Assume no other objects need to be checkpointed.
+        # We can later make it checkpoint the stateful hooks
+        self.checkpointer = DetectionCheckpointer(
+            # Assume you want to save checkpoints together with logs/statistics
+            model,
+            cfg.OUTPUT_DIR,
+            optimizer=optimizer,
+            scheduler=self.scheduler,
+        )
+        self.start_iter = 0
+        self.max_iter = cfg.SOLVER.MAX_ITER
+        self.cfg = cfg
+
+        self.register_hooks(self.build_hooks())
+
+    def resume_or_load(self, resume=True):
+        """
+        If `resume==True`, and last checkpoint exists, resume from it and load all
+        checkpointables (eg. optimizer and scheduler).
+
+        Otherwise, load the model specified by the config (skip all checkpointables).
+
+        Args:
+            resume (bool): whether to do resume or not
+        """
+        checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume)
+        self.start_iter = checkpoint.get("iteration", -1) if resume else -1
+        # The checkpoint stores the training iteration that just finished, thus we start
+        # at the next iteration (or iter zero if there's no checkpoint).
+        self.start_iter += 1
+
+    def build_hooks(self):
+        """
+        Build a list of default hooks, including timing, evaluation,
+        checkpointing, lr scheduling, precise BN, writing events.
+
+        Returns:
+            list[HookBase]:
+        """
+        cfg = self.cfg.clone()
+        cfg.defrost()
+        cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN
+
+        ret = [
+            hooks.IterationTimer(),
+            hooks.LRScheduler(self.optimizer, self.scheduler),
+            hooks.PreciseBN(
+                # Run at the same freq as (but before) evaluation.
+                cfg.TEST.EVAL_PERIOD,
+                self.model,
+                # Build a new data loader to not affect training
+                self.build_train_loader(cfg),
+                cfg.TEST.PRECISE_BN.NUM_ITER,
+            )
+            if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
+            else None,
+        ]
+
+        # Do PreciseBN before checkpointer, because it updates the model and need to
+        # be saved by checkpointer.
+        # This is not always the best: if checkpointing has a different frequency,
+        # some checkpoints may have more precise statistics than others.
+        if comm.is_main_process():
+            ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD))
+
+        def test_and_save_results():
+            self._last_eval_results = self.test(self.cfg, self.model)
+            return self._last_eval_results
+
+        # Do evaluation after checkpointer, because then if it fails,
+        # we can use the saved checkpoint to debug.
+        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
+
+        if comm.is_main_process():
+            # run writers in the end, so that evaluation metrics are written
+            ret.append(hooks.PeriodicWriter(self.build_writers(), period=20))
+        return ret
+
+    def build_writers(self):
+        """
+        Build a list of writers to be used. By default it contains
+        writers that write metrics to the screen,
+        a json file, and a tensorboard event file respectively.
+        If you'd like a different list of writers, you can overwrite it in
+        your trainer.
+
+        Returns:
+            list[EventWriter]: a list of :class:`EventWriter` objects.
+
+        It is now implemented by:
+
+        .. code-block:: python
+
+            return [
+                CommonMetricPrinter(self.max_iter),
+                JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
+                TensorboardXWriter(self.cfg.OUTPUT_DIR),
+            ]
+
+        """
+        # Here the default print/log frequency of each writer is used.
+        return [
+            # It may not always print what you want to see, since it prints "common" metrics only.
+            CommonMetricPrinter(self.max_iter),
+            JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
+            TensorboardXWriter(self.cfg.OUTPUT_DIR),
+        ]
+
+    def train(self):
+        """
+        Run training.
+
+        Returns:
+            OrderedDict of results, if evaluation is enabled. Otherwise None.
+        """
+        super().train(self.start_iter, self.max_iter)
+        if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
+            assert hasattr(
+                self, "_last_eval_results"
+            ), "No evaluation results obtained during training!"
+            verify_results(self.cfg, self._last_eval_results)
+            return self._last_eval_results
+
+    @classmethod
+    def build_model(cls, cfg):
+        """
+        Returns:
+            torch.nn.Module:
+
+        It now calls :func:`detectron2.modeling.build_model`.
+        Overwrite it if you'd like a different model.
+        """
+        model = build_model(cfg)
+        logger = logging.getLogger(__name__)
+        logger.info("Model:\n{}".format(model))
+        return model
+
+    @classmethod
+    def build_optimizer(cls, cfg, model):
+        """
+        Returns:
+            torch.optim.Optimizer:
+
+        It now calls :func:`detectron2.solver.build_optimizer`.
+        Overwrite it if you'd like a different optimizer.
+        """
+        return build_optimizer(cfg, model)
+
+    @classmethod
+    def build_lr_scheduler(cls, cfg, optimizer):
+        """
+        It now calls :func:`detectron2.solver.build_lr_scheduler`.
+        Overwrite it if you'd like a different scheduler.
+        """
+        return build_lr_scheduler(cfg, optimizer)
+
+    @classmethod
+    def build_train_loader(cls, cfg):
+        """
+        Returns:
+            iterable
+
+        It now calls :func:`detectron2.data.build_detection_train_loader`.
+        Overwrite it if you'd like a different data loader.
+        """
+        return build_detection_train_loader(cfg)
+
+    @classmethod
+    def build_test_loader(cls, cfg, dataset_name):
+        """
+        Returns:
+            iterable
+
+        It now calls :func:`detectron2.data.build_detection_test_loader`.
+        Overwrite it if you'd like a different data loader.
+        """
+        return build_detection_test_loader(cfg, dataset_name)
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name):
+        """
+        Returns:
+            DatasetEvaluator or None
+
+        It is not implemented by default.
+        """
+        raise NotImplementedError(
+            """
+If you want DefaultTrainer to automatically run evaluation,
+please implement `build_evaluator()` in subclasses (see train_net.py for example).
+Alternatively, you can call evaluation functions yourself (see Colab balloon tutorial for example).
+"""
+        )
+
+    @classmethod
+    def test(cls, cfg, model, evaluators=None):
+        """
+        Args:
+            cfg (CfgNode):
+            model (nn.Module):
+            evaluators (list[DatasetEvaluator] or None): if None, will call
+                :meth:`build_evaluator`. Otherwise, must have the same length as
+                `cfg.DATASETS.TEST`.
+
+        Returns:
+            dict: a dict of result metrics
+        """
+        logger = logging.getLogger(__name__)
+        if isinstance(evaluators, DatasetEvaluator):
+            evaluators = [evaluators]
+        if evaluators is not None:
+            assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
+                len(cfg.DATASETS.TEST), len(evaluators)
+            )
+
+        results = OrderedDict()
+        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
+            data_loader = cls.build_test_loader(cfg, dataset_name)
+            # When evaluators are passed in as arguments,
+            # implicitly assume that evaluators can be created before data_loader.
+            if evaluators is not None:
+                evaluator = evaluators[idx]
+            else:
+                try:
+                    evaluator = cls.build_evaluator(cfg, dataset_name)
+                except NotImplementedError:
+                    logger.warn(
+                        "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
+                        "or implement its `build_evaluator` method."
+                    )
+                    results[dataset_name] = {}
+                    continue
+            results_i = inference_on_dataset(model, data_loader, evaluator)
+            results[dataset_name] = results_i
+            if comm.is_main_process():
+                assert isinstance(
+                    results_i, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results_i
+                )
+                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+                print_csv_format(results_i)
+
+        if len(results) == 1:
+            results = list(results.values())[0]
+        return results
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/hooks.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5085b4561302d2328ab505568dec4e9fc5ee0ad
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/hooks.py
@@ -0,0 +1,427 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import datetime
+import itertools
+import logging
+import os
+import tempfile
+import time
+from collections import Counter
+import torch
+from fvcore.common.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer
+from fvcore.common.file_io import PathManager
+from fvcore.common.timer import Timer
+from fvcore.nn.precise_bn import get_bn_modules, update_bn_stats
+
+import detectron2.utils.comm as comm
+from detectron2.evaluation.testing import flatten_results_dict
+from detectron2.utils.events import EventStorage, EventWriter
+
+from .train_loop import HookBase
+
+__all__ = [
+    "CallbackHook",
+    "IterationTimer",
+    "PeriodicWriter",
+    "PeriodicCheckpointer",
+    "LRScheduler",
+    "AutogradProfiler",
+    "EvalHook",
+    "PreciseBN",
+]
+
+
+"""
+Implement some common hooks.
+"""
+
+
+class CallbackHook(HookBase):
+    """
+    Create a hook using callback functions provided by the user.
+    """
+
+    def __init__(self, *, before_train=None, after_train=None, before_step=None, after_step=None):
+        """
+        Each argument is a function that takes one argument: the trainer.
+        """
+        self._before_train = before_train
+        self._before_step = before_step
+        self._after_step = after_step
+        self._after_train = after_train
+
+    def before_train(self):
+        if self._before_train:
+            self._before_train(self.trainer)
+
+    def after_train(self):
+        if self._after_train:
+            self._after_train(self.trainer)
+        # The functions may be closures that hold reference to the trainer
+        # Therefore, delete them to avoid circular reference.
+        del self._before_train, self._after_train
+        del self._before_step, self._after_step
+
+    def before_step(self):
+        if self._before_step:
+            self._before_step(self.trainer)
+
+    def after_step(self):
+        if self._after_step:
+            self._after_step(self.trainer)
+
+
+class IterationTimer(HookBase):
+    """
+    Track the time spent for each iteration (each run_step call in the trainer).
+    Print a summary in the end of training.
+
+    This hook uses the time between the call to its :meth:`before_step`
+    and :meth:`after_step` methods.
+    Under the convention that :meth:`before_step` of all hooks should only
+    take negligible amount of time, the :class:`IterationTimer` hook should be
+    placed at the beginning of the list of hooks to obtain accurate timing.
+    """
+
+    def __init__(self, warmup_iter=3):
+        """
+        Args:
+            warmup_iter (int): the number of iterations at the beginning to exclude
+                from timing.
+        """
+        self._warmup_iter = warmup_iter
+        self._step_timer = Timer()
+        self._start_time = time.perf_counter()
+        self._total_timer = Timer()
+
+    def before_train(self):
+        self._start_time = time.perf_counter()
+        self._total_timer.reset()
+        self._total_timer.pause()
+
+    def after_train(self):
+        logger = logging.getLogger(__name__)
+        total_time = time.perf_counter() - self._start_time
+        total_time_minus_hooks = self._total_timer.seconds()
+        hook_time = total_time - total_time_minus_hooks
+
+        num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter
+
+        if num_iter > 0 and total_time_minus_hooks > 0:
+            # Speed is meaningful only after warmup
+            # NOTE this format is parsed by grep in some scripts
+            logger.info(
+                "Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
+                    num_iter,
+                    str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
+                    total_time_minus_hooks / num_iter,
+                )
+            )
+
+        logger.info(
+            "Total training time: {} ({} on hooks)".format(
+                str(datetime.timedelta(seconds=int(total_time))),
+                str(datetime.timedelta(seconds=int(hook_time))),
+            )
+        )
+
+    def before_step(self):
+        self._step_timer.reset()
+        self._total_timer.resume()
+
+    def after_step(self):
+        # +1 because we're in after_step
+        iter_done = self.trainer.iter - self.trainer.start_iter + 1
+        if iter_done >= self._warmup_iter:
+            sec = self._step_timer.seconds()
+            self.trainer.storage.put_scalars(time=sec)
+        else:
+            self._start_time = time.perf_counter()
+            self._total_timer.reset()
+
+        self._total_timer.pause()
+
+
+class PeriodicWriter(HookBase):
+    """
+    Write events to EventStorage periodically.
+
+    It is executed every ``period`` iterations and after the last iteration.
+    """
+
+    def __init__(self, writers, period=20):
+        """
+        Args:
+            writers (list[EventWriter]): a list of EventWriter objects
+            period (int):
+        """
+        self._writers = writers
+        for w in writers:
+            assert isinstance(w, EventWriter), w
+        self._period = period
+
+    def after_step(self):
+        if (self.trainer.iter + 1) % self._period == 0 or (
+            self.trainer.iter == self.trainer.max_iter - 1
+        ):
+            for writer in self._writers:
+                writer.write()
+
+    def after_train(self):
+        for writer in self._writers:
+            writer.close()
+
+
+class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase):
+    """
+    Same as :class:`detectron2.checkpoint.PeriodicCheckpointer`, but as a hook.
+
+    Note that when used as a hook,
+    it is unable to save additional data other than what's defined
+    by the given `checkpointer`.
+
+    It is executed every ``period`` iterations and after the last iteration.
+    """
+
+    def before_train(self):
+        self.max_iter = self.trainer.max_iter
+
+    def after_step(self):
+        # No way to use **kwargs
+        self.step(self.trainer.iter)
+
+
+class LRScheduler(HookBase):
+    """
+    A hook which executes a torch builtin LR scheduler and summarizes the LR.
+    It is executed after every iteration.
+    """
+
+    def __init__(self, optimizer, scheduler):
+        """
+        Args:
+            optimizer (torch.optim.Optimizer):
+            scheduler (torch.optim._LRScheduler)
+        """
+        self._optimizer = optimizer
+        self._scheduler = scheduler
+
+        # NOTE: some heuristics on what LR to summarize
+        # summarize the param group with most parameters
+        largest_group = max(len(g["params"]) for g in optimizer.param_groups)
+
+        if largest_group == 1:
+            # If all groups have one parameter,
+            # then find the most common initial LR, and use it for summary
+            lr_count = Counter([g["lr"] for g in optimizer.param_groups])
+            lr = lr_count.most_common()[0][0]
+            for i, g in enumerate(optimizer.param_groups):
+                if g["lr"] == lr:
+                    self._best_param_group_id = i
+                    break
+        else:
+            for i, g in enumerate(optimizer.param_groups):
+                if len(g["params"]) == largest_group:
+                    self._best_param_group_id = i
+                    break
+
+    def after_step(self):
+        lr = self._optimizer.param_groups[self._best_param_group_id]["lr"]
+        self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False)
+        self._scheduler.step()
+
+
+class AutogradProfiler(HookBase):
+    """
+    A hook which runs `torch.autograd.profiler.profile`.
+
+    Examples:
+
+    .. code-block:: python
+
+        hooks.AutogradProfiler(
+             lambda trainer: trainer.iter > 10 and trainer.iter < 20, self.cfg.OUTPUT_DIR
+        )
+
+    The above example will run the profiler for iteration 10~20 and dump
+    results to ``OUTPUT_DIR``. We did not profile the first few iterations
+    because they are typically slower than the rest.
+    The result files can be loaded in the ``chrome://tracing`` page in chrome browser.
+
+    Note:
+        When used together with NCCL on older version of GPUs,
+        autograd profiler may cause deadlock because it unnecessarily allocates
+        memory on every device it sees. The memory management calls, if
+        interleaved with NCCL calls, lead to deadlock on GPUs that do not
+        support `cudaLaunchCooperativeKernelMultiDevice`.
+    """
+
+    def __init__(self, enable_predicate, output_dir, *, use_cuda=True):
+        """
+        Args:
+            enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
+                and returns whether to enable the profiler.
+                It will be called once every step, and can be used to select which steps to profile.
+            output_dir (str): the output directory to dump tracing files.
+            use_cuda (bool): same as in `torch.autograd.profiler.profile`.
+        """
+        self._enable_predicate = enable_predicate
+        self._use_cuda = use_cuda
+        self._output_dir = output_dir
+
+    def before_step(self):
+        if self._enable_predicate(self.trainer):
+            self._profiler = torch.autograd.profiler.profile(use_cuda=self._use_cuda)
+            self._profiler.__enter__()
+        else:
+            self._profiler = None
+
+    def after_step(self):
+        if self._profiler is None:
+            return
+        self._profiler.__exit__(None, None, None)
+        PathManager.mkdirs(self._output_dir)
+        out_file = os.path.join(
+            self._output_dir, "profiler-trace-iter{}.json".format(self.trainer.iter)
+        )
+        if "://" not in out_file:
+            self._profiler.export_chrome_trace(out_file)
+        else:
+            # Support non-posix filesystems
+            with tempfile.TemporaryDirectory(prefix="detectron2_profiler") as d:
+                tmp_file = os.path.join(d, "tmp.json")
+                self._profiler.export_chrome_trace(tmp_file)
+                with open(tmp_file) as f:
+                    content = f.read()
+            with PathManager.open(out_file, "w") as f:
+                f.write(content)
+
+
+class EvalHook(HookBase):
+    """
+    Run an evaluation function periodically, and at the end of training.
+
+    It is executed every ``eval_period`` iterations and after the last iteration.
+    """
+
+    def __init__(self, eval_period, eval_function):
+        """
+        Args:
+            eval_period (int): the period to run `eval_function`.
+            eval_function (callable): a function which takes no arguments, and
+                returns a nested dict of evaluation metrics.
+
+        Note:
+            This hook must be enabled in all or none workers.
+            If you would like only certain workers to perform evaluation,
+            give other workers a no-op function (`eval_function=lambda: None`).
+        """
+        self._period = eval_period
+        self._func = eval_function
+
+    def _do_eval(self):
+        results = self._func()
+
+        if results:
+            assert isinstance(
+                results, dict
+            ), "Eval function must return a dict. Got {} instead.".format(results)
+
+            flattened_results = flatten_results_dict(results)
+            for k, v in flattened_results.items():
+                try:
+                    v = float(v)
+                except Exception:
+                    raise ValueError(
+                        "[EvalHook] eval_function should return a nested dict of float. "
+                        "Got '{}: {}' instead.".format(k, v)
+                    )
+            self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
+
+        # Evaluation may take different time among workers.
+        # A barrier make them start the next iteration together.
+        comm.synchronize()
+
+    def after_step(self):
+        next_iter = self.trainer.iter + 1
+        is_final = next_iter == self.trainer.max_iter
+        if is_final or (self._period > 0 and next_iter % self._period == 0):
+            self._do_eval()
+
+    def after_train(self):
+        # func is likely a closure that holds reference to the trainer
+        # therefore we clean it to avoid circular reference in the end
+        del self._func
+
+
+class PreciseBN(HookBase):
+    """
+    The standard implementation of BatchNorm uses EMA in inference, which is
+    sometimes suboptimal.
+    This class computes the true average of statistics rather than the moving average,
+    and put true averages to every BN layer in the given model.
+
+    It is executed every ``period`` iterations and after the last iteration.
+    """
+
+    def __init__(self, period, model, data_loader, num_iter):
+        """
+        Args:
+            period (int): the period this hook is run, or 0 to not run during training.
+                The hook will always run in the end of training.
+            model (nn.Module): a module whose all BN layers in training mode will be
+                updated by precise BN.
+                Note that user is responsible for ensuring the BN layers to be
+                updated are in training mode when this hook is triggered.
+            data_loader (iterable): it will produce data to be run by `model(data)`.
+            num_iter (int): number of iterations used to compute the precise
+                statistics.
+        """
+        self._logger = logging.getLogger(__name__)
+        if len(get_bn_modules(model)) == 0:
+            self._logger.info(
+                "PreciseBN is disabled because model does not contain BN layers in training mode."
+            )
+            self._disabled = True
+            return
+
+        self._model = model
+        self._data_loader = data_loader
+        self._num_iter = num_iter
+        self._period = period
+        self._disabled = False
+
+        self._data_iter = None
+
+    def after_step(self):
+        next_iter = self.trainer.iter + 1
+        is_final = next_iter == self.trainer.max_iter
+        if is_final or (self._period > 0 and next_iter % self._period == 0):
+            self.update_stats()
+
+    def update_stats(self):
+        """
+        Update the model with precise statistics. Users can manually call this method.
+        """
+        if self._disabled:
+            return
+
+        if self._data_iter is None:
+            self._data_iter = iter(self._data_loader)
+
+        def data_loader():
+            for num_iter in itertools.count(1):
+                if num_iter % 100 == 0:
+                    self._logger.info(
+                        "Running precise-BN ... {}/{} iterations.".format(num_iter, self._num_iter)
+                    )
+                # This way we can reuse the same iterator
+                yield next(self._data_iter)
+
+        with EventStorage():  # capture events in a new storage to discard them
+            self._logger.info(
+                "Running precise-BN for {} iterations...  ".format(self._num_iter)
+                + "Note that this could produce different statistics every time."
+            )
+            update_bn_stats(self._model, data_loader(), self._num_iter)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/launch.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..9efbb0395d2c788d8cfe2cbbf66cde6ddc053585
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/launch.py
@@ -0,0 +1,89 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+from detectron2.utils import comm
+
+__all__ = ["launch"]
+
+
+def _find_free_port():
+    import socket
+
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    # Binding to port 0 will cause the OS to find an available port for us
+    sock.bind(("", 0))
+    port = sock.getsockname()[1]
+    sock.close()
+    # NOTE: there is still a chance the port could be taken by other processes.
+    return port
+
+
+def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, dist_url=None, args=()):
+    """
+    Args:
+        main_func: a function that will be called by `main_func(*args)`
+        num_machines (int): the total number of machines
+        machine_rank (int): the rank of this machine (one per machine)
+        dist_url (str): url to connect to for distributed jobs, including protocol
+                       e.g. "tcp://127.0.0.1:8686".
+                       Can be set to "auto" to automatically select a free port on localhost
+        args (tuple): arguments passed to main_func
+    """
+    world_size = num_machines * num_gpus_per_machine
+    if world_size > 1:
+        # https://github.com/pytorch/pytorch/pull/14391
+        # TODO prctl in spawned processes
+
+        if dist_url == "auto":
+            assert num_machines == 1, "dist_url=auto not supported in multi-machine jobs."
+            port = _find_free_port()
+            dist_url = f"tcp://127.0.0.1:{port}"
+        if num_machines > 1 and dist_url.startswith("file://"):
+            logger = logging.getLogger(__name__)
+            logger.warning(
+                "file:// is not a reliable init_method in multi-machine jobs. Prefer tcp://"
+            )
+
+        mp.spawn(
+            _distributed_worker,
+            nprocs=num_gpus_per_machine,
+            args=(main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args),
+            daemon=False,
+        )
+    else:
+        main_func(*args)
+
+
+def _distributed_worker(
+    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
+):
+    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
+    global_rank = machine_rank * num_gpus_per_machine + local_rank
+    try:
+        dist.init_process_group(
+            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
+        )
+    except Exception as e:
+        logger = logging.getLogger(__name__)
+        logger.error("Process group URL: {}".format(dist_url))
+        raise e
+    # synchronize is needed here to prevent a possible timeout after calling init_process_group
+    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
+    comm.synchronize()
+
+    assert num_gpus_per_machine <= torch.cuda.device_count()
+    torch.cuda.set_device(local_rank)
+
+    # Setup the local process group (which contains ranks within the same machine)
+    assert comm._LOCAL_PROCESS_GROUP is None
+    num_machines = world_size // num_gpus_per_machine
+    for i in range(num_machines):
+        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
+        pg = dist.new_group(ranks_on_i)
+        if i == machine_rank:
+            comm._LOCAL_PROCESS_GROUP = pg
+
+    main_func(*args)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/train_loop.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/train_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..453c9acfde2d65a182fbf18a6bce4b4583df5ca5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/engine/train_loop.py
@@ -0,0 +1,273 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import numpy as np
+import time
+import weakref
+import torch
+
+import detectron2.utils.comm as comm
+from detectron2.utils.events import EventStorage
+
+__all__ = ["HookBase", "TrainerBase", "SimpleTrainer"]
+
+
+class HookBase:
+    """
+    Base class for hooks that can be registered with :class:`TrainerBase`.
+
+    Each hook can implement 4 methods. The way they are called is demonstrated
+    in the following snippet:
+
+    .. code-block:: python
+
+        hook.before_train()
+        for iter in range(start_iter, max_iter):
+            hook.before_step()
+            trainer.run_step()
+            hook.after_step()
+        hook.after_train()
+
+    Notes:
+        1. In the hook method, users can access `self.trainer` to access more
+           properties about the context (e.g., current iteration).
+
+        2. A hook that does something in :meth:`before_step` can often be
+           implemented equivalently in :meth:`after_step`.
+           If the hook takes non-trivial time, it is strongly recommended to
+           implement the hook in :meth:`after_step` instead of :meth:`before_step`.
+           The convention is that :meth:`before_step` should only take negligible time.
+
+           Following this convention will allow hooks that do care about the difference
+           between :meth:`before_step` and :meth:`after_step` (e.g., timer) to
+           function properly.
+
+    Attributes:
+        trainer: A weak reference to the trainer object. Set by the trainer when the hook is
+            registered.
+    """
+
+    def before_train(self):
+        """
+        Called before the first iteration.
+        """
+        pass
+
+    def after_train(self):
+        """
+        Called after the last iteration.
+        """
+        pass
+
+    def before_step(self):
+        """
+        Called before each iteration.
+        """
+        pass
+
+    def after_step(self):
+        """
+        Called after each iteration.
+        """
+        pass
+
+
+class TrainerBase:
+    """
+    Base class for iterative trainer with hooks.
+
+    The only assumption we made here is: the training runs in a loop.
+    A subclass can implement what the loop is.
+    We made no assumptions about the existence of dataloader, optimizer, model, etc.
+
+    Attributes:
+        iter(int): the current iteration.
+
+        start_iter(int): The iteration to start with.
+            By convention the minimum possible value is 0.
+
+        max_iter(int): The iteration to end training.
+
+        storage(EventStorage): An EventStorage that's opened during the course of training.
+    """
+
+    def __init__(self):
+        self._hooks = []
+
+    def register_hooks(self, hooks):
+        """
+        Register hooks to the trainer. The hooks are executed in the order
+        they are registered.
+
+        Args:
+            hooks (list[Optional[HookBase]]): list of hooks
+        """
+        hooks = [h for h in hooks if h is not None]
+        for h in hooks:
+            assert isinstance(h, HookBase)
+            # To avoid circular reference, hooks and trainer cannot own each other.
+            # This normally does not matter, but will cause memory leak if the
+            # involved objects contain __del__:
+            # See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
+            h.trainer = weakref.proxy(self)
+        self._hooks.extend(hooks)
+
+    def train(self, start_iter: int, max_iter: int):
+        """
+        Args:
+            start_iter, max_iter (int): See docs above
+        """
+        logger = logging.getLogger(__name__)
+        logger.info("Starting training from iteration {}".format(start_iter))
+
+        self.iter = self.start_iter = start_iter
+        self.max_iter = max_iter
+
+        with EventStorage(start_iter) as self.storage:
+            try:
+                self.before_train()
+                for self.iter in range(start_iter, max_iter):
+                    self.before_step()
+                    self.run_step()
+                    self.after_step()
+            except Exception:
+                logger.exception("Exception during training:")
+                raise
+            finally:
+                self.after_train()
+
+    def before_train(self):
+        for h in self._hooks:
+            h.before_train()
+
+    def after_train(self):
+        for h in self._hooks:
+            h.after_train()
+
+    def before_step(self):
+        for h in self._hooks:
+            h.before_step()
+
+    def after_step(self):
+        for h in self._hooks:
+            h.after_step()
+        # this guarantees, that in each hook's after_step, storage.iter == trainer.iter
+        self.storage.step()
+
+    def run_step(self):
+        raise NotImplementedError
+
+
+class SimpleTrainer(TrainerBase):
+    """
+    A simple trainer for the most common type of task:
+    single-cost single-optimizer single-data-source iterative optimization.
+    It assumes that every step, you:
+
+    1. Compute the loss with a data from the data_loader.
+    2. Compute the gradients with the above loss.
+    3. Update the model with the optimizer.
+
+    If you want to do anything fancier than this,
+    either subclass TrainerBase and implement your own `run_step`,
+    or write your own training loop.
+    """
+
+    def __init__(self, model, data_loader, optimizer):
+        """
+        Args:
+            model: a torch Module. Takes a data from data_loader and returns a
+                dict of losses.
+            data_loader: an iterable. Contains data to be used to call model.
+            optimizer: a torch optimizer.
+        """
+        super().__init__()
+
+        """
+        We set the model to training mode in the trainer.
+        However it's valid to train a model that's in eval mode.
+        If you want your model (or a submodule of it) to behave
+        like evaluation during training, you can overwrite its train() method.
+        """
+        model.train()
+
+        self.model = model
+        self.data_loader = data_loader
+        self._data_loader_iter = iter(data_loader)
+        self.optimizer = optimizer
+
+    def run_step(self):
+        """
+        Implement the standard training logic described above.
+        """
+        assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
+        start = time.perf_counter()
+        """
+        If you want to do something with the data, you can wrap the dataloader.
+        """
+        data = next(self._data_loader_iter)
+        data_time = time.perf_counter() - start
+
+        """
+        If you want to do something with the losses, you can wrap the model.
+        """
+        loss_dict = self.model(data)
+        losses = sum(loss_dict.values())
+        self._detect_anomaly(losses, loss_dict)
+
+        metrics_dict = loss_dict
+        metrics_dict["data_time"] = data_time
+        self._write_metrics(metrics_dict)
+
+        """
+        If you need to accumulate gradients or something similar, you can
+        wrap the optimizer with your custom `zero_grad()` method.
+        """
+        self.optimizer.zero_grad()
+        losses.backward()
+
+        """
+        If you need gradient clipping/scaling or other processing, you can
+        wrap the optimizer with your custom `step()` method.
+        """
+        self.optimizer.step()
+
+    def _detect_anomaly(self, losses, loss_dict):
+        if not torch.isfinite(losses).all():
+            raise FloatingPointError(
+                "Loss became infinite or NaN at iteration={}!\nloss_dict = {}".format(
+                    self.iter, loss_dict
+                )
+            )
+
+    def _write_metrics(self, metrics_dict: dict):
+        """
+        Args:
+            metrics_dict (dict): dict of scalar metrics
+        """
+        metrics_dict = {
+            k: v.detach().cpu().item() if isinstance(v, torch.Tensor) else float(v)
+            for k, v in metrics_dict.items()
+        }
+        # gather metrics among all workers for logging
+        # This assumes we do DDP-style training, which is currently the only
+        # supported method in detectron2.
+        all_metrics_dict = comm.gather(metrics_dict)
+
+        if comm.is_main_process():
+            if "data_time" in all_metrics_dict[0]:
+                # data_time among workers can have high variance. The actual latency
+                # caused by data_time is the maximum among workers.
+                data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
+                self.storage.put_scalar("data_time", data_time)
+
+            # average the rest metrics
+            metrics_dict = {
+                k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys()
+            }
+            total_losses_reduced = sum(loss for loss in metrics_dict.values())
+
+            self.storage.put_scalar("total_loss", total_losses_reduced)
+            if len(metrics_dict) > 1:
+                self.storage.put_scalars(**metrics_dict)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1d2f1001af2eb46060db362a94d9dae26e3fb4e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .cityscapes_evaluation import CityscapesInstanceEvaluator, CityscapesSemSegEvaluator
+from .coco_evaluation import COCOEvaluator
+from .rotated_coco_evaluation import RotatedCOCOEvaluator
+from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset
+from .lvis_evaluation import LVISEvaluator
+from .panoptic_evaluation import COCOPanopticEvaluator
+from .pascal_voc_evaluation import PascalVOCDetectionEvaluator
+from .sem_seg_evaluation import SemSegEvaluator
+from .testing import print_csv_format, verify_results
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/cityscapes_evaluation.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/cityscapes_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6287a8980b10d9d13f0f0e6a0f0e1a16ff3566c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/cityscapes_evaluation.py
@@ -0,0 +1,187 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import glob
+import logging
+import numpy as np
+import os
+import tempfile
+from collections import OrderedDict
+import torch
+from fvcore.common.file_io import PathManager
+from PIL import Image
+
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+
+from .evaluator import DatasetEvaluator
+
+
+class CityscapesEvaluator(DatasetEvaluator):
+    """
+    Base class for evaluation using cityscapes API.
+    """
+
+    def __init__(self, dataset_name):
+        """
+        Args:
+            dataset_name (str): the name of the dataset.
+                It must have the following metadata associated with it:
+                "thing_classes", "gt_dir".
+        """
+        self._metadata = MetadataCatalog.get(dataset_name)
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+    def reset(self):
+        self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_")
+        self._temp_dir = self._working_dir.name
+        # All workers will write to the same results directory
+        # TODO this does not work in distributed training
+        self._temp_dir = comm.all_gather(self._temp_dir)[0]
+        if self._temp_dir != self._working_dir.name:
+            self._working_dir.cleanup()
+        self._logger.info(
+            "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir)
+        )
+
+
+class CityscapesInstanceEvaluator(CityscapesEvaluator):
+    """
+    Evaluate instance segmentation results using cityscapes API.
+
+    Note:
+        * It does not work in multi-machine distributed training.
+        * It contains a synchronization, therefore has to be used on all ranks.
+        * Only the main process runs evaluation.
+    """
+
+    def process(self, inputs, outputs):
+        from cityscapesscripts.helpers.labels import name2label
+
+        for input, output in zip(inputs, outputs):
+            file_name = input["file_name"]
+            basename = os.path.splitext(os.path.basename(file_name))[0]
+            pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt")
+
+            output = output["instances"].to(self._cpu_device)
+            num_instances = len(output)
+            with open(pred_txt, "w") as fout:
+                for i in range(num_instances):
+                    pred_class = output.pred_classes[i]
+                    classes = self._metadata.thing_classes[pred_class]
+                    class_id = name2label[classes].id
+                    score = output.scores[i]
+                    mask = output.pred_masks[i].numpy().astype("uint8")
+                    png_filename = os.path.join(
+                        self._temp_dir, basename + "_{}_{}.png".format(i, classes)
+                    )
+
+                    Image.fromarray(mask * 255).save(png_filename)
+                    fout.write("{} {} {}\n".format(os.path.basename(png_filename), class_id, score))
+
+    def evaluate(self):
+        """
+        Returns:
+            dict: has a key "segm", whose value is a dict of "AP" and "AP50".
+        """
+        comm.synchronize()
+        if comm.get_rank() > 0:
+            return
+        import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval
+
+        self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
+
+        # set some global states in cityscapes evaluation API, before evaluating
+        cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
+        cityscapes_eval.args.predictionWalk = None
+        cityscapes_eval.args.JSONOutput = False
+        cityscapes_eval.args.colorized = False
+        cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")
+
+        # These lines are adopted from
+        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
+        gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
+        groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png"))
+        assert len(
+            groundTruthImgList
+        ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
+            cityscapes_eval.args.groundTruthSearch
+        )
+        predictionImgList = []
+        for gt in groundTruthImgList:
+            predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args))
+        results = cityscapes_eval.evaluateImgLists(
+            predictionImgList, groundTruthImgList, cityscapes_eval.args
+        )["averages"]
+
+        ret = OrderedDict()
+        ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
+        self._working_dir.cleanup()
+        return ret
+
+
+class CityscapesSemSegEvaluator(CityscapesEvaluator):
+    """
+    Evaluate semantic segmentation results using cityscapes API.
+
+    Note:
+        * It does not work in multi-machine distributed training.
+        * It contains a synchronization, therefore has to be used on all ranks.
+        * Only the main process runs evaluation.
+    """
+
+    def process(self, inputs, outputs):
+        from cityscapesscripts.helpers.labels import trainId2label
+
+        for input, output in zip(inputs, outputs):
+            file_name = input["file_name"]
+            basename = os.path.splitext(os.path.basename(file_name))[0]
+            pred_filename = os.path.join(self._temp_dir, basename + "_pred.png")
+
+            output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy()
+            pred = 255 * np.ones(output.shape, dtype=np.uint8)
+            for train_id, label in trainId2label.items():
+                if label.ignoreInEval:
+                    continue
+                pred[output == train_id] = label.id
+            Image.fromarray(pred).save(pred_filename)
+
+    def evaluate(self):
+        comm.synchronize()
+        if comm.get_rank() > 0:
+            return
+        # Load the Cityscapes eval script *after* setting the required env var,
+        # since the script reads CITYSCAPES_DATASET into global variables at load time.
+        import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval
+
+        self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
+
+        # set some global states in cityscapes evaluation API, before evaluating
+        cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
+        cityscapes_eval.args.predictionWalk = None
+        cityscapes_eval.args.JSONOutput = False
+        cityscapes_eval.args.colorized = False
+
+        # These lines are adopted from
+        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa
+        gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
+        groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png"))
+        assert len(
+            groundTruthImgList
+        ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
+            cityscapes_eval.args.groundTruthSearch
+        )
+        predictionImgList = []
+        for gt in groundTruthImgList:
+            predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt))
+        results = cityscapes_eval.evaluateImgLists(
+            predictionImgList, groundTruthImgList, cityscapes_eval.args
+        )
+        ret = OrderedDict()
+        ret["sem_seg"] = {
+            "IoU": 100.0 * results["averageScoreClasses"],
+            "iIoU": 100.0 * results["averageScoreInstClasses"],
+            "IoU_sup": 100.0 * results["averageScoreCategories"],
+            "iIoU_sup": 100.0 * results["averageScoreInstCategories"],
+        }
+        self._working_dir.cleanup()
+        return ret
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/coco_evaluation.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/coco_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..64b0903a43187db785113267ed16e82be6f5b28c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/coco_evaluation.py
@@ -0,0 +1,512 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import contextlib
+import copy
+import io
+import itertools
+import json
+import logging
+import numpy as np
+import os
+import pickle
+from collections import OrderedDict
+import pycocotools.mask as mask_util
+import torch
+from fvcore.common.file_io import PathManager
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from tabulate import tabulate
+
+import detectron2.utils.comm as comm
+from detectron2.data import MetadataCatalog
+from detectron2.data.datasets.coco import convert_to_coco_json
+from detectron2.structures import Boxes, BoxMode, pairwise_iou
+from detectron2.utils.logger import create_small_table
+
+from .evaluator import DatasetEvaluator
+
+
+class COCOEvaluator(DatasetEvaluator):
+    """
+    Evaluate object proposal, instance detection/segmentation, keypoint detection
+    outputs using COCO's metrics and APIs.
+    """
+
+    def __init__(self, dataset_name, cfg, distributed, output_dir=None):
+        """
+        Args:
+            dataset_name (str): name of the dataset to be evaluated.
+                It must have either the following corresponding metadata:
+
+                    "json_file": the path to the COCO format annotation
+
+                Or it must be in detectron2's standard dataset format
+                so it can be converted to COCO format automatically.
+            cfg (CfgNode): config instance
+            distributed (True): if True, will collect results from all ranks and run evaluation
+                in the main process.
+                Otherwise, will evaluate the results in the current process.
+            output_dir (str): optional, an output directory to dump all
+                results predicted on the dataset. The dump contains two files:
+
+                1. "instance_predictions.pth" a file in torch serialization
+                   format that contains all the raw original predictions.
+                2. "coco_instances_results.json" a json file in COCO's result
+                   format.
+        """
+        self._tasks = self._tasks_from_config(cfg)
+        self._distributed = distributed
+        self._output_dir = output_dir
+
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        if not hasattr(self._metadata, "json_file"):
+            self._logger.warning(
+                f"json_file was not found in MetaDataCatalog for '{dataset_name}'."
+                " Trying to convert it to COCO format ..."
+            )
+
+            cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json")
+            self._metadata.json_file = cache_path
+            convert_to_coco_json(dataset_name, cache_path)
+
+        json_file = PathManager.get_local_path(self._metadata.json_file)
+        with contextlib.redirect_stdout(io.StringIO()):
+            self._coco_api = COCO(json_file)
+
+        self._kpt_oks_sigmas = cfg.TEST.KEYPOINT_OKS_SIGMAS
+        # Test set json files do not contain annotations (evaluation must be
+        # performed using the COCO evaluation server).
+        self._do_evaluation = "annotations" in self._coco_api.split_name
+
+    def reset(self):
+        self._predictions = []
+
+    def _tasks_from_config(self, cfg):
+        """
+        Returns:
+            tuple[str]: tasks that can be evaluated under the given configuration.
+        """
+        tasks = ("bbox",)
+        if cfg.MODEL.MASK_ON:
+            tasks = tasks + ("segm",)
+        if cfg.MODEL.KEYPOINT_ON:
+            tasks = tasks + ("keypoints",)
+        return tasks
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a COCO model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+        """
+        for input, output in zip(inputs, outputs):
+            prediction = {"image_id": input["image_id"]}
+
+            # TODO this is ugly
+            if "instances" in output:
+                instances = output["instances"].to(self._cpu_device)
+                prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
+            if "proposals" in output:
+                prediction["proposals"] = output["proposals"].to(self._cpu_device)
+            self._predictions.append(prediction)
+
+    def evaluate(self):
+        if self._distributed:
+            comm.synchronize()
+            predictions = comm.gather(self._predictions, dst=0)
+            predictions = list(itertools.chain(*predictions))
+
+            if not comm.is_main_process():
+                return {}
+        else:
+            predictions = self._predictions
+
+        if len(predictions) == 0:
+            self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
+            return {}
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(predictions, f)
+
+        self._results = OrderedDict()
+        if "proposals" in predictions[0]:
+            self._eval_box_proposals(predictions)
+        if "instances" in predictions[0]:
+            self._eval_predictions(set(self._tasks), predictions)
+        # Copy so the caller can do whatever with results
+        return copy.deepcopy(self._results)
+
+    def _eval_predictions(self, tasks, predictions):
+        """
+        Evaluate predictions on the given tasks.
+        Fill self._results with the metrics of the tasks.
+        """
+        self._logger.info("Preparing results for COCO format ...")
+        coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
+
+        # unmap the category ids for COCO
+        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+            reverse_id_mapping = {
+                v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+            }
+            for result in coco_results:
+                category_id = result["category_id"]
+                assert (
+                    category_id in reverse_id_mapping
+                ), "A prediction has category_id={}, which is not available in the dataset.".format(
+                    category_id
+                )
+                result["category_id"] = reverse_id_mapping[category_id]
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
+            self._logger.info("Saving results to {}".format(file_path))
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(coco_results))
+                f.flush()
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating predictions ...")
+        for task in sorted(tasks):
+            coco_eval = (
+                _evaluate_predictions_on_coco(
+                    self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas
+                )
+                if len(coco_results) > 0
+                else None  # cocoapi does not handle empty results very well
+            )
+
+            res = self._derive_coco_results(
+                coco_eval, task, class_names=self._metadata.get("thing_classes")
+            )
+            self._results[task] = res
+
+    def _eval_box_proposals(self, predictions):
+        """
+        Evaluate the box proposals in predictions.
+        Fill self._results with the metrics for "box_proposals" task.
+        """
+        if self._output_dir:
+            # Saving generated box proposals to file.
+            # Predicted box_proposals are in XYXY_ABS mode.
+            bbox_mode = BoxMode.XYXY_ABS.value
+            ids, boxes, objectness_logits = [], [], []
+            for prediction in predictions:
+                ids.append(prediction["image_id"])
+                boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
+                objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
+
+            proposal_data = {
+                "boxes": boxes,
+                "objectness_logits": objectness_logits,
+                "ids": ids,
+                "bbox_mode": bbox_mode,
+            }
+            with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
+                pickle.dump(proposal_data, f)
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating bbox proposals ...")
+        res = {}
+        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
+        for limit in [100, 1000]:
+            for area, suffix in areas.items():
+                stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit)
+                key = "AR{}@{:d}".format(suffix, limit)
+                res[key] = float(stats["ar"].item() * 100)
+        self._logger.info("Proposal metrics: \n" + create_small_table(res))
+        self._results["box_proposals"] = res
+
+    def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
+        """
+        Derive the desired score numbers from summarized COCOeval.
+
+        Args:
+            coco_eval (None or COCOEval): None represents no predictions from model.
+            iou_type (str):
+            class_names (None or list[str]): if provided, will use it to predict
+                per-category AP.
+
+        Returns:
+            a dict of {metric name: score}
+        """
+
+        metrics = {
+            "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
+            "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
+            "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
+        }[iou_type]
+
+        if coco_eval is None:
+            self._logger.warn("No predictions from the model!")
+            return {metric: float("nan") for metric in metrics}
+
+        # the standard metrics
+        results = {
+            metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
+            for idx, metric in enumerate(metrics)
+        }
+        self._logger.info(
+            "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
+        )
+        if not np.isfinite(sum(results.values())):
+            self._logger.info("Note that some metrics cannot be computed.")
+
+        if class_names is None or len(class_names) <= 1:
+            return results
+        # Compute per-category AP
+        # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
+        precisions = coco_eval.eval["precision"]
+        # precision has dims (iou, recall, cls, area range, max dets)
+        assert len(class_names) == precisions.shape[2]
+
+        results_per_category = []
+        for idx, name in enumerate(class_names):
+            # area range index 0: all area ranges
+            # max dets index -1: typically 100 per image
+            precision = precisions[:, :, idx, 0, -1]
+            precision = precision[precision > -1]
+            ap = np.mean(precision) if precision.size else float("nan")
+            results_per_category.append(("{}".format(name), float(ap * 100)))
+
+        # tabulate it
+        N_COLS = min(6, len(results_per_category) * 2)
+        results_flatten = list(itertools.chain(*results_per_category))
+        results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
+        table = tabulate(
+            results_2d,
+            tablefmt="pipe",
+            floatfmt=".3f",
+            headers=["category", "AP"] * (N_COLS // 2),
+            numalign="left",
+        )
+        self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
+
+        results.update({"AP-" + name: ap for name, ap in results_per_category})
+        return results
+
+
+def instances_to_coco_json(instances, img_id):
+    """
+    Dump an "Instances" object to a COCO-format json that's used for evaluation.
+
+    Args:
+        instances (Instances):
+        img_id (int): the image id
+
+    Returns:
+        list[dict]: list of json annotations in COCO format.
+    """
+    num_instance = len(instances)
+    if num_instance == 0:
+        return []
+
+    boxes = instances.pred_boxes.tensor.numpy()
+    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    boxes = boxes.tolist()
+    scores = instances.scores.tolist()
+    classes = instances.pred_classes.tolist()
+
+    has_mask = instances.has("pred_masks")
+    if has_mask:
+        # use RLE to encode the masks, because they are too large and takes memory
+        # since this evaluator stores outputs of the entire dataset
+        rles = [
+            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
+            for mask in instances.pred_masks
+        ]
+        for rle in rles:
+            # "counts" is an array encoded by mask_util as a byte-stream. Python3's
+            # json writer which always produces strings cannot serialize a bytestream
+            # unless you decode it. Thankfully, utf-8 works out (which is also what
+            # the pycocotools/_mask.pyx does).
+            rle["counts"] = rle["counts"].decode("utf-8")
+
+    has_keypoints = instances.has("pred_keypoints")
+    if has_keypoints:
+        keypoints = instances.pred_keypoints
+
+    results = []
+    for k in range(num_instance):
+        result = {
+            "image_id": img_id,
+            "category_id": classes[k],
+            "bbox": boxes[k],
+            "score": scores[k],
+        }
+        if has_mask:
+            result["segmentation"] = rles[k]
+        if has_keypoints:
+            # In COCO annotations,
+            # keypoints coordinates are pixel indices.
+            # However our predictions are floating point coordinates.
+            # Therefore we subtract 0.5 to be consistent with the annotation format.
+            # This is the inverse of data loading logic in `data/coco.py`.
+            keypoints[k][:, :2] -= 0.5
+            result["keypoints"] = keypoints[k].flatten().tolist()
+        results.append(result)
+    return results
+
+
+# inspired from Detectron:
+# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
+def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None):
+    """
+    Evaluate detection proposal recall metrics. This function is a much
+    faster alternative to the official COCO API recall evaluation code. However,
+    it produces slightly different results.
+    """
+    # Record max overlap value for each gt box
+    # Return vector of overlap values
+    areas = {
+        "all": 0,
+        "small": 1,
+        "medium": 2,
+        "large": 3,
+        "96-128": 4,
+        "128-256": 5,
+        "256-512": 6,
+        "512-inf": 7,
+    }
+    area_ranges = [
+        [0 ** 2, 1e5 ** 2],  # all
+        [0 ** 2, 32 ** 2],  # small
+        [32 ** 2, 96 ** 2],  # medium
+        [96 ** 2, 1e5 ** 2],  # large
+        [96 ** 2, 128 ** 2],  # 96-128
+        [128 ** 2, 256 ** 2],  # 128-256
+        [256 ** 2, 512 ** 2],  # 256-512
+        [512 ** 2, 1e5 ** 2],
+    ]  # 512-inf
+    assert area in areas, "Unknown area range: {}".format(area)
+    area_range = area_ranges[areas[area]]
+    gt_overlaps = []
+    num_pos = 0
+
+    for prediction_dict in dataset_predictions:
+        predictions = prediction_dict["proposals"]
+
+        # sort predictions in descending order
+        # TODO maybe remove this and make it explicit in the documentation
+        inds = predictions.objectness_logits.sort(descending=True)[1]
+        predictions = predictions[inds]
+
+        ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
+        anno = coco_api.loadAnns(ann_ids)
+        gt_boxes = [
+            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+            for obj in anno
+            if obj["iscrowd"] == 0
+        ]
+        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
+        gt_boxes = Boxes(gt_boxes)
+        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])
+
+        if len(gt_boxes) == 0 or len(predictions) == 0:
+            continue
+
+        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
+        gt_boxes = gt_boxes[valid_gt_inds]
+
+        num_pos += len(gt_boxes)
+
+        if len(gt_boxes) == 0:
+            continue
+
+        if limit is not None and len(predictions) > limit:
+            predictions = predictions[:limit]
+
+        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
+
+        _gt_overlaps = torch.zeros(len(gt_boxes))
+        for j in range(min(len(predictions), len(gt_boxes))):
+            # find which proposal box maximally covers each gt box
+            # and get the iou amount of coverage for each gt box
+            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+            # find which gt box is 'best' covered (i.e. 'best' = most iou)
+            gt_ovr, gt_ind = max_overlaps.max(dim=0)
+            assert gt_ovr >= 0
+            # find the proposal box that covers the best covered gt box
+            box_ind = argmax_overlaps[gt_ind]
+            # record the iou coverage of this gt box
+            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+            assert _gt_overlaps[j] == gt_ovr
+            # mark the proposal box and the gt box as used
+            overlaps[box_ind, :] = -1
+            overlaps[:, gt_ind] = -1
+
+        # append recorded iou coverage level
+        gt_overlaps.append(_gt_overlaps)
+    gt_overlaps = (
+        torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
+    )
+    gt_overlaps, _ = torch.sort(gt_overlaps)
+
+    if thresholds is None:
+        step = 0.05
+        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
+    recalls = torch.zeros_like(thresholds)
+    # compute recall for each iou threshold
+    for i, t in enumerate(thresholds):
+        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
+    # ar = 2 * np.trapz(recalls, thresholds)
+    ar = recalls.mean()
+    return {
+        "ar": ar,
+        "recalls": recalls,
+        "thresholds": thresholds,
+        "gt_overlaps": gt_overlaps,
+        "num_pos": num_pos,
+    }
+
+
+def _evaluate_predictions_on_coco(coco_gt, coco_results, iou_type, kpt_oks_sigmas=None):
+    """
+    Evaluate the coco results using COCOEval API.
+    """
+    assert len(coco_results) > 0
+
+    if iou_type == "segm":
+        coco_results = copy.deepcopy(coco_results)
+        # When evaluating mask AP, if the results contain bbox, cocoapi will
+        # use the box area as the area of the instance, instead of the mask area.
+        # This leads to a different definition of small/medium/large.
+        # We remove the bbox field to let mask AP use mask area.
+        for c in coco_results:
+            c.pop("bbox", None)
+
+    coco_dt = coco_gt.loadRes(coco_results)
+    coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
+    # Use the COCO default keypoint OKS sigmas unless overrides are specified
+    if kpt_oks_sigmas:
+        coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas)
+
+    if iou_type == "keypoints":
+        num_keypoints = len(coco_results[0]["keypoints"]) // 3
+        assert len(coco_eval.params.kpt_oks_sigmas) == num_keypoints, (
+            "[COCOEvaluator] The length of cfg.TEST.KEYPOINT_OKS_SIGMAS (default: 17) "
+            "must be equal to the number of keypoints. However the prediction has {} "
+            "keypoints! For more information please refer to "
+            "http://cocodataset.org/#keypoints-eval.".format(num_keypoints)
+        )
+
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+
+    return coco_eval
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/evaluator.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcb98043a1ededb3925d0ecbba3914d6409dc022
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/evaluator.py
@@ -0,0 +1,196 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import datetime
+import logging
+import time
+from collections import OrderedDict
+from contextlib import contextmanager
+import torch
+
+from detectron2.utils.comm import get_world_size, is_main_process
+from detectron2.utils.logger import log_every_n_seconds
+
+
+class DatasetEvaluator:
+    """
+    Base class for a dataset evaluator.
+
+    The function :func:`inference_on_dataset` runs the model over
+    all samples in the dataset, and have a DatasetEvaluator to process the inputs/outputs.
+
+    This class will accumulate information of the inputs/outputs (by :meth:`process`),
+    and produce evaluation results in the end (by :meth:`evaluate`).
+    """
+
+    def reset(self):
+        """
+        Preparation for a new round of evaluation.
+        Should be called before starting a round of evaluation.
+        """
+        pass
+
+    def process(self, inputs, outputs):
+        """
+        Process the pair of inputs and outputs.
+        If they contain batches, the pairs can be consumed one-by-one using `zip`:
+
+        .. code-block:: python
+
+            for input_, output in zip(inputs, outputs):
+                # do evaluation on single input/output pair
+                ...
+
+        Args:
+            inputs (list): the inputs that's used to call the model.
+            outputs (list): the return value of `model(inputs)`
+        """
+        pass
+
+    def evaluate(self):
+        """
+        Evaluate/summarize the performance, after processing all input/output pairs.
+
+        Returns:
+            dict:
+                A new evaluator class can return a dict of arbitrary format
+                as long as the user can process the results.
+                In our train_net.py, we expect the following format:
+
+                * key: the name of the task (e.g., bbox)
+                * value: a dict of {metric name: score}, e.g.: {"AP50": 80}
+        """
+        pass
+
+
+class DatasetEvaluators(DatasetEvaluator):
+    """
+    Wrapper class to combine multiple :class:`DatasetEvaluator` instances.
+
+    This class dispatches every evaluation call to
+    all of its :class:`DatasetEvaluator`.
+    """
+
+    def __init__(self, evaluators):
+        """
+        Args:
+            evaluators (list): the evaluators to combine.
+        """
+        super().__init__()
+        self._evaluators = evaluators
+
+    def reset(self):
+        for evaluator in self._evaluators:
+            evaluator.reset()
+
+    def process(self, inputs, outputs):
+        for evaluator in self._evaluators:
+            evaluator.process(inputs, outputs)
+
+    def evaluate(self):
+        results = OrderedDict()
+        for evaluator in self._evaluators:
+            result = evaluator.evaluate()
+            if is_main_process() and result is not None:
+                for k, v in result.items():
+                    assert (
+                        k not in results
+                    ), "Different evaluators produce results with the same key {}".format(k)
+                    results[k] = v
+        return results
+
+
+def inference_on_dataset(model, data_loader, evaluator):
+    """
+    Run model on the data_loader and evaluate the metrics with evaluator.
+    Also benchmark the inference speed of `model.forward` accurately.
+    The model will be used in eval mode.
+
+    Args:
+        model (nn.Module): a module which accepts an object from
+            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.
+
+            If you wish to evaluate a model in `training` mode instead, you can
+            wrap the given model and override its behavior of `.eval()` and `.train()`.
+        data_loader: an iterable object with a length.
+            The elements it generates will be the inputs to the model.
+        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
+            to benchmark, but don't want to do any evaluation.
+
+    Returns:
+        The return value of `evaluator.evaluate()`
+    """
+    num_devices = get_world_size()
+    logger = logging.getLogger(__name__)
+    logger.info("Start inference on {} images".format(len(data_loader)))
+
+    total = len(data_loader)  # inference data loader must have a fixed length
+    if evaluator is None:
+        # create a no-op evaluator
+        evaluator = DatasetEvaluators([])
+    evaluator.reset()
+
+    num_warmup = min(5, total - 1)
+    start_time = time.perf_counter()
+    total_compute_time = 0
+    with inference_context(model), torch.no_grad():
+        for idx, inputs in enumerate(data_loader):
+            if idx == num_warmup:
+                start_time = time.perf_counter()
+                total_compute_time = 0
+
+            start_compute_time = time.perf_counter()
+            outputs = model(inputs)
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            total_compute_time += time.perf_counter() - start_compute_time
+            evaluator.process(inputs, outputs)
+
+            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
+            seconds_per_img = total_compute_time / iters_after_start
+            if idx >= num_warmup * 2 or seconds_per_img > 5:
+                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
+                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
+                log_every_n_seconds(
+                    logging.INFO,
+                    "Inference done {}/{}. {:.4f} s / demo. ETA={}".format(
+                        idx + 1, total, seconds_per_img, str(eta)
+                    ),
+                    n=5,
+                )
+
+    # Measure the time only for this worker (before the synchronization barrier)
+    total_time = time.perf_counter() - start_time
+    total_time_str = str(datetime.timedelta(seconds=total_time))
+    # NOTE this format is parsed by grep
+    logger.info(
+        "Total inference time: {} ({:.6f} s / demo per device, on {} devices)".format(
+            total_time_str, total_time / (total - num_warmup), num_devices
+        )
+    )
+    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
+    logger.info(
+        "Total inference pure compute time: {} ({:.6f} s / demo per device, on {} devices)".format(
+            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
+        )
+    )
+
+    results = evaluator.evaluate()
+    # An evaluator may return None when not in main process.
+    # Replace it by an empty dict instead to make it easier for downstream code to handle
+    if results is None:
+        results = {}
+    return results
+
+
+@contextmanager
+def inference_context(model):
+    """
+    A context where the model is temporarily changed to eval mode,
+    and restored to previous mode afterwards.
+
+    Args:
+        model: a torch Module
+    """
+    training_mode = model.training
+    model.eval()
+    yield
+    model.train(training_mode)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/lvis_evaluation.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/lvis_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..e55f50fb9d1fa7ccb685f812b603c10f9a1ffea0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/lvis_evaluation.py
@@ -0,0 +1,350 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import itertools
+import json
+import logging
+import os
+import pickle
+from collections import OrderedDict
+import torch
+from fvcore.common.file_io import PathManager
+
+import detectron2.utils.comm as comm
+from detectron2.data import MetadataCatalog
+from detectron2.structures import Boxes, BoxMode, pairwise_iou
+from detectron2.utils.logger import create_small_table
+
+from .coco_evaluation import instances_to_coco_json
+from .evaluator import DatasetEvaluator
+
+
+class LVISEvaluator(DatasetEvaluator):
+    """
+    Evaluate object proposal and instance detection/segmentation outputs using
+    LVIS's metrics and evaluation API.
+    """
+
+    def __init__(self, dataset_name, cfg, distributed, output_dir=None):
+        """
+        Args:
+            dataset_name (str): name of the dataset to be evaluated.
+                It must have the following corresponding metadata:
+                "json_file": the path to the LVIS format annotation
+            cfg (CfgNode): config instance
+            distributed (True): if True, will collect results from all ranks for evaluation.
+                Otherwise, will evaluate the results in the current process.
+            output_dir (str): optional, an output directory to dump results.
+        """
+        from lvis import LVIS
+
+        self._tasks = self._tasks_from_config(cfg)
+        self._distributed = distributed
+        self._output_dir = output_dir
+
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        json_file = PathManager.get_local_path(self._metadata.json_file)
+        self._lvis_api = LVIS(json_file)
+        # Test set json files do not contain annotations (evaluation must be
+        # performed using the LVIS evaluation server).
+        self._do_evaluation = len(self._lvis_api.get_ann_ids()) > 0
+
+    def reset(self):
+        self._predictions = []
+
+    def _tasks_from_config(self, cfg):
+        """
+        Returns:
+            tuple[str]: tasks that can be evaluated under the given configuration.
+        """
+        tasks = ("bbox",)
+        if cfg.MODEL.MASK_ON:
+            tasks = tasks + ("segm",)
+        return tasks
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a LVIS model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a LVIS model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+        """
+        for input, output in zip(inputs, outputs):
+            prediction = {"image_id": input["image_id"]}
+
+            if "instances" in output:
+                instances = output["instances"].to(self._cpu_device)
+                prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
+            if "proposals" in output:
+                prediction["proposals"] = output["proposals"].to(self._cpu_device)
+            self._predictions.append(prediction)
+
+    def evaluate(self):
+        if self._distributed:
+            comm.synchronize()
+            predictions = comm.gather(self._predictions, dst=0)
+            predictions = list(itertools.chain(*predictions))
+
+            if not comm.is_main_process():
+                return
+        else:
+            predictions = self._predictions
+
+        if len(predictions) == 0:
+            self._logger.warning("[LVISEvaluator] Did not receive valid predictions.")
+            return {}
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(predictions, f)
+
+        self._results = OrderedDict()
+        if "proposals" in predictions[0]:
+            self._eval_box_proposals(predictions)
+        if "instances" in predictions[0]:
+            self._eval_predictions(set(self._tasks), predictions)
+        # Copy so the caller can do whatever with results
+        return copy.deepcopy(self._results)
+
+    def _eval_predictions(self, tasks, predictions):
+        """
+        Evaluate predictions on the given tasks.
+        Fill self._results with the metrics of the tasks.
+
+        Args:
+            predictions (list[dict]): list of outputs from the model
+        """
+        self._logger.info("Preparing results in the LVIS format ...")
+        lvis_results = list(itertools.chain(*[x["instances"] for x in predictions]))
+
+        # LVIS evaluator can be used to evaluate results for COCO dataset categories.
+        # In this case `_metadata` variable will have a field with COCO-specific category mapping.
+        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+            reverse_id_mapping = {
+                v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+            }
+            for result in lvis_results:
+                result["category_id"] = reverse_id_mapping[result["category_id"]]
+        else:
+            # unmap the category ids for LVIS (from 0-indexed to 1-indexed)
+            for result in lvis_results:
+                result["category_id"] += 1
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "lvis_instances_results.json")
+            self._logger.info("Saving results to {}".format(file_path))
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(lvis_results))
+                f.flush()
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating predictions ...")
+        for task in sorted(tasks):
+            res = _evaluate_predictions_on_lvis(
+                self._lvis_api, lvis_results, task, class_names=self._metadata.get("thing_classes")
+            )
+            self._results[task] = res
+
+    def _eval_box_proposals(self, predictions):
+        """
+        Evaluate the box proposals in predictions.
+        Fill self._results with the metrics for "box_proposals" task.
+        """
+        if self._output_dir:
+            # Saving generated box proposals to file.
+            # Predicted box_proposals are in XYXY_ABS mode.
+            bbox_mode = BoxMode.XYXY_ABS.value
+            ids, boxes, objectness_logits = [], [], []
+            for prediction in predictions:
+                ids.append(prediction["image_id"])
+                boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
+                objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
+
+            proposal_data = {
+                "boxes": boxes,
+                "objectness_logits": objectness_logits,
+                "ids": ids,
+                "bbox_mode": bbox_mode,
+            }
+            with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
+                pickle.dump(proposal_data, f)
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating bbox proposals ...")
+        res = {}
+        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
+        for limit in [100, 1000]:
+            for area, suffix in areas.items():
+                stats = _evaluate_box_proposals(predictions, self._lvis_api, area=area, limit=limit)
+                key = "AR{}@{:d}".format(suffix, limit)
+                res[key] = float(stats["ar"].item() * 100)
+        self._logger.info("Proposal metrics: \n" + create_small_table(res))
+        self._results["box_proposals"] = res
+
+
+# inspired from Detectron:
+# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
+def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None, area="all", limit=None):
+    """
+    Evaluate detection proposal recall metrics. This function is a much
+    faster alternative to the official LVIS API recall evaluation code. However,
+    it produces slightly different results.
+    """
+    # Record max overlap value for each gt box
+    # Return vector of overlap values
+    areas = {
+        "all": 0,
+        "small": 1,
+        "medium": 2,
+        "large": 3,
+        "96-128": 4,
+        "128-256": 5,
+        "256-512": 6,
+        "512-inf": 7,
+    }
+    area_ranges = [
+        [0 ** 2, 1e5 ** 2],  # all
+        [0 ** 2, 32 ** 2],  # small
+        [32 ** 2, 96 ** 2],  # medium
+        [96 ** 2, 1e5 ** 2],  # large
+        [96 ** 2, 128 ** 2],  # 96-128
+        [128 ** 2, 256 ** 2],  # 128-256
+        [256 ** 2, 512 ** 2],  # 256-512
+        [512 ** 2, 1e5 ** 2],
+    ]  # 512-inf
+    assert area in areas, "Unknown area range: {}".format(area)
+    area_range = area_ranges[areas[area]]
+    gt_overlaps = []
+    num_pos = 0
+
+    for prediction_dict in dataset_predictions:
+        predictions = prediction_dict["proposals"]
+
+        # sort predictions in descending order
+        # TODO maybe remove this and make it explicit in the documentation
+        inds = predictions.objectness_logits.sort(descending=True)[1]
+        predictions = predictions[inds]
+
+        ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]])
+        anno = lvis_api.load_anns(ann_ids)
+        gt_boxes = [
+            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno
+        ]
+        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
+        gt_boxes = Boxes(gt_boxes)
+        gt_areas = torch.as_tensor([obj["area"] for obj in anno])
+
+        if len(gt_boxes) == 0 or len(predictions) == 0:
+            continue
+
+        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
+        gt_boxes = gt_boxes[valid_gt_inds]
+
+        num_pos += len(gt_boxes)
+
+        if len(gt_boxes) == 0:
+            continue
+
+        if limit is not None and len(predictions) > limit:
+            predictions = predictions[:limit]
+
+        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
+
+        _gt_overlaps = torch.zeros(len(gt_boxes))
+        for j in range(min(len(predictions), len(gt_boxes))):
+            # find which proposal box maximally covers each gt box
+            # and get the iou amount of coverage for each gt box
+            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+            # find which gt box is 'best' covered (i.e. 'best' = most iou)
+            gt_ovr, gt_ind = max_overlaps.max(dim=0)
+            assert gt_ovr >= 0
+            # find the proposal box that covers the best covered gt box
+            box_ind = argmax_overlaps[gt_ind]
+            # record the iou coverage of this gt box
+            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+            assert _gt_overlaps[j] == gt_ovr
+            # mark the proposal box and the gt box as used
+            overlaps[box_ind, :] = -1
+            overlaps[:, gt_ind] = -1
+
+        # append recorded iou coverage level
+        gt_overlaps.append(_gt_overlaps)
+    gt_overlaps = (
+        torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
+    )
+    gt_overlaps, _ = torch.sort(gt_overlaps)
+
+    if thresholds is None:
+        step = 0.05
+        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
+    recalls = torch.zeros_like(thresholds)
+    # compute recall for each iou threshold
+    for i, t in enumerate(thresholds):
+        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
+    # ar = 2 * np.trapz(recalls, thresholds)
+    ar = recalls.mean()
+    return {
+        "ar": ar,
+        "recalls": recalls,
+        "thresholds": thresholds,
+        "gt_overlaps": gt_overlaps,
+        "num_pos": num_pos,
+    }
+
+
+def _evaluate_predictions_on_lvis(lvis_gt, lvis_results, iou_type, class_names=None):
+    """
+    Args:
+        iou_type (str):
+        kpt_oks_sigmas (list[float]):
+        class_names (None or list[str]): if provided, will use it to predict
+            per-category AP.
+
+    Returns:
+        a dict of {metric name: score}
+    """
+    metrics = {
+        "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
+        "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
+    }[iou_type]
+
+    logger = logging.getLogger(__name__)
+
+    if len(lvis_results) == 0:  # TODO: check if needed
+        logger.warn("No predictions from the model!")
+        return {metric: float("nan") for metric in metrics}
+
+    if iou_type == "segm":
+        lvis_results = copy.deepcopy(lvis_results)
+        # When evaluating mask AP, if the results contain bbox, LVIS API will
+        # use the box area as the area of the instance, instead of the mask area.
+        # This leads to a different definition of small/medium/large.
+        # We remove the bbox field to let mask AP use mask area.
+        for c in lvis_results:
+            c.pop("bbox", None)
+
+    from lvis import LVISEval, LVISResults
+
+    lvis_results = LVISResults(lvis_gt, lvis_results)
+    lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type)
+    lvis_eval.run()
+    lvis_eval.print_results()
+
+    # Pull the standard metrics from the LVIS results
+    results = lvis_eval.get_results()
+    results = {metric: float(results[metric] * 100) for metric in metrics}
+    logger.info("Evaluation results for {}: \n".format(iou_type) + create_small_table(results))
+    return results
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/panoptic_evaluation.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/panoptic_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb5e7ab87b1dd5bb3e0c5d1e405e321c48d9e6a0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/panoptic_evaluation.py
@@ -0,0 +1,167 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import contextlib
+import io
+import itertools
+import json
+import logging
+import os
+import tempfile
+from collections import OrderedDict
+from fvcore.common.file_io import PathManager
+from PIL import Image
+from tabulate import tabulate
+
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+
+from .evaluator import DatasetEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class COCOPanopticEvaluator(DatasetEvaluator):
+    """
+    Evaluate Panoptic Quality metrics on COCO using PanopticAPI.
+    It saves panoptic segmentation prediction in `output_dir`
+
+    It contains a synchronize call and has to be called from all workers.
+    """
+
+    def __init__(self, dataset_name, output_dir):
+        """
+        Args:
+            dataset_name (str): name of the dataset
+            output_dir (str): output directory to save results for evaluation
+        """
+        self._metadata = MetadataCatalog.get(dataset_name)
+        self._thing_contiguous_id_to_dataset_id = {
+            v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+        }
+        self._stuff_contiguous_id_to_dataset_id = {
+            v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items()
+        }
+
+        self._predictions_json = os.path.join(output_dir, "predictions.json")
+
+    def reset(self):
+        self._predictions = []
+
+    def _convert_category_id(self, segment_info):
+        isthing = segment_info.pop("isthing", None)
+        if isthing is None:
+            # the model produces panoptic category id directly. No more conversion needed
+            return segment_info
+        if isthing is True:
+            segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[
+                segment_info["category_id"]
+            ]
+        else:
+            segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[
+                segment_info["category_id"]
+            ]
+        return segment_info
+
+    def process(self, inputs, outputs):
+        from panopticapi.utils import id2rgb
+
+        for input, output in zip(inputs, outputs):
+            panoptic_img, segments_info = output["panoptic_seg"]
+            panoptic_img = panoptic_img.cpu().numpy()
+
+            file_name = os.path.basename(input["file_name"])
+            file_name_png = os.path.splitext(file_name)[0] + ".png"
+            with io.BytesIO() as out:
+                Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG")
+                segments_info = [self._convert_category_id(x) for x in segments_info]
+                self._predictions.append(
+                    {
+                        "image_id": input["image_id"],
+                        "file_name": file_name_png,
+                        "png_string": out.getvalue(),
+                        "segments_info": segments_info,
+                    }
+                )
+
+    def evaluate(self):
+        comm.synchronize()
+
+        self._predictions = comm.gather(self._predictions)
+        self._predictions = list(itertools.chain(*self._predictions))
+        if not comm.is_main_process():
+            return
+
+        # PanopticApi requires local files
+        gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
+        gt_folder = PathManager.get_local_path(self._metadata.panoptic_root)
+
+        with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
+            logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
+            for p in self._predictions:
+                with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
+                    f.write(p.pop("png_string"))
+
+            with open(gt_json, "r") as f:
+                json_data = json.load(f)
+            json_data["annotations"] = self._predictions
+            with PathManager.open(self._predictions_json, "w") as f:
+                f.write(json.dumps(json_data))
+
+            from panopticapi.evaluation import pq_compute
+
+            with contextlib.redirect_stdout(io.StringIO()):
+                pq_res = pq_compute(
+                    gt_json,
+                    PathManager.get_local_path(self._predictions_json),
+                    gt_folder=gt_folder,
+                    pred_folder=pred_dir,
+                )
+
+        res = {}
+        res["PQ"] = 100 * pq_res["All"]["pq"]
+        res["SQ"] = 100 * pq_res["All"]["sq"]
+        res["RQ"] = 100 * pq_res["All"]["rq"]
+        res["PQ_th"] = 100 * pq_res["Things"]["pq"]
+        res["SQ_th"] = 100 * pq_res["Things"]["sq"]
+        res["RQ_th"] = 100 * pq_res["Things"]["rq"]
+        res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
+        res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
+        res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]
+
+        results = OrderedDict({"panoptic_seg": res})
+        _print_panoptic_results(pq_res)
+
+        return results
+
+
+def _print_panoptic_results(pq_res):
+    headers = ["", "PQ", "SQ", "RQ", "#categories"]
+    data = []
+    for name in ["All", "Things", "Stuff"]:
+        row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]]
+        data.append(row)
+    table = tabulate(
+        data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center"
+    )
+    logger.info("Panoptic Evaluation Results:\n" + table)
+
+
+if __name__ == "__main__":
+    from detectron2.utils.logger import setup_logger
+
+    logger = setup_logger()
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--gt-json")
+    parser.add_argument("--gt-dir")
+    parser.add_argument("--pred-json")
+    parser.add_argument("--pred-dir")
+    args = parser.parse_args()
+
+    from panopticapi.evaluation import pq_compute
+
+    with contextlib.redirect_stdout(io.StringIO()):
+        pq_res = pq_compute(
+            args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir
+        )
+        _print_panoptic_results(pq_res)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/pascal_voc_evaluation.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/pascal_voc_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..22d2e523d23c695e06e5da5cb3a210a6d1945dfb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/pascal_voc_evaluation.py
@@ -0,0 +1,294 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import numpy as np
+import os
+import tempfile
+import xml.etree.ElementTree as ET
+from collections import OrderedDict, defaultdict
+from functools import lru_cache
+import torch
+from fvcore.common.file_io import PathManager
+
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+
+from .evaluator import DatasetEvaluator
+
+
+class PascalVOCDetectionEvaluator(DatasetEvaluator):
+    """
+    Evaluate Pascal VOC AP.
+    It contains a synchronization, therefore has to be called from all ranks.
+
+    Note that this is a rewrite of the official Matlab API.
+    The results should be similar, but not identical to the one produced by
+    the official API.
+    """
+
+    def __init__(self, dataset_name):
+        """
+        Args:
+            dataset_name (str): name of the dataset, e.g., "voc_2007_test"
+        """
+        self._dataset_name = dataset_name
+        meta = MetadataCatalog.get(dataset_name)
+        self._anno_file_template = os.path.join(meta.dirname, "Annotations", "{}.xml")
+        self._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt")
+        self._class_names = meta.thing_classes
+        assert meta.year in [2007, 2012], meta.year
+        self._is_2007 = meta.year == 2007
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+    def reset(self):
+        self._predictions = defaultdict(list)  # class name -> list of prediction strings
+
+    def process(self, inputs, outputs):
+        for input, output in zip(inputs, outputs):
+            image_id = input["image_id"]
+            instances = output["instances"].to(self._cpu_device)
+            boxes = instances.pred_boxes.tensor.numpy()
+            scores = instances.scores.tolist()
+            classes = instances.pred_classes.tolist()
+            for box, score, cls in zip(boxes, scores, classes):
+                xmin, ymin, xmax, ymax = box
+                # The inverse of data loading logic in `data/pascal_voc.py`
+                xmin += 1
+                ymin += 1
+                self._predictions[cls].append(
+                    f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}"
+                )
+
+    def evaluate(self):
+        """
+        Returns:
+            dict: has a key "segm", whose value is a dict of "AP", "AP50", and "AP75".
+        """
+        all_predictions = comm.gather(self._predictions, dst=0)
+        if not comm.is_main_process():
+            return
+        predictions = defaultdict(list)
+        for predictions_per_rank in all_predictions:
+            for clsid, lines in predictions_per_rank.items():
+                predictions[clsid].extend(lines)
+        del all_predictions
+
+        self._logger.info(
+            "Evaluating {} using {} metric. "
+            "Note that results do not use the official Matlab API.".format(
+                self._dataset_name, 2007 if self._is_2007 else 2012
+            )
+        )
+
+        with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
+            res_file_template = os.path.join(dirname, "{}.txt")
+
+            aps = defaultdict(list)  # iou -> ap per class
+            for cls_id, cls_name in enumerate(self._class_names):
+                lines = predictions.get(cls_id, [""])
+
+                with open(res_file_template.format(cls_name), "w") as f:
+                    f.write("\n".join(lines))
+
+                for thresh in range(50, 100, 5):
+                    rec, prec, ap = voc_eval(
+                        res_file_template,
+                        self._anno_file_template,
+                        self._image_set_path,
+                        cls_name,
+                        ovthresh=thresh / 100.0,
+                        use_07_metric=self._is_2007,
+                    )
+                    aps[thresh].append(ap * 100)
+
+        ret = OrderedDict()
+        mAP = {iou: np.mean(x) for iou, x in aps.items()}
+        ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]}
+        return ret
+
+
+##############################################################################
+#
+# Below code is modified from
+# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+
+"""Python implementation of the PASCAL VOC devkit's AP evaluation code."""
+
+
+@lru_cache(maxsize=None)
+def parse_rec(filename):
+    """Parse a PASCAL VOC xml file."""
+    with PathManager.open(filename) as f:
+        tree = ET.parse(f)
+    objects = []
+    for obj in tree.findall("object"):
+        obj_struct = {}
+        obj_struct["name"] = obj.find("name").text
+        obj_struct["pose"] = obj.find("pose").text
+        obj_struct["truncated"] = int(obj.find("truncated").text)
+        obj_struct["difficult"] = int(obj.find("difficult").text)
+        bbox = obj.find("bndbox")
+        obj_struct["bbox"] = [
+            int(bbox.find("xmin").text),
+            int(bbox.find("ymin").text),
+            int(bbox.find("xmax").text),
+            int(bbox.find("ymax").text),
+        ]
+        objects.append(obj_struct)
+
+    return objects
+
+
+def voc_ap(rec, prec, use_07_metric=False):
+    """Compute VOC AP given precision and recall. If use_07_metric is true, uses
+    the VOC 07 11-point method (default:False).
+    """
+    if use_07_metric:
+        # 11 point metric
+        ap = 0.0
+        for t in np.arange(0.0, 1.1, 0.1):
+            if np.sum(rec >= t) == 0:
+                p = 0
+            else:
+                p = np.max(prec[rec >= t])
+            ap = ap + p / 11.0
+    else:
+        # correct AP calculation
+        # first append sentinel values at the end
+        mrec = np.concatenate(([0.0], rec, [1.0]))
+        mpre = np.concatenate(([0.0], prec, [0.0]))
+
+        # compute the precision envelope
+        for i in range(mpre.size - 1, 0, -1):
+            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
+
+        # to calculate area under PR curve, look for points
+        # where X axis (recall) changes value
+        i = np.where(mrec[1:] != mrec[:-1])[0]
+
+        # and sum (\Delta recall) * prec
+        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
+    return ap
+
+
+def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False):
+    """rec, prec, ap = voc_eval(detpath,
+                                annopath,
+                                imagesetfile,
+                                classname,
+                                [ovthresh],
+                                [use_07_metric])
+
+    Top level function that does the PASCAL VOC evaluation.
+
+    detpath: Path to detections
+        detpath.format(classname) should produce the detection results file.
+    annopath: Path to annotations
+        annopath.format(imagename) should be the xml annotations file.
+    imagesetfile: Text file containing the list of images, one image per line.
+    classname: Category name (duh)
+    [ovthresh]: Overlap threshold (default = 0.5)
+    [use_07_metric]: Whether to use VOC07's 11 point AP computation
+        (default False)
+    """
+    # assumes detections are in detpath.format(classname)
+    # assumes annotations are in annopath.format(imagename)
+    # assumes imagesetfile is a text file with each line an image name
+
+    # first load gt
+    # read list of images
+    with PathManager.open(imagesetfile, "r") as f:
+        lines = f.readlines()
+    imagenames = [x.strip() for x in lines]
+
+    # load annots
+    recs = {}
+    for imagename in imagenames:
+        recs[imagename] = parse_rec(annopath.format(imagename))
+
+    # extract gt objects for this class
+    class_recs = {}
+    npos = 0
+    for imagename in imagenames:
+        R = [obj for obj in recs[imagename] if obj["name"] == classname]
+        bbox = np.array([x["bbox"] for x in R])
+        difficult = np.array([x["difficult"] for x in R]).astype(np.bool)
+        # difficult = np.array([False for x in R]).astype(np.bool)  # treat all "difficult" as GT
+        det = [False] * len(R)
+        npos = npos + sum(~difficult)
+        class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det}
+
+    # read dets
+    detfile = detpath.format(classname)
+    with open(detfile, "r") as f:
+        lines = f.readlines()
+
+    splitlines = [x.strip().split(" ") for x in lines]
+    image_ids = [x[0] for x in splitlines]
+    confidence = np.array([float(x[1]) for x in splitlines])
+    BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4)
+
+    # sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+
+    # go down dets and mark TPs and FPs
+    nd = len(image_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+    for d in range(nd):
+        R = class_recs[image_ids[d]]
+        bb = BB[d, :].astype(float)
+        ovmax = -np.inf
+        BBGT = R["bbox"].astype(float)
+
+        if BBGT.size > 0:
+            # compute overlaps
+            # intersection
+            ixmin = np.maximum(BBGT[:, 0], bb[0])
+            iymin = np.maximum(BBGT[:, 1], bb[1])
+            ixmax = np.minimum(BBGT[:, 2], bb[2])
+            iymax = np.minimum(BBGT[:, 3], bb[3])
+            iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
+            ih = np.maximum(iymax - iymin + 1.0, 0.0)
+            inters = iw * ih
+
+            # union
+            uni = (
+                (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
+                + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
+                - inters
+            )
+
+            overlaps = inters / uni
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+
+        if ovmax > ovthresh:
+            if not R["difficult"][jmax]:
+                if not R["det"][jmax]:
+                    tp[d] = 1.0
+                    R["det"][jmax] = 1
+                else:
+                    fp[d] = 1.0
+        else:
+            fp[d] = 1.0
+
+    # compute precision recall
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    rec = tp / float(npos)
+    # avoid divide by zero in case the first detection matches a difficult
+    # ground truth
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+
+    return rec, prec, ap
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/rotated_coco_evaluation.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/rotated_coco_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..30746e1aaac9a1feb0c7994d9229423e9f04bb51
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/rotated_coco_evaluation.py
@@ -0,0 +1,204 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+import json
+import numpy as np
+import os
+import torch
+from fvcore.common.file_io import PathManager
+from pycocotools.cocoeval import COCOeval, maskUtils
+
+from detectron2.structures import BoxMode, RotatedBoxes, pairwise_iou_rotated
+
+from .coco_evaluation import COCOEvaluator
+
+
+class RotatedCOCOeval(COCOeval):
+    @staticmethod
+    def is_rotated(box_list):
+        if type(box_list) == np.ndarray:
+            return box_list.shape[1] == 5
+        elif type(box_list) == list:
+            if box_list == []:  # cannot decide the box_dim
+                return False
+            return np.all(
+                np.array(
+                    [
+                        (len(obj) == 5) and ((type(obj) == list) or (type(obj) == np.ndarray))
+                        for obj in box_list
+                    ]
+                )
+            )
+        return False
+
+    @staticmethod
+    def boxlist_to_tensor(boxlist, output_box_dim):
+        if type(boxlist) == np.ndarray:
+            box_tensor = torch.from_numpy(boxlist)
+        elif type(boxlist) == list:
+            if boxlist == []:
+                return torch.zeros((0, output_box_dim), dtype=torch.float32)
+            else:
+                box_tensor = torch.FloatTensor(boxlist)
+        else:
+            raise Exception("Unrecognized boxlist type")
+
+        input_box_dim = box_tensor.shape[1]
+        if input_box_dim != output_box_dim:
+            if input_box_dim == 4 and output_box_dim == 5:
+                box_tensor = BoxMode.convert(box_tensor, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS)
+            else:
+                raise Exception(
+                    "Unable to convert from {}-dim box to {}-dim box".format(
+                        input_box_dim, output_box_dim
+                    )
+                )
+        return box_tensor
+
+    def compute_iou_dt_gt(self, dt, gt, is_crowd):
+        if self.is_rotated(dt) or self.is_rotated(gt):
+            # TODO: take is_crowd into consideration
+            assert all(c == 0 for c in is_crowd)
+            dt = RotatedBoxes(self.boxlist_to_tensor(dt, output_box_dim=5))
+            gt = RotatedBoxes(self.boxlist_to_tensor(gt, output_box_dim=5))
+            return pairwise_iou_rotated(dt, gt)
+        else:
+            # This is the same as the classical COCO evaluation
+            return maskUtils.iou(dt, gt, is_crowd)
+
+    def computeIoU(self, imgId, catId):
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return []
+        inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
+        dt = [dt[i] for i in inds]
+        if len(dt) > p.maxDets[-1]:
+            dt = dt[0 : p.maxDets[-1]]
+
+        assert p.iouType == "bbox", "unsupported iouType for iou computation"
+
+        g = [g["bbox"] for g in gt]
+        d = [d["bbox"] for d in dt]
+
+        # compute iou between each dt and gt region
+        iscrowd = [int(o["iscrowd"]) for o in gt]
+
+        # Note: this function is copied from cocoeval.py in cocoapi
+        # and the major difference is here.
+        ious = self.compute_iou_dt_gt(d, g, iscrowd)
+        return ious
+
+
+class RotatedCOCOEvaluator(COCOEvaluator):
+    """
+    Evaluate object proposal/instance detection outputs using COCO-like metrics and APIs,
+    with rotated boxes support.
+    Note: this uses IOU only and does not consider angle differences.
+    """
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a COCO model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+        """
+        for input, output in zip(inputs, outputs):
+            prediction = {"image_id": input["image_id"]}
+
+            if "instances" in output:
+                instances = output["instances"].to(self._cpu_device)
+
+                prediction["instances"] = self.instances_to_json(instances, input["image_id"])
+            if "proposals" in output:
+                prediction["proposals"] = output["proposals"].to(self._cpu_device)
+            self._predictions.append(prediction)
+
+    def instances_to_json(self, instances, img_id):
+        num_instance = len(instances)
+        if num_instance == 0:
+            return []
+
+        boxes = instances.pred_boxes.tensor.numpy()
+        if boxes.shape[1] == 4:
+            boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+        boxes = boxes.tolist()
+        scores = instances.scores.tolist()
+        classes = instances.pred_classes.tolist()
+
+        results = []
+        for k in range(num_instance):
+            result = {
+                "image_id": img_id,
+                "category_id": classes[k],
+                "bbox": boxes[k],
+                "score": scores[k],
+            }
+
+            results.append(result)
+        return results
+
+    def _eval_predictions(self, tasks, predictions):
+        """
+        Evaluate predictions on the given tasks.
+        Fill self._results with the metrics of the tasks.
+        """
+        self._logger.info("Preparing results for COCO format ...")
+        coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
+
+        # unmap the category ids for COCO
+        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+            reverse_id_mapping = {
+                v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+            }
+            for result in coco_results:
+                result["category_id"] = reverse_id_mapping[result["category_id"]]
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
+            self._logger.info("Saving results to {}".format(file_path))
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(coco_results))
+                f.flush()
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating predictions ...")
+        for task in sorted(tasks):
+            assert task == "bbox", "Task {} is not supported".format(task)
+            coco_eval = (
+                self._evaluate_predictions_on_coco(self._coco_api, coco_results)
+                if len(coco_results) > 0
+                else None  # cocoapi does not handle empty results very well
+            )
+
+            res = self._derive_coco_results(
+                coco_eval, task, class_names=self._metadata.get("thing_classes")
+            )
+            self._results[task] = res
+
+    def _evaluate_predictions_on_coco(self, coco_gt, coco_results):
+        """
+        Evaluate the coco results using COCOEval API.
+        """
+        assert len(coco_results) > 0
+
+        coco_dt = coco_gt.loadRes(coco_results)
+
+        # Only bbox is supported for now
+        coco_eval = RotatedCOCOeval(coco_gt, coco_dt, iouType="bbox")
+
+        coco_eval.evaluate()
+        coco_eval.accumulate()
+        coco_eval.summarize()
+
+        return coco_eval
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/sem_seg_evaluation.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/sem_seg_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb3b28d79284a5eeb335fc8ee8d859b4e46510ef
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/sem_seg_evaluation.py
@@ -0,0 +1,168 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+import json
+import logging
+import numpy as np
+import os
+from collections import OrderedDict
+import PIL.Image as Image
+import pycocotools.mask as mask_util
+import torch
+from fvcore.common.file_io import PathManager
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.utils.comm import all_gather, is_main_process, synchronize
+
+from .evaluator import DatasetEvaluator
+
+
+class SemSegEvaluator(DatasetEvaluator):
+    """
+    Evaluate semantic segmentation
+    """
+
+    def __init__(self, dataset_name, distributed, num_classes, ignore_label=255, output_dir=None):
+        """
+        Args:
+            dataset_name (str): name of the dataset to be evaluated.
+            distributed (True): if True, will collect results from all ranks for evaluation.
+                Otherwise, will evaluate the results in the current process.
+            num_classes (int): number of classes
+            ignore_label (int): value in semantic segmentation ground truth. Predictions for the
+            corresponding pixels should be ignored.
+            output_dir (str): an output directory to dump results.
+        """
+        self._dataset_name = dataset_name
+        self._distributed = distributed
+        self._output_dir = output_dir
+        self._num_classes = num_classes
+        self._ignore_label = ignore_label
+        self._N = num_classes + 1
+
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+        self.input_file_to_gt_file = {
+            dataset_record["file_name"]: dataset_record["sem_seg_file_name"]
+            for dataset_record in DatasetCatalog.get(dataset_name)
+        }
+
+        meta = MetadataCatalog.get(dataset_name)
+        # Dict that maps contiguous training ids to COCO category ids
+        try:
+            c2d = meta.stuff_dataset_id_to_contiguous_id
+            self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()}
+        except AttributeError:
+            self._contiguous_id_to_dataset_id = None
+        self._class_names = meta.stuff_classes
+
+    def reset(self):
+        self._conf_matrix = np.zeros((self._N, self._N), dtype=np.int64)
+        self._predictions = []
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a model.
+                It is a list of dicts. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name".
+            outputs: the outputs of a model. It is either list of semantic segmentation predictions
+                (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
+                segmentation prediction in the same format.
+        """
+        for input, output in zip(inputs, outputs):
+            output = output["sem_seg"].argmax(dim=0).to(self._cpu_device)
+            pred = np.array(output, dtype=np.int)
+            with PathManager.open(self.input_file_to_gt_file[input["file_name"]], "rb") as f:
+                gt = np.array(Image.open(f), dtype=np.int)
+
+            gt[gt == self._ignore_label] = self._num_classes
+
+            self._conf_matrix += np.bincount(
+                self._N * pred.reshape(-1) + gt.reshape(-1), minlength=self._N ** 2
+            ).reshape(self._N, self._N)
+
+            self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))
+
+    def evaluate(self):
+        """
+        Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):
+
+        * Mean intersection-over-union averaged across classes (mIoU)
+        * Frequency Weighted IoU (fwIoU)
+        * Mean pixel accuracy averaged across classes (mACC)
+        * Pixel Accuracy (pACC)
+        """
+        if self._distributed:
+            synchronize()
+            conf_matrix_list = all_gather(self._conf_matrix)
+            self._predictions = all_gather(self._predictions)
+            self._predictions = list(itertools.chain(*self._predictions))
+            if not is_main_process():
+                return
+
+            self._conf_matrix = np.zeros_like(self._conf_matrix)
+            for conf_matrix in conf_matrix_list:
+                self._conf_matrix += conf_matrix
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(self._predictions))
+
+        acc = np.full(self._num_classes, np.nan, dtype=np.float)
+        iou = np.full(self._num_classes, np.nan, dtype=np.float)
+        tp = self._conf_matrix.diagonal()[:-1].astype(np.float)
+        pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float)
+        class_weights = pos_gt / np.sum(pos_gt)
+        pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float)
+        acc_valid = pos_gt > 0
+        acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
+        iou_valid = (pos_gt + pos_pred) > 0
+        union = pos_gt + pos_pred - tp
+        iou[acc_valid] = tp[acc_valid] / union[acc_valid]
+        macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
+        miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
+        fiou = np.sum(iou[acc_valid] * class_weights[acc_valid])
+        pacc = np.sum(tp) / np.sum(pos_gt)
+
+        res = {}
+        res["mIoU"] = 100 * miou
+        res["fwIoU"] = 100 * fiou
+        for i, name in enumerate(self._class_names):
+            res["IoU-{}".format(name)] = 100 * iou[i]
+        res["mACC"] = 100 * macc
+        res["pACC"] = 100 * pacc
+        for i, name in enumerate(self._class_names):
+            res["ACC-{}".format(name)] = 100 * acc[i]
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(res, f)
+        results = OrderedDict({"sem_seg": res})
+        self._logger.info(results)
+        return results
+
+    def encode_json_sem_seg(self, sem_seg, input_file_name):
+        """
+        Convert semantic segmentation to COCO stuff format with segments encoded as RLEs.
+        See http://cocodataset.org/#format-results
+        """
+        json_list = []
+        for label in np.unique(sem_seg):
+            if self._contiguous_id_to_dataset_id is not None:
+                assert (
+                    label in self._contiguous_id_to_dataset_id
+                ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name)
+                dataset_id = self._contiguous_id_to_dataset_id[label]
+            else:
+                dataset_id = int(label)
+            mask = (sem_seg == label).astype(np.uint8)
+            mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0]
+            mask_rle["counts"] = mask_rle["counts"].decode("utf-8")
+            json_list.append(
+                {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle}
+            )
+        return json_list
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/testing.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/testing.py
new file mode 100644
index 0000000000000000000000000000000000000000..95addebc185111c572cb19aa98f7e055b21fc74e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/evaluation/testing.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+import pprint
+import sys
+from collections import OrderedDict
+from collections.abc import Mapping
+
+
+def print_csv_format(results):
+    """
+    Print main metrics in a format similar to Detectron,
+    so that they are easy to copypaste into a spreadsheet.
+
+    Args:
+        results (OrderedDict[dict]): task_name -> {metric -> score}
+    """
+    assert isinstance(results, OrderedDict), results  # unordered results cannot be properly printed
+    logger = logging.getLogger(__name__)
+    for task, res in results.items():
+        # Don't print "AP-category" metrics since they are usually not tracked.
+        important_res = [(k, v) for k, v in res.items() if "-" not in k]
+        logger.info("copypaste: Task: {}".format(task))
+        logger.info("copypaste: " + ",".join([k[0] for k in important_res]))
+        logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res]))
+
+
+def verify_results(cfg, results):
+    """
+    Args:
+        results (OrderedDict[dict]): task_name -> {metric -> score}
+
+    Returns:
+        bool: whether the verification succeeds or not
+    """
+    expected_results = cfg.TEST.EXPECTED_RESULTS
+    if not len(expected_results):
+        return True
+
+    ok = True
+    for task, metric, expected, tolerance in expected_results:
+        actual = results[task][metric]
+        if not np.isfinite(actual):
+            ok = False
+        diff = abs(actual - expected)
+        if diff > tolerance:
+            ok = False
+
+    logger = logging.getLogger(__name__)
+    if not ok:
+        logger.error("Result verification failed!")
+        logger.error("Expected Results: " + str(expected_results))
+        logger.error("Actual Results: " + pprint.pformat(results))
+
+        sys.exit(1)
+    else:
+        logger.info("Results verification passed.")
+    return ok
+
+
+def flatten_results_dict(results):
+    """
+    Expand a hierarchical dict of scalars into a flat dict of scalars.
+    If results[k1][k2][k3] = v, the returned dict will have the entry
+    {"k1/k2/k3": v}.
+
+    Args:
+        results (dict):
+    """
+    r = {}
+    for k, v in results.items():
+        if isinstance(v, Mapping):
+            v = flatten_results_dict(v)
+            for kk, vv in v.items():
+                r[k + "/" + kk] = vv
+        else:
+            r[k] = v
+    return r
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/README.md b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9bd8b57c1a5f15e391eb63b690f1051b1ad79d21
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/README.md
@@ -0,0 +1,10 @@
+
+This directory contains code to prepare a detectron2 model for deployment.
+Currently it supports exporting a detectron2 model to Caffe2 format through ONNX.
+
+Please see [documentation](https://detectron2.readthedocs.io/tutorials/deployment.html) for its usage.
+
+
+### Acknowledgements
+
+Thanks to Mobile Vision team at Facebook for developing the conversion tools.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e2bf4d0670ed0ccd73dbdb7ce27a8e617bbf6aa
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+
+from .api import *
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/api.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7600714e1edb019def04f9d0d1a063668943101
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/api.py
@@ -0,0 +1,277 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import copy
+import logging
+import os
+import torch
+from caffe2.proto import caffe2_pb2
+from torch import nn
+
+from detectron2.config import CfgNode as CN
+
+from .caffe2_export import export_caffe2_detection_model
+from .caffe2_export import export_onnx_model as export_onnx_model_impl
+from .caffe2_export import run_and_save_graph
+from .caffe2_inference import ProtobufDetectionModel
+from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
+from .shared import get_pb_arg_vali, get_pb_arg_vals, save_graph
+
+__all__ = [
+    "add_export_config",
+    "export_caffe2_model",
+    "Caffe2Model",
+    "export_onnx_model",
+    "Caffe2Tracer",
+]
+
+
+def add_export_config(cfg):
+    """
+    Args:
+        cfg (CfgNode): a detectron2 config
+
+    Returns:
+        CfgNode: an updated config with new options that will be used
+            by :class:`Caffe2Tracer`.
+    """
+    is_frozen = cfg.is_frozen()
+    cfg.defrost()
+    cfg.EXPORT_CAFFE2 = CN()
+    cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT = False
+    if is_frozen:
+        cfg.freeze()
+    return cfg
+
+
+class Caffe2Tracer:
+    """
+    Make a detectron2 model traceable with caffe2 style.
+
+    An original detectron2 model may not be traceable, or
+    cannot be deployed directly after being traced, due to some reasons:
+    1. control flow in some ops
+    2. custom ops
+    3. complicated pre/post processing
+
+    This class provides a traceable version of a detectron2 model by:
+    1. Rewrite parts of the model using ops in caffe2. Note that some ops do
+       not have GPU implementation.
+    2. Define the inputs "after pre-processing" as inputs to the model
+    3. Remove post-processing and produce raw layer outputs
+
+    More specifically about inputs: all builtin models take two input tensors.
+    (1) NCHW float "data" which is an image (usually in [0, 255])
+    (2) Nx3 float "im_info", each row of which is (height, width, 1.0)
+
+    After making a traceable model, the class provide methods to export such a
+    model to different deployment formats.
+
+    The class currently only supports models using builtin meta architectures.
+    """
+
+    def __init__(self, cfg, model, inputs):
+        """
+        Args:
+            cfg (CfgNode): a detectron2 config, with extra export-related options
+                added by :func:`add_export_config`.
+            model (nn.Module): a model built by
+                :func:`detectron2.modeling.build_model`.
+            inputs: sample inputs that the given model takes for inference.
+                Will be used to trace the model.
+        """
+        assert isinstance(cfg, CN), cfg
+        assert isinstance(model, torch.nn.Module), type(model)
+        if "EXPORT_CAFFE2" not in cfg:
+            cfg = add_export_config(cfg)  # will just the defaults
+
+        self.cfg = cfg
+        self.model = model
+        self.inputs = inputs
+
+    def _get_traceable(self):
+        # TODO how to make it extensible to support custom models
+        C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[self.cfg.MODEL.META_ARCHITECTURE]
+        traceable_model = C2MetaArch(self.cfg, copy.deepcopy(self.model))
+        traceable_inputs = traceable_model.get_caffe2_inputs(self.inputs)
+        return traceable_model, traceable_inputs
+
+    def export_caffe2(self):
+        """
+        Export the model to Caffe2's protobuf format.
+        The returned object can be saved with `.save_protobuf()` method.
+        The result can be loaded and executed using Caffe2 runtime.
+
+        Returns:
+            Caffe2Model
+        """
+        model, inputs = self._get_traceable()
+        predict_net, init_net = export_caffe2_detection_model(model, inputs)
+        return Caffe2Model(predict_net, init_net)
+
+    def export_onnx(self):
+        """
+        Export the model to ONNX format.
+        Note that the exported model contains custom ops only available in caffe2, therefore it
+        cannot be directly executed by other runtime. Post-processing or transformation passes
+        may be applied on the model to accommodate different runtimes.
+
+        Returns:
+            onnx.ModelProto: an onnx model.
+        """
+        model, inputs = self._get_traceable()
+        return export_onnx_model_impl(model, (inputs,))
+
+    def export_torchscript(self):
+        """
+        Export the model to a `torch.jit.TracedModule` by tracing.
+        The returned object can be saved to a file by ".save()".
+
+        Returns:
+            torch.jit.TracedModule: a torch TracedModule
+        """
+        model, inputs = self._get_traceable()
+        logger = logging.getLogger(__name__)
+        logger.info("Tracing the model with torch.jit.trace ...")
+        with torch.no_grad():
+            return torch.jit.trace(model, (inputs,), optimize=True)
+
+
+def export_caffe2_model(cfg, model, inputs):
+    """
+    Export a detectron2 model to caffe2 format.
+
+    Args:
+        cfg (CfgNode): a detectron2 config, with extra export-related options
+            added by :func:`add_export_config`.
+        model (nn.Module): a model built by
+            :func:`detectron2.modeling.build_model`.
+            It will be modified by this function.
+        inputs: sample inputs that the given model takes for inference.
+            Will be used to trace the model.
+
+    Returns:
+        Caffe2Model
+    """
+    return Caffe2Tracer(cfg, model, inputs).export_caffe2()
+
+
+def export_onnx_model(cfg, model, inputs):
+    """
+    Export a detectron2 model to ONNX format.
+    Note that the exported model contains custom ops only available in caffe2, therefore it
+    cannot be directly executed by other runtime. Post-processing or transformation passes
+    may be applied on the model to accommodate different runtimes.
+    Args:
+        cfg (CfgNode): a detectron2 config, with extra export-related options
+            added by :func:`add_export_config`.
+        model (nn.Module): a model built by
+            :func:`detectron2.modeling.build_model`.
+            It will be modified by this function.
+        inputs: sample inputs that the given model takes for inference.
+            Will be used to trace the model.
+    Returns:
+        onnx.ModelProto: an onnx model.
+    """
+    return Caffe2Tracer(cfg, model, inputs).export_onnx()
+
+
+class Caffe2Model(nn.Module):
+    """
+    A wrapper around the traced model in caffe2's pb format.
+    """
+
+    def __init__(self, predict_net, init_net):
+        super().__init__()
+        self.eval()  # always in eval mode
+        self._predict_net = predict_net
+        self._init_net = init_net
+        self._predictor = None
+
+    @property
+    def predict_net(self):
+        """
+        Returns:
+            core.Net: the underlying caffe2 predict net
+        """
+        return self._predict_net
+
+    @property
+    def init_net(self):
+        """
+        Returns:
+            core.Net: the underlying caffe2 init net
+        """
+        return self._init_net
+
+    __init__.__HIDE_SPHINX_DOC__ = True
+
+    def save_protobuf(self, output_dir):
+        """
+        Save the model as caffe2's protobuf format.
+
+        Args:
+            output_dir (str): the output directory to save protobuf files.
+        """
+        logger = logging.getLogger(__name__)
+        logger.info("Saving model to {} ...".format(output_dir))
+        os.makedirs(output_dir, exist_ok=True)
+
+        with open(os.path.join(output_dir, "model.pb"), "wb") as f:
+            f.write(self._predict_net.SerializeToString())
+        with open(os.path.join(output_dir, "model.pbtxt"), "w") as f:
+            f.write(str(self._predict_net))
+        with open(os.path.join(output_dir, "model_init.pb"), "wb") as f:
+            f.write(self._init_net.SerializeToString())
+
+    def save_graph(self, output_file, inputs=None):
+        """
+        Save the graph as SVG format.
+
+        Args:
+            output_file (str): a SVG file
+            inputs: optional inputs given to the model.
+                If given, the inputs will be used to run the graph to record
+                shape of every tensor. The shape information will be
+                saved together with the graph.
+        """
+        if inputs is None:
+            save_graph(self._predict_net, output_file, op_only=False)
+        else:
+            size_divisibility = get_pb_arg_vali(self._predict_net, "size_divisibility", 0)
+            device = get_pb_arg_vals(self._predict_net, "device", b"cpu").decode("ascii")
+            inputs = convert_batched_inputs_to_c2_format(inputs, size_divisibility, device)
+            inputs = [x.cpu().numpy() for x in inputs]
+            run_and_save_graph(self._predict_net, self._init_net, inputs, output_file)
+
+    @staticmethod
+    def load_protobuf(dir):
+        """
+        Args:
+            dir (str): a directory used to save Caffe2Model with
+                :meth:`save_protobuf`.
+                The files "model.pb" and "model_init.pb" are needed.
+
+        Returns:
+            Caffe2Model: the caffe2 model loaded from this directory.
+        """
+        predict_net = caffe2_pb2.NetDef()
+        with open(os.path.join(dir, "model.pb"), "rb") as f:
+            predict_net.ParseFromString(f.read())
+
+        init_net = caffe2_pb2.NetDef()
+        with open(os.path.join(dir, "model_init.pb"), "rb") as f:
+            init_net.ParseFromString(f.read())
+
+        return Caffe2Model(predict_net, init_net)
+
+    def __call__(self, inputs):
+        """
+        An interface that wraps around a caffe2 model and mimics detectron2's models'
+        input & output format. This is used to compare the outputs of caffe2 model
+        with its original torch model.
+
+        Due to the extra conversion between torch/caffe2,
+        this method is not meant for benchmark.
+        """
+        if self._predictor is None:
+            self._predictor = ProtobufDetectionModel(self._predict_net, self._init_net)
+        return self._predictor(inputs)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/c10.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/c10.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e3cbe3ce94d0c56596c645b8c85592ed5d31fe1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/c10.py
@@ -0,0 +1,503 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import math
+import torch
+import torch.nn.functional as F
+
+from detectron2.layers import cat
+from detectron2.layers.roi_align_rotated import ROIAlignRotated
+from detectron2.modeling import poolers
+from detectron2.modeling.proposal_generator import rpn
+from detectron2.modeling.roi_heads.mask_head import mask_rcnn_inference
+from detectron2.structures import Boxes, ImageList, Instances, Keypoints
+
+from .shared import alias, to_device
+
+
+"""
+This file contains caffe2-compatible implementation of several detectrno2 components.
+"""
+
+
+class Caffe2Boxes(Boxes):
+    """
+    Representing a list of detectron2.structures.Boxes from minibatch, each box
+    is represented by a 5d vector (batch index + 4 coordinates), or a 6d vector
+    (batch index + 5 coordinates) for RotatedBoxes.
+    """
+
+    def __init__(self, tensor):
+        assert isinstance(tensor, torch.Tensor)
+        assert tensor.dim() == 2 and tensor.size(-1) in [4, 5, 6], tensor.size()
+        # TODO: make tensor immutable when dim is Nx5 for Boxes,
+        # and Nx6 for RotatedBoxes?
+        self.tensor = tensor
+
+
+# TODO clean up this class, maybe just extend Instances
+class InstancesList(object):
+    """
+    Tensor representation of a list of Instances object for a batch of images.
+
+    When dealing with a batch of images with Caffe2 ops, a list of bboxes
+    (instances) are usually represented by single Tensor with size
+    (sigma(Ni), 5) or (sigma(Ni), 4) plus a batch split Tensor. This class is
+    for providing common functions to convert between these two representations.
+    """
+
+    def __init__(self, im_info, indices, extra_fields=None):
+        # [N, 3] -> (H, W, Scale)
+        self.im_info = im_info
+        # [N,] -> indice of batch to which the instance belongs
+        self.indices = indices
+        # [N, ...]
+        self.batch_extra_fields = extra_fields or {}
+
+        self.image_size = self.im_info
+
+    def get_fields(self):
+        """ like `get_fields` in the Instances object,
+        but return each field in tensor representations """
+        ret = {}
+        for k, v in self.batch_extra_fields.items():
+            # if isinstance(v, torch.Tensor):
+            #     tensor_rep = v
+            # elif isinstance(v, (Boxes, Keypoints)):
+            #     tensor_rep = v.tensor
+            # else:
+            #     raise ValueError("Can't find tensor representation for: {}".format())
+            ret[k] = v
+        return ret
+
+    def has(self, name):
+        return name in self.batch_extra_fields
+
+    def set(self, name, value):
+        data_len = len(value)
+        if len(self.batch_extra_fields):
+            assert (
+                len(self) == data_len
+            ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self))
+        self.batch_extra_fields[name] = value
+
+    def __setattr__(self, name, val):
+        if name in ["im_info", "indices", "batch_extra_fields", "image_size"]:
+            super().__setattr__(name, val)
+        else:
+            self.set(name, val)
+
+    def __getattr__(self, name):
+        if name not in self.batch_extra_fields:
+            raise AttributeError("Cannot find field '{}' in the given Instances!".format(name))
+        return self.batch_extra_fields[name]
+
+    def __len__(self):
+        return len(self.indices)
+
+    def flatten(self):
+        ret = []
+        for _, v in self.batch_extra_fields.items():
+            if isinstance(v, (Boxes, Keypoints)):
+                ret.append(v.tensor)
+            else:
+                ret.append(v)
+        return ret
+
+    @staticmethod
+    def to_d2_instances_list(instances_list):
+        """
+        Convert InstancesList to List[Instances]. The input `instances_list` can
+        also be a List[Instances], in this case this method is a non-op.
+        """
+        if not isinstance(instances_list, InstancesList):
+            assert all(isinstance(x, Instances) for x in instances_list)
+            return instances_list
+
+        ret = []
+        for i, info in enumerate(instances_list.im_info):
+            instances = Instances(torch.Size([int(info[0].item()), int(info[1].item())]))
+
+            ids = instances_list.indices == i
+            for k, v in instances_list.batch_extra_fields.items():
+                if isinstance(v, torch.Tensor):
+                    instances.set(k, v[ids])
+                    continue
+                elif isinstance(v, Boxes):
+                    instances.set(k, v[ids, -4:])
+                    continue
+
+                target_type, tensor_source = v
+                assert isinstance(tensor_source, torch.Tensor)
+                assert tensor_source.shape[0] == instances_list.indices.shape[0]
+                tensor_source = tensor_source[ids]
+
+                if issubclass(target_type, Boxes):
+                    instances.set(k, Boxes(tensor_source[:, -4:]))
+                elif issubclass(target_type, Keypoints):
+                    instances.set(k, Keypoints(tensor_source))
+                elif issubclass(target_type, torch.Tensor):
+                    instances.set(k, tensor_source)
+                else:
+                    raise ValueError("Can't handle targe type: {}".format(target_type))
+
+            ret.append(instances)
+        return ret
+
+
+class Caffe2Compatible(object):
+    def _get_tensor_mode(self):
+        return self._tensor_mode
+
+    def _set_tensor_mode(self, v):
+        self._tensor_mode = v
+
+    tensor_mode = property(_get_tensor_mode, _set_tensor_mode)
+    """
+    If true, the model expects C2-style tensor only inputs/outputs format.
+    """
+
+
+class Caffe2RPN(Caffe2Compatible, rpn.RPN):
+    def forward(self, images, features, gt_instances=None):
+        assert not self.training
+
+        features = [features[f] for f in self.in_features]
+        objectness_logits_pred, anchor_deltas_pred = self.rpn_head(features)
+
+        assert isinstance(images, ImageList)
+        if self.tensor_mode:
+            im_info = images.image_sizes
+        else:
+            im_info = torch.Tensor(
+                [[im_sz[0], im_sz[1], torch.Tensor([1.0])] for im_sz in images.image_sizes]
+            ).to(images.tensor.device)
+        assert isinstance(im_info, torch.Tensor)
+
+        rpn_rois_list = []
+        rpn_roi_probs_list = []
+        for scores, bbox_deltas, cell_anchors_tensor, feat_stride in zip(
+            objectness_logits_pred,
+            anchor_deltas_pred,
+            iter(self.anchor_generator.cell_anchors),
+            self.anchor_generator.strides,
+        ):
+            scores = scores.detach()
+            bbox_deltas = bbox_deltas.detach()
+
+            rpn_rois, rpn_roi_probs = torch.ops._caffe2.GenerateProposals(
+                scores,
+                bbox_deltas,
+                im_info,
+                cell_anchors_tensor,
+                spatial_scale=1.0 / feat_stride,
+                pre_nms_topN=self.pre_nms_topk[self.training],
+                post_nms_topN=self.post_nms_topk[self.training],
+                nms_thresh=self.nms_thresh,
+                min_size=self.min_box_side_len,
+                # correct_transform_coords=True,  # deprecated argument
+                angle_bound_on=True,  # Default
+                angle_bound_lo=-180,
+                angle_bound_hi=180,
+                clip_angle_thresh=1.0,  # Default
+                legacy_plus_one=False,
+            )
+            rpn_rois_list.append(rpn_rois)
+            rpn_roi_probs_list.append(rpn_roi_probs)
+
+        # For FPN in D2, in RPN all proposals from different levels are concated
+        # together, ranked and picked by top post_nms_topk. Then in ROIPooler
+        # it calculates level_assignments and calls the RoIAlign from
+        # the corresponding level.
+
+        if len(objectness_logits_pred) == 1:
+            rpn_rois = rpn_rois_list[0]
+            rpn_roi_probs = rpn_roi_probs_list[0]
+        else:
+            assert len(rpn_rois_list) == len(rpn_roi_probs_list)
+            rpn_post_nms_topN = self.post_nms_topk[self.training]
+
+            device = rpn_rois_list[0].device
+            input_list = [to_device(x, "cpu") for x in (rpn_rois_list + rpn_roi_probs_list)]
+
+            # TODO remove this after confirming rpn_max_level/rpn_min_level
+            # is not needed in CollectRpnProposals.
+            feature_strides = list(self.anchor_generator.strides)
+            rpn_min_level = int(math.log2(feature_strides[0]))
+            rpn_max_level = int(math.log2(feature_strides[-1]))
+            assert (rpn_max_level - rpn_min_level + 1) == len(
+                rpn_rois_list
+            ), "CollectRpnProposals requires continuous levels"
+
+            rpn_rois = torch.ops._caffe2.CollectRpnProposals(
+                input_list,
+                # NOTE: in current implementation, rpn_max_level and rpn_min_level
+                # are not needed, only the subtraction of two matters and it
+                # can be infer from the number of inputs. Keep them now for
+                # consistency.
+                rpn_max_level=2 + len(rpn_rois_list) - 1,
+                rpn_min_level=2,
+                rpn_post_nms_topN=rpn_post_nms_topN,
+            )
+            rpn_rois = to_device(rpn_rois, device)
+            rpn_roi_probs = []
+
+        proposals = self.c2_postprocess(im_info, rpn_rois, rpn_roi_probs, self.tensor_mode)
+        return proposals, {}
+
+    @staticmethod
+    def c2_postprocess(im_info, rpn_rois, rpn_roi_probs, tensor_mode):
+        proposals = InstancesList(
+            im_info=im_info,
+            indices=rpn_rois[:, 0],
+            extra_fields={
+                "proposal_boxes": Caffe2Boxes(rpn_rois),
+                "objectness_logits": (torch.Tensor, rpn_roi_probs),
+            },
+        )
+        if not tensor_mode:
+            proposals = InstancesList.to_d2_instances_list(proposals)
+        else:
+            proposals = [proposals]
+        return proposals
+
+
+class Caffe2ROIPooler(Caffe2Compatible, poolers.ROIPooler):
+    @staticmethod
+    def c2_preprocess(box_lists):
+        assert all(isinstance(x, Boxes) for x in box_lists)
+        if all(isinstance(x, Caffe2Boxes) for x in box_lists):
+            # input is pure-tensor based
+            assert len(box_lists) == 1
+            pooler_fmt_boxes = box_lists[0].tensor
+        else:
+            pooler_fmt_boxes = poolers.convert_boxes_to_pooler_format(box_lists)
+        return pooler_fmt_boxes
+
+    def forward(self, x, box_lists):
+        assert not self.training
+
+        pooler_fmt_boxes = self.c2_preprocess(box_lists)
+        num_level_assignments = len(self.level_poolers)
+
+        if num_level_assignments == 1:
+            if isinstance(self.level_poolers[0], ROIAlignRotated):
+                c2_roi_align = torch.ops._caffe2.RoIAlignRotated
+                aligned = True
+            else:
+                c2_roi_align = torch.ops._caffe2.RoIAlign
+                aligned = self.level_poolers[0].aligned
+
+            out = c2_roi_align(
+                x[0],
+                pooler_fmt_boxes,
+                order="NCHW",
+                spatial_scale=float(self.level_poolers[0].spatial_scale),
+                pooled_h=int(self.output_size[0]),
+                pooled_w=int(self.output_size[1]),
+                sampling_ratio=int(self.level_poolers[0].sampling_ratio),
+                aligned=aligned,
+            )
+            return out
+
+        device = pooler_fmt_boxes.device
+        assert (
+            self.max_level - self.min_level + 1 == 4
+        ), "Currently DistributeFpnProposals only support 4 levels"
+        fpn_outputs = torch.ops._caffe2.DistributeFpnProposals(
+            to_device(pooler_fmt_boxes, "cpu"),
+            roi_canonical_scale=self.canonical_box_size,
+            roi_canonical_level=self.canonical_level,
+            roi_max_level=self.max_level,
+            roi_min_level=self.min_level,
+            legacy_plus_one=False,
+        )
+        fpn_outputs = [to_device(x, device) for x in fpn_outputs]
+
+        rois_fpn_list = fpn_outputs[:-1]
+        rois_idx_restore_int32 = fpn_outputs[-1]
+
+        roi_feat_fpn_list = []
+        for roi_fpn, x_level, pooler in zip(rois_fpn_list, x, self.level_poolers):
+            if isinstance(pooler, ROIAlignRotated):
+                c2_roi_align = torch.ops._caffe2.RoIAlignRotated
+                aligned = True
+            else:
+                c2_roi_align = torch.ops._caffe2.RoIAlign
+                aligned = bool(pooler.aligned)
+
+            roi_feat_fpn = c2_roi_align(
+                x_level,
+                roi_fpn,
+                order="NCHW",
+                spatial_scale=float(pooler.spatial_scale),
+                pooled_h=int(self.output_size[0]),
+                pooled_w=int(self.output_size[1]),
+                sampling_ratio=int(pooler.sampling_ratio),
+                aligned=aligned,
+            )
+            roi_feat_fpn_list.append(roi_feat_fpn)
+
+        roi_feat_shuffled = cat(roi_feat_fpn_list, dim=0)
+        roi_feat = torch.ops._caffe2.BatchPermutation(roi_feat_shuffled, rois_idx_restore_int32)
+        return roi_feat
+
+
+class Caffe2FastRCNNOutputsInference:
+    def __init__(self, tensor_mode):
+        self.tensor_mode = tensor_mode  # whether the output is caffe2 tensor mode
+
+    def __call__(self, box_predictor, predictions, proposals):
+        """ equivalent to FastRCNNOutputLayers.inference """
+        score_thresh = box_predictor.test_score_thresh
+        nms_thresh = box_predictor.test_nms_thresh
+        topk_per_image = box_predictor.test_topk_per_image
+        is_rotated = len(box_predictor.box2box_transform.weights) == 5
+
+        if is_rotated:
+            box_dim = 5
+            assert box_predictor.box2box_transform.weights[4] == 1, (
+                "The weights for Rotated BBoxTransform in C2 have only 4 dimensions,"
+                + " thus enforcing the angle weight to be 1 for now"
+            )
+            box2box_transform_weights = box_predictor.box2box_transform.weights[:4]
+        else:
+            box_dim = 4
+            box2box_transform_weights = box_predictor.box2box_transform.weights
+
+        class_logits, box_regression = predictions
+        class_prob = F.softmax(class_logits, -1)
+
+        assert box_regression.shape[1] % box_dim == 0
+        cls_agnostic_bbox_reg = box_regression.shape[1] // box_dim == 1
+
+        input_tensor_mode = proposals[0].proposal_boxes.tensor.shape[1] == box_dim + 1
+
+        rois = type(proposals[0].proposal_boxes).cat([p.proposal_boxes for p in proposals])
+        device, dtype = rois.tensor.device, rois.tensor.dtype
+        if input_tensor_mode:
+            im_info = proposals[0].image_size
+            rois = rois.tensor
+        else:
+            im_info = torch.Tensor(
+                [[sz[0], sz[1], 1.0] for sz in [x.image_size for x in proposals]]
+            )
+            batch_ids = cat(
+                [
+                    torch.full((b, 1), i, dtype=dtype, device=device)
+                    for i, b in enumerate(len(p) for p in proposals)
+                ],
+                dim=0,
+            )
+            rois = torch.cat([batch_ids, rois.tensor], dim=1)
+
+        roi_pred_bbox, roi_batch_splits = torch.ops._caffe2.BBoxTransform(
+            to_device(rois, "cpu"),
+            to_device(box_regression, "cpu"),
+            to_device(im_info, "cpu"),
+            weights=box2box_transform_weights,
+            apply_scale=True,
+            rotated=is_rotated,
+            angle_bound_on=True,
+            angle_bound_lo=-180,
+            angle_bound_hi=180,
+            clip_angle_thresh=1.0,
+            legacy_plus_one=False,
+        )
+        roi_pred_bbox = to_device(roi_pred_bbox, device)
+        roi_batch_splits = to_device(roi_batch_splits, device)
+
+        nms_outputs = torch.ops._caffe2.BoxWithNMSLimit(
+            to_device(class_prob, "cpu"),
+            to_device(roi_pred_bbox, "cpu"),
+            to_device(roi_batch_splits, "cpu"),
+            score_thresh=float(score_thresh),
+            nms=float(nms_thresh),
+            detections_per_im=int(topk_per_image),
+            soft_nms_enabled=False,
+            soft_nms_method="linear",
+            soft_nms_sigma=0.5,
+            soft_nms_min_score_thres=0.001,
+            rotated=is_rotated,
+            cls_agnostic_bbox_reg=cls_agnostic_bbox_reg,
+            input_boxes_include_bg_cls=False,
+            output_classes_include_bg_cls=False,
+            legacy_plus_one=False,
+        )
+        roi_score_nms = to_device(nms_outputs[0], device)
+        roi_bbox_nms = to_device(nms_outputs[1], device)
+        roi_class_nms = to_device(nms_outputs[2], device)
+        roi_batch_splits_nms = to_device(nms_outputs[3], device)
+        roi_keeps_nms = to_device(nms_outputs[4], device)
+        roi_keeps_size_nms = to_device(nms_outputs[5], device)
+        if not self.tensor_mode:
+            roi_class_nms = roi_class_nms.to(torch.int64)
+
+        roi_batch_ids = cat(
+            [
+                torch.full((b, 1), i, dtype=dtype, device=device)
+                for i, b in enumerate(int(x.item()) for x in roi_batch_splits_nms)
+            ],
+            dim=0,
+        )
+
+        roi_class_nms = alias(roi_class_nms, "class_nms")
+        roi_score_nms = alias(roi_score_nms, "score_nms")
+        roi_bbox_nms = alias(roi_bbox_nms, "bbox_nms")
+        roi_batch_splits_nms = alias(roi_batch_splits_nms, "batch_splits_nms")
+        roi_keeps_nms = alias(roi_keeps_nms, "keeps_nms")
+        roi_keeps_size_nms = alias(roi_keeps_size_nms, "keeps_size_nms")
+
+        results = InstancesList(
+            im_info=im_info,
+            indices=roi_batch_ids[:, 0],
+            extra_fields={
+                "pred_boxes": Caffe2Boxes(roi_bbox_nms),
+                "scores": roi_score_nms,
+                "pred_classes": roi_class_nms,
+            },
+        )
+
+        if not self.tensor_mode:
+            results = InstancesList.to_d2_instances_list(results)
+            batch_splits = roi_batch_splits_nms.int().tolist()
+            kept_indices = list(roi_keeps_nms.to(torch.int64).split(batch_splits))
+        else:
+            results = [results]
+            kept_indices = [roi_keeps_nms]
+
+        return results, kept_indices
+
+
+class Caffe2MaskRCNNInference:
+    def __call__(self, pred_mask_logits, pred_instances):
+        """ equivalent to mask_head.mask_rcnn_inference """
+        if all(isinstance(x, InstancesList) for x in pred_instances):
+            assert len(pred_instances) == 1
+            mask_probs_pred = pred_mask_logits.sigmoid()
+            mask_probs_pred = alias(mask_probs_pred, "mask_fcn_probs")
+            pred_instances[0].pred_masks = mask_probs_pred
+        else:
+            mask_rcnn_inference(pred_mask_logits, pred_instances)
+
+
+class Caffe2KeypointRCNNInference:
+    def __init__(self, use_heatmap_max_keypoint):
+        self.use_heatmap_max_keypoint = use_heatmap_max_keypoint
+
+    def __call__(self, pred_keypoint_logits, pred_instances):
+        # just return the keypoint heatmap for now,
+        # there will be option to call HeatmapMaxKeypointOp
+        output = alias(pred_keypoint_logits, "kps_score")
+        if all(isinstance(x, InstancesList) for x in pred_instances):
+            assert len(pred_instances) == 1
+            if self.use_heatmap_max_keypoint:
+                device = output.device
+                output = torch.ops._caffe2.HeatmapMaxKeypoint(
+                    to_device(output, "cpu"),
+                    pred_instances[0].pred_boxes.tensor,
+                    should_output_softmax=True,  # worth make it configerable?
+                )
+                output = to_device(output, device)
+                output = alias(output, "keypoints_out")
+            pred_instances[0].pred_keypoints = output
+        return pred_keypoint_logits
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/caffe2_export.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/caffe2_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccac809d7bf49ab144b5f0a34f57e00c3534ad60
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/caffe2_export.py
@@ -0,0 +1,204 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import copy
+import io
+import logging
+import numpy as np
+from typing import List
+import onnx
+import torch
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core
+from caffe2.python.onnx.backend import Caffe2Backend
+from tabulate import tabulate
+from termcolor import colored
+from torch.onnx import OperatorExportTypes
+
+from .shared import (
+    ScopedWS,
+    construct_init_net_from_params,
+    fuse_alias_placeholder,
+    fuse_copy_between_cpu_and_gpu,
+    get_params_from_init_net,
+    group_norm_replace_aten_with_caffe2,
+    infer_device_type,
+    remove_dead_end_ops,
+    remove_reshape_for_fc,
+    save_graph,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def export_onnx_model(model, inputs):
+    """
+    Trace and export a model to onnx format.
+
+    Args:
+        model (nn.Module):
+        inputs (tuple[args]): the model will be called by `model(*inputs)`
+
+    Returns:
+        an onnx model
+    """
+    assert isinstance(model, torch.nn.Module)
+
+    # make sure all modules are in eval mode, onnx may change the training state
+    # of the module if the states are not consistent
+    def _check_eval(module):
+        assert not module.training
+
+    model.apply(_check_eval)
+
+    # Export the model to ONNX
+    with torch.no_grad():
+        with io.BytesIO() as f:
+            torch.onnx.export(
+                model,
+                inputs,
+                f,
+                operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
+                # verbose=True,  # NOTE: uncomment this for debugging
+                # export_params=True,
+            )
+            onnx_model = onnx.load_from_string(f.getvalue())
+
+    # Apply ONNX's Optimization
+    all_passes = onnx.optimizer.get_available_passes()
+    passes = ["fuse_bn_into_conv"]
+    assert all(p in all_passes for p in passes)
+    onnx_model = onnx.optimizer.optimize(onnx_model, passes)
+    return onnx_model
+
+
+def _op_stats(net_def):
+    type_count = {}
+    for t in [op.type for op in net_def.op]:
+        type_count[t] = type_count.get(t, 0) + 1
+    type_count_list = sorted(type_count.items(), key=lambda kv: kv[0])  # alphabet
+    type_count_list = sorted(type_count_list, key=lambda kv: -kv[1])  # count
+    return "\n".join("{:>4}x {}".format(count, name) for name, count in type_count_list)
+
+
+def _assign_device_option(
+    predict_net: caffe2_pb2.NetDef, init_net: caffe2_pb2.NetDef, tensor_inputs: List[torch.Tensor]
+):
+    """
+    ONNX exported network doesn't have concept of device, assign necessary
+    device option for each op in order to make it runable on GPU runtime.
+    """
+
+    def _get_device_type(torch_tensor):
+        assert torch_tensor.device.type in ["cpu", "cuda"]
+        assert torch_tensor.device.index == 0
+        return torch_tensor.device.type
+
+    def _assign_op_device_option(net_proto, net_ssa, blob_device_types):
+        for op, ssa_i in zip(net_proto.op, net_ssa):
+            if op.type in ["CopyCPUToGPU", "CopyGPUToCPU"]:
+                op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
+            else:
+                devices = [blob_device_types[b] for b in ssa_i[0] + ssa_i[1]]
+                assert all(d == devices[0] for d in devices)
+                if devices[0] == "cuda":
+                    op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
+
+    # update ops in predict_net
+    predict_net_input_device_types = {
+        (name, 0): _get_device_type(tensor)
+        for name, tensor in zip(predict_net.external_input, tensor_inputs)
+    }
+    predict_net_device_types = infer_device_type(
+        predict_net, known_status=predict_net_input_device_types, device_name_style="pytorch"
+    )
+    predict_net_ssa, _ = core.get_ssa(predict_net)
+    _assign_op_device_option(predict_net, predict_net_ssa, predict_net_device_types)
+
+    # update ops in init_net
+    init_net_ssa, versions = core.get_ssa(init_net)
+    init_net_output_device_types = {
+        (name, versions[name]): predict_net_device_types[(name, 0)]
+        for name in init_net.external_output
+    }
+    init_net_device_types = infer_device_type(
+        init_net, known_status=init_net_output_device_types, device_name_style="pytorch"
+    )
+    _assign_op_device_option(init_net, init_net_ssa, init_net_device_types)
+
+
+def export_caffe2_detection_model(model: torch.nn.Module, tensor_inputs: List[torch.Tensor]):
+    """
+    Export a caffe2-compatible Detectron2 model to caffe2 format via ONNX.
+
+    Arg:
+        model: a caffe2-compatible version of detectron2 model, defined in caffe2_modeling.py
+        tensor_inputs: a list of tensors that caffe2 model takes as input.
+    """
+    model = copy.deepcopy(model)
+    assert isinstance(model, torch.nn.Module)
+    assert hasattr(model, "encode_additional_info")
+
+    # Export via ONNX
+    logger.info("Exporting a {} model via ONNX ...".format(type(model).__name__))
+    onnx_model = export_onnx_model(model, (tensor_inputs,))
+    # Convert ONNX model to Caffe2 protobuf
+    init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model)
+    ops_table = [[op.type, op.input, op.output] for op in predict_net.op]
+    table = tabulate(ops_table, headers=["type", "input", "output"], tablefmt="pipe")
+    logger.info(
+        "ONNX export Done. Exported predict_net (before optimizations):\n" + colored(table, "cyan")
+    )
+
+    # Apply protobuf optimization
+    fuse_alias_placeholder(predict_net, init_net)
+    if any(t.device.type != "cpu" for t in tensor_inputs):
+        fuse_copy_between_cpu_and_gpu(predict_net)
+        remove_dead_end_ops(init_net)
+        _assign_device_option(predict_net, init_net, tensor_inputs)
+    params, device_options = get_params_from_init_net(init_net)
+    predict_net, params = remove_reshape_for_fc(predict_net, params)
+    init_net = construct_init_net_from_params(params, device_options)
+    group_norm_replace_aten_with_caffe2(predict_net)
+
+    # Record necessary information for running the pb model in Detectron2 system.
+    model.encode_additional_info(predict_net, init_net)
+
+    logger.info("Operators used in predict_net: \n{}".format(_op_stats(predict_net)))
+    logger.info("Operators used in init_net: \n{}".format(_op_stats(init_net)))
+
+    return predict_net, init_net
+
+
+def run_and_save_graph(predict_net, init_net, tensor_inputs, graph_save_path):
+    """
+    Run the caffe2 model on given inputs, recording the shape and draw the graph.
+
+    predict_net/init_net: caffe2 model.
+    tensor_inputs: a list of tensors that caffe2 model takes as input.
+    graph_save_path: path for saving graph of exported model.
+    """
+
+    logger.info("Saving graph of ONNX exported model to {} ...".format(graph_save_path))
+    save_graph(predict_net, graph_save_path, op_only=False)
+
+    # Run the exported Caffe2 net
+    logger.info("Running ONNX exported model ...")
+    with ScopedWS("__ws_tmp__", True) as ws:
+        ws.RunNetOnce(init_net)
+        initialized_blobs = set(ws.Blobs())
+        uninitialized = [inp for inp in predict_net.external_input if inp not in initialized_blobs]
+        for name, blob in zip(uninitialized, tensor_inputs):
+            ws.FeedBlob(name, blob)
+
+        try:
+            ws.RunNetOnce(predict_net)
+        except RuntimeError as e:
+            logger.warning("Encountered RuntimeError: \n{}".format(str(e)))
+
+        ws_blobs = {b: ws.FetchBlob(b) for b in ws.Blobs()}
+        blob_sizes = {b: ws_blobs[b].shape for b in ws_blobs if isinstance(ws_blobs[b], np.ndarray)}
+
+        logger.info("Saving graph with blob shapes to {} ...".format(graph_save_path))
+        save_graph(predict_net, graph_save_path, op_only=False, blob_sizes=blob_sizes)
+
+        return ws_blobs
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/caffe2_inference.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/caffe2_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..92718d04031b4513c2324ad596eae9cdbfa7c75e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/caffe2_inference.py
@@ -0,0 +1,136 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import collections
+import logging
+import numpy as np
+import torch
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core
+
+from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
+from .shared import ScopedWS, get_pb_arg_vali, get_pb_arg_vals, infer_device_type
+
+logger = logging.getLogger(__name__)
+
+
+class ProtobufModel(torch.nn.Module):
+    """
+    A class works just like nn.Module in terms of inference, but running
+    caffe2 model under the hood. Input/Output are Dict[str, tensor] whose keys
+    are in external_input/output.
+    """
+
+    def __init__(self, predict_net, init_net):
+        logger.info("Initializing ProtobufModel ...")
+        super().__init__()
+        assert isinstance(predict_net, caffe2_pb2.NetDef)
+        assert isinstance(init_net, caffe2_pb2.NetDef)
+        self.ws_name = "__ws_tmp__"
+        self.net = core.Net(predict_net)
+
+        with ScopedWS(self.ws_name, is_reset=True, is_cleanup=False) as ws:
+            ws.RunNetOnce(init_net)
+            for blob in self.net.Proto().external_input:
+                if blob not in ws.Blobs():
+                    ws.CreateBlob(blob)
+            ws.CreateNet(self.net)
+
+        self._error_msgs = set()
+
+    def forward(self, inputs_dict):
+        assert all(inp in self.net.Proto().external_input for inp in inputs_dict)
+        with ScopedWS(self.ws_name, is_reset=False, is_cleanup=False) as ws:
+            for b, tensor in inputs_dict.items():
+                ws.FeedBlob(b, tensor)
+            try:
+                ws.RunNet(self.net.Proto().name)
+            except RuntimeError as e:
+                if not str(e) in self._error_msgs:
+                    self._error_msgs.add(str(e))
+                    logger.warning("Encountered new RuntimeError: \n{}".format(str(e)))
+                logger.warning("Catch the error and use partial results.")
+
+            outputs_dict = collections.OrderedDict(
+                [(b, ws.FetchBlob(b)) for b in self.net.Proto().external_output]
+            )
+            # Remove outputs of current run, this is necessary in order to
+            # prevent fetching the result from previous run if the model fails
+            # in the middle.
+            for b in self.net.Proto().external_output:
+                # Needs to create uninitialized blob to make the net runable.
+                # This is "equivalent" to: ws.RemoveBlob(b) then ws.CreateBlob(b),
+                # but there'no such API.
+                ws.FeedBlob(b, "{}, a C++ native class of type nullptr (uninitialized).".format(b))
+
+        return outputs_dict
+
+
+class ProtobufDetectionModel(torch.nn.Module):
+    """
+    A class works just like a pytorch meta arch in terms of inference, but running
+    caffe2 model under the hood.
+    """
+
+    def __init__(self, predict_net, init_net, *, convert_outputs=None):
+        """
+        Args:
+            predict_net, init_net (core.Net): caffe2 nets
+            convert_outptus (callable): a function that converts caffe2
+                outputs to the same format of the original pytorch model.
+                By default, use the one defined in the caffe2 meta_arch.
+        """
+        super().__init__()
+        self.protobuf_model = ProtobufModel(predict_net, init_net)
+        self.size_divisibility = get_pb_arg_vali(predict_net, "size_divisibility", 0)
+        self.device = get_pb_arg_vals(predict_net, "device", b"cpu").decode("ascii")
+
+        if convert_outputs is None:
+            meta_arch = get_pb_arg_vals(predict_net, "meta_architecture", b"GeneralizedRCNN")
+            meta_arch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[meta_arch.decode("ascii")]
+            self._convert_outputs = meta_arch.get_outputs_converter(predict_net, init_net)
+        else:
+            self._convert_outputs = convert_outputs
+
+    def _infer_output_devices(self, inputs_dict):
+        def _get_device_type(torch_tensor):
+            assert torch_tensor.device.type in ["cpu", "cuda"]
+            assert torch_tensor.device.index == 0
+            return torch_tensor.device.type
+
+        predict_net = self.protobuf_model.net.Proto()
+        input_device_types = {
+            (name, 0): _get_device_type(tensor) for name, tensor in inputs_dict.items()
+        }
+        device_type_map = infer_device_type(
+            predict_net, known_status=input_device_types, device_name_style="pytorch"
+        )
+        ssa, versions = core.get_ssa(predict_net)
+        versioned_outputs = [(name, versions[name]) for name in predict_net.external_output]
+        output_devices = [device_type_map[outp] for outp in versioned_outputs]
+        return output_devices
+
+    def _convert_inputs(self, batched_inputs):
+        # currently all models convert inputs in the same way
+        data, im_info = convert_batched_inputs_to_c2_format(
+            batched_inputs, self.size_divisibility, self.device
+        )
+        return {"data": data, "im_info": im_info}
+
+    def forward(self, batched_inputs):
+        c2_inputs = self._convert_inputs(batched_inputs)
+        c2_results = self.protobuf_model(c2_inputs)
+
+        if any(t.device.type != "cpu" for _, t in c2_inputs.items()):
+            output_devices = self._infer_output_devices(c2_inputs)
+        else:
+            output_devices = ["cpu" for _ in self.protobuf_model.net.Proto().external_output]
+
+        def _cast_caffe2_blob_to_torch_tensor(blob, device):
+            return torch.Tensor(blob).to(device) if isinstance(blob, np.ndarray) else None
+
+        c2_results = {
+            name: _cast_caffe2_blob_to_torch_tensor(c2_results[name], device)
+            for name, device in zip(self.protobuf_model.net.Proto().external_output, output_devices)
+        }
+
+        return self._convert_outputs(batched_inputs, c2_inputs, c2_results)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/caffe2_modeling.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/caffe2_modeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..1732b322c75abc3ac178d61d31cdec4cdcd61dfd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/caffe2_modeling.py
@@ -0,0 +1,493 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import functools
+import io
+import struct
+import types
+import torch
+
+from detectron2.modeling import meta_arch
+from detectron2.modeling.box_regression import Box2BoxTransform
+from detectron2.modeling.meta_arch.panoptic_fpn import combine_semantic_and_instance_outputs
+from detectron2.modeling.postprocessing import detector_postprocess, sem_seg_postprocess
+from detectron2.modeling.roi_heads import keypoint_head
+from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
+
+from .c10 import Caffe2Compatible
+from .patcher import ROIHeadsPatcher, patch_generalized_rcnn
+from .shared import (
+    alias,
+    check_set_pb_arg,
+    get_pb_arg_floats,
+    get_pb_arg_valf,
+    get_pb_arg_vali,
+    get_pb_arg_vals,
+    mock_torch_nn_functional_interpolate,
+)
+
+
+def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
+    """
+    A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor])
+    to detectron2's format (i.e. list of Instances instance).
+    This only works when the model follows the Caffe2 detectron's naming convention.
+
+    Args:
+        image_sizes (List[List[int, int]]): [H, W] of every image.
+        tensor_outputs (Dict[str, Tensor]): external_output to its tensor.
+
+        force_mask_on (Bool): if true, the it make sure there'll be pred_masks even
+            if the mask is not found from tensor_outputs (usually due to model crash)
+    """
+
+    results = [Instances(image_size) for image_size in image_sizes]
+
+    batch_splits = tensor_outputs.get("batch_splits", None)
+    if batch_splits:
+        raise NotImplementedError()
+    assert len(image_sizes) == 1
+    result = results[0]
+
+    bbox_nms = tensor_outputs["bbox_nms"]
+    score_nms = tensor_outputs["score_nms"]
+    class_nms = tensor_outputs["class_nms"]
+    # Detection will always success because Conv support 0-batch
+    assert bbox_nms is not None
+    assert score_nms is not None
+    assert class_nms is not None
+    if bbox_nms.shape[1] == 5:
+        result.pred_boxes = RotatedBoxes(bbox_nms)
+    else:
+        result.pred_boxes = Boxes(bbox_nms)
+    result.scores = score_nms
+    result.pred_classes = class_nms.to(torch.int64)
+
+    mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
+    if mask_fcn_probs is not None:
+        # finish the mask pred
+        mask_probs_pred = mask_fcn_probs
+        num_masks = mask_probs_pred.shape[0]
+        class_pred = result.pred_classes
+        indices = torch.arange(num_masks, device=class_pred.device)
+        mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
+        result.pred_masks = mask_probs_pred
+    elif force_mask_on:
+        # NOTE: there's no way to know the height/width of mask here, it won't be
+        # used anyway when batch size is 0, so just set them to 0.
+        result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)
+
+    keypoints_out = tensor_outputs.get("keypoints_out", None)
+    kps_score = tensor_outputs.get("kps_score", None)
+    if keypoints_out is not None:
+        # keypoints_out: [N, 4, #kypoints], where 4 is in order of (x, y, score, prob)
+        keypoints_tensor = keypoints_out
+        # NOTE: it's possible that prob is not calculated if "should_output_softmax"
+        # is set to False in HeatmapMaxKeypoint, so just using raw score, seems
+        # it doesn't affect mAP. TODO: check more carefully.
+        keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
+        result.pred_keypoints = keypoint_xyp
+    elif kps_score is not None:
+        # keypoint heatmap to sparse data structure
+        pred_keypoint_logits = kps_score
+        keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])
+
+    return results
+
+
+def _cast_to_f32(f64):
+    return struct.unpack("f", struct.pack("f", f64))[0]
+
+
+def set_caffe2_compatible_tensor_mode(model, enable=True):
+    def _fn(m):
+        if isinstance(m, Caffe2Compatible):
+            m.tensor_mode = enable
+
+    model.apply(_fn)
+
+
+def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device):
+    """
+    See get_caffe2_inputs() below.
+    """
+    assert all(isinstance(x, dict) for x in batched_inputs)
+    assert all(x["image"].dim() == 3 for x in batched_inputs)
+
+    images = [x["image"] for x in batched_inputs]
+    images = ImageList.from_tensors(images, size_divisibility)
+
+    im_info = []
+    for input_per_image, image_size in zip(batched_inputs, images.image_sizes):
+        target_height = input_per_image.get("height", image_size[0])
+        target_width = input_per_image.get("width", image_size[1])  # noqa
+        # NOTE: The scale inside im_info is kept as convention and for providing
+        # post-processing information if further processing is needed. For
+        # current Caffe2 model definitions that don't include post-processing inside
+        # the model, this number is not used.
+        # NOTE: There can be a slight difference between width and height
+        # scales, using a single number can results in numerical difference
+        # compared with D2's post-processing.
+        scale = target_height / image_size[0]
+        im_info.append([image_size[0], image_size[1], scale])
+    im_info = torch.Tensor(im_info)
+
+    return images.tensor.to(device), im_info.to(device)
+
+
+class Caffe2MetaArch(Caffe2Compatible, torch.nn.Module):
+    """
+    Base class for caffe2-compatible implementation of a meta architecture.
+    The forward is traceable and its traced graph can be converted to caffe2
+    graph through ONNX.
+    """
+
+    def __init__(self, cfg, torch_model):
+        """
+        Args:
+            cfg (CfgNode):
+            torch_model (nn.Module): the detectron2 model (meta_arch) to be
+                converted.
+        """
+        super().__init__()
+        self._wrapped_model = torch_model
+        self.eval()
+        set_caffe2_compatible_tensor_mode(self, True)
+
+    def get_caffe2_inputs(self, batched_inputs):
+        """
+        Convert pytorch-style structured inputs to caffe2-style inputs that
+        are tuples of tensors.
+
+        Args:
+            batched_inputs (list[dict]): inputs to a detectron2 model
+                in its standard format. Each dict has "image" (CHW tensor), and optionally
+                "height" and "width".
+
+        Returns:
+            tuple[Tensor]:
+                tuple of tensors that will be the inputs to the
+                :meth:`forward` method. For existing models, the first
+                is an NCHW tensor (padded and batched); the second is
+                a im_info Nx3 tensor, where the rows are
+                (height, width, unused legacy parameter)
+        """
+        return convert_batched_inputs_to_c2_format(
+            batched_inputs,
+            self._wrapped_model.backbone.size_divisibility,
+            self._wrapped_model.device,
+        )
+
+    def encode_additional_info(self, predict_net, init_net):
+        """
+        Save extra metadata that will be used by inference in the output protobuf.
+        """
+        pass
+
+    def forward(self, inputs):
+        """
+        Run the forward in caffe2-style. It has to use caffe2-compatible ops
+        and the method will be used for tracing.
+
+        Args:
+            inputs (tuple[Tensor]): inputs defined by :meth:`get_caffe2_input`.
+                They will be the inputs of the converted caffe2 graph.
+
+        Returns:
+            tuple[Tensor]: output tensors. They will be the outputs of the
+                converted caffe2 graph.
+        """
+        raise NotImplementedError
+
+    def _caffe2_preprocess_image(self, inputs):
+        """
+        Caffe2 implementation of preprocess_image, which is called inside each MetaArch's forward.
+        It normalizes the input images, and the final caffe2 graph assumes the
+        inputs have been batched already.
+        """
+        data, im_info = inputs
+        data = alias(data, "data")
+        im_info = alias(im_info, "im_info")
+        mean, std = self._wrapped_model.pixel_mean, self._wrapped_model.pixel_std
+        normalized_data = (data - mean) / std
+        normalized_data = alias(normalized_data, "normalized_data")
+
+        # Pack (data, im_info) into ImageList which is recognized by self.inference.
+        images = ImageList(tensor=normalized_data, image_sizes=im_info)
+        return images
+
+    @staticmethod
+    def get_outputs_converter(predict_net, init_net):
+        """
+        Creates a function that converts outputs of the caffe2 model to
+        detectron2's standard format.
+        The function uses information in `predict_net` and `init_net` that are
+        available at inferene time. Therefore the function logic can be used in inference.
+
+        The returned function has the following signature:
+
+            def convert(batched_inputs, c2_inputs, c2_results) -> detectron2_outputs
+
+        Where
+
+            * batched_inputs (list[dict]): the original input format of the meta arch
+            * c2_inputs (dict[str, Tensor]): the caffe2 inputs.
+            * c2_results (dict[str, Tensor]): the caffe2 output format,
+                corresponding to the outputs of the :meth:`forward` function.
+            * detectron2_outputs: the original output format of the meta arch.
+
+        This function can be used to compare the outputs of the original meta arch and
+        the converted caffe2 graph.
+
+        Returns:
+            callable: a callable of the above signature.
+        """
+        raise NotImplementedError
+
+
+class Caffe2GeneralizedRCNN(Caffe2MetaArch):
+    def __init__(self, cfg, torch_model):
+        assert isinstance(torch_model, meta_arch.GeneralizedRCNN)
+        torch_model = patch_generalized_rcnn(torch_model)
+        super().__init__(cfg, torch_model)
+
+        self.roi_heads_patcher = ROIHeadsPatcher(cfg, self._wrapped_model.roi_heads)
+
+    def encode_additional_info(self, predict_net, init_net):
+        size_divisibility = self._wrapped_model.backbone.size_divisibility
+        check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
+        check_set_pb_arg(
+            predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
+        )
+        check_set_pb_arg(predict_net, "meta_architecture", "s", b"GeneralizedRCNN")
+
+    @mock_torch_nn_functional_interpolate()
+    def forward(self, inputs):
+        if not self.tensor_mode:
+            return self._wrapped_model.inference(inputs)
+        images = self._caffe2_preprocess_image(inputs)
+        features = self._wrapped_model.backbone(images.tensor)
+        proposals, _ = self._wrapped_model.proposal_generator(images, features)
+        with self.roi_heads_patcher.mock_roi_heads():
+            detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals)
+        return tuple(detector_results[0].flatten())
+
+    @staticmethod
+    def get_outputs_converter(predict_net, init_net):
+        def f(batched_inputs, c2_inputs, c2_results):
+            image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]
+            results = assemble_rcnn_outputs_by_name(image_sizes, c2_results)
+            return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
+
+        return f
+
+
+class Caffe2PanopticFPN(Caffe2MetaArch):
+    def __init__(self, cfg, torch_model):
+        assert isinstance(torch_model, meta_arch.PanopticFPN)
+        torch_model = patch_generalized_rcnn(torch_model)
+        super().__init__(cfg, torch_model)
+
+        self.roi_heads_patcher = ROIHeadsPatcher(cfg, self._wrapped_model.roi_heads)
+
+    @mock_torch_nn_functional_interpolate()
+    def forward(self, inputs):
+        assert self.tensor_mode
+        images = self._caffe2_preprocess_image(inputs)
+        features = self._wrapped_model.backbone(images.tensor)
+
+        sem_seg_results, _ = self._wrapped_model.sem_seg_head(features)
+        sem_seg_results = alias(sem_seg_results, "sem_seg")
+
+        proposals, _ = self._wrapped_model.proposal_generator(images, features)
+
+        with self.roi_heads_patcher.mock_roi_heads(self.tensor_mode):
+            detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals)
+
+        return tuple(detector_results[0].flatten()) + (sem_seg_results,)
+
+    def encode_additional_info(self, predict_net, init_net):
+        size_divisibility = self._wrapped_model.backbone.size_divisibility
+        check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
+        check_set_pb_arg(
+            predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
+        )
+        check_set_pb_arg(predict_net, "meta_architecture", "s", b"PanopticFPN")
+
+        # Inference parameters:
+        check_set_pb_arg(predict_net, "combine_on", "i", self._wrapped_model.combine_on)
+        check_set_pb_arg(
+            predict_net,
+            "combine_overlap_threshold",
+            "f",
+            _cast_to_f32(self._wrapped_model.combine_overlap_threshold),
+        )
+        check_set_pb_arg(
+            predict_net,
+            "combine_stuff_area_limit",
+            "i",
+            self._wrapped_model.combine_stuff_area_limit,
+        )
+        check_set_pb_arg(
+            predict_net,
+            "combine_instances_confidence_threshold",
+            "f",
+            _cast_to_f32(self._wrapped_model.combine_instances_confidence_threshold),
+        )
+
+    @staticmethod
+    def get_outputs_converter(predict_net, init_net):
+        combine_on = get_pb_arg_vali(predict_net, "combine_on", None)
+        combine_overlap_threshold = get_pb_arg_valf(predict_net, "combine_overlap_threshold", None)
+        combine_stuff_area_limit = get_pb_arg_vali(predict_net, "combine_stuff_area_limit", None)
+        combine_instances_confidence_threshold = get_pb_arg_valf(
+            predict_net, "combine_instances_confidence_threshold", None
+        )
+
+        def f(batched_inputs, c2_inputs, c2_results):
+            image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]
+            detector_results = assemble_rcnn_outputs_by_name(
+                image_sizes, c2_results, force_mask_on=True
+            )
+            sem_seg_results = c2_results["sem_seg"]
+
+            # copied from meta_arch/panoptic_fpn.py ...
+            processed_results = []
+            for sem_seg_result, detector_result, input_per_image, image_size in zip(
+                sem_seg_results, detector_results, batched_inputs, image_sizes
+            ):
+                height = input_per_image.get("height", image_size[0])
+                width = input_per_image.get("width", image_size[1])
+                sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
+                detector_r = detector_postprocess(detector_result, height, width)
+
+                processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})
+
+                if combine_on:
+                    panoptic_r = combine_semantic_and_instance_outputs(
+                        detector_r,
+                        sem_seg_r.argmax(dim=0),
+                        combine_overlap_threshold,
+                        combine_stuff_area_limit,
+                        combine_instances_confidence_threshold,
+                    )
+                    processed_results[-1]["panoptic_seg"] = panoptic_r
+            return processed_results
+
+        return f
+
+
+class Caffe2RetinaNet(Caffe2MetaArch):
+    def __init__(self, cfg, torch_model):
+        assert isinstance(torch_model, meta_arch.RetinaNet)
+        super().__init__(cfg, torch_model)
+
+    @mock_torch_nn_functional_interpolate()
+    def forward(self, inputs):
+        assert self.tensor_mode
+        images = self._caffe2_preprocess_image(inputs)
+
+        # explicitly return the images sizes to avoid removing "im_info" by ONNX
+        # since it's not used in the forward path
+        return_tensors = [images.image_sizes]
+
+        features = self._wrapped_model.backbone(images.tensor)
+        features = [features[f] for f in self._wrapped_model.in_features]
+        for i, feature_i in enumerate(features):
+            features[i] = alias(feature_i, "feature_{}".format(i), is_backward=True)
+            return_tensors.append(features[i])
+
+        box_cls, box_delta = self._wrapped_model.head(features)
+        for i, (box_cls_i, box_delta_i) in enumerate(zip(box_cls, box_delta)):
+            return_tensors.append(alias(box_cls_i, "box_cls_{}".format(i)))
+            return_tensors.append(alias(box_delta_i, "box_delta_{}".format(i)))
+
+        return tuple(return_tensors)
+
+    def encode_additional_info(self, predict_net, init_net):
+        size_divisibility = self._wrapped_model.backbone.size_divisibility
+        check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
+        check_set_pb_arg(
+            predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
+        )
+        check_set_pb_arg(predict_net, "meta_architecture", "s", b"RetinaNet")
+
+        # Inference parameters:
+        check_set_pb_arg(
+            predict_net, "score_threshold", "f", _cast_to_f32(self._wrapped_model.score_threshold)
+        )
+        check_set_pb_arg(predict_net, "topk_candidates", "i", self._wrapped_model.topk_candidates)
+        check_set_pb_arg(
+            predict_net, "nms_threshold", "f", _cast_to_f32(self._wrapped_model.nms_threshold)
+        )
+        check_set_pb_arg(
+            predict_net,
+            "max_detections_per_image",
+            "i",
+            self._wrapped_model.max_detections_per_image,
+        )
+
+        check_set_pb_arg(
+            predict_net,
+            "bbox_reg_weights",
+            "floats",
+            [_cast_to_f32(w) for w in self._wrapped_model.box2box_transform.weights],
+        )
+        self._encode_anchor_generator_cfg(predict_net)
+
+    def _encode_anchor_generator_cfg(self, predict_net):
+        # serialize anchor_generator for future use
+        serialized_anchor_generator = io.BytesIO()
+        torch.save(self._wrapped_model.anchor_generator, serialized_anchor_generator)
+        # Ideally we can put anchor generating inside the model, then we don't
+        # need to store this information.
+        bytes = serialized_anchor_generator.getvalue()
+        check_set_pb_arg(predict_net, "serialized_anchor_generator", "s", bytes)
+
+    @staticmethod
+    def get_outputs_converter(predict_net, init_net):
+        self = types.SimpleNamespace()
+        serialized_anchor_generator = io.BytesIO(
+            get_pb_arg_vals(predict_net, "serialized_anchor_generator", None)
+        )
+        self.anchor_generator = torch.load(serialized_anchor_generator)
+        bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None)
+        self.box2box_transform = Box2BoxTransform(weights=tuple(bbox_reg_weights))
+        self.score_threshold = get_pb_arg_valf(predict_net, "score_threshold", None)
+        self.topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None)
+        self.nms_threshold = get_pb_arg_valf(predict_net, "nms_threshold", None)
+        self.max_detections_per_image = get_pb_arg_vali(
+            predict_net, "max_detections_per_image", None
+        )
+
+        # hack to reuse inference code from RetinaNet
+        self.inference = functools.partial(meta_arch.RetinaNet.inference, self)
+        self.inference_single_image = functools.partial(
+            meta_arch.RetinaNet.inference_single_image, self
+        )
+
+        def f(batched_inputs, c2_inputs, c2_results):
+            image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]
+
+            num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")])
+            box_cls = [c2_results["box_cls_{}".format(i)] for i in range(num_features)]
+            box_delta = [c2_results["box_delta_{}".format(i)] for i in range(num_features)]
+
+            # For each feature level, feature should have the same batch size and
+            # spatial dimension as the box_cls and box_delta.
+            dummy_features = [box_delta[i].clone()[:, 0:0, :, :] for i in range(num_features)]
+            anchors = self.anchor_generator(dummy_features)
+
+            # self.num_classess can be inferred
+            self.num_classes = box_cls[0].shape[1] // (box_delta[0].shape[1] // 4)
+
+            results = self.inference(box_cls, box_delta, anchors, image_sizes)
+            return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
+
+        return f
+
+
+META_ARCH_CAFFE2_EXPORT_TYPE_MAP = {
+    "GeneralizedRCNN": Caffe2GeneralizedRCNN,
+    "PanopticFPN": Caffe2PanopticFPN,
+    "RetinaNet": Caffe2RetinaNet,
+}
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/patcher.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/patcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f0b0fd8122d12c10d06cfc1b0720e3c3374c737
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/patcher.py
@@ -0,0 +1,153 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import contextlib
+import mock
+import torch
+
+from detectron2.modeling import poolers
+from detectron2.modeling.proposal_generator import rpn
+from detectron2.modeling.roi_heads import keypoint_head, mask_head
+from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
+
+from .c10 import (
+    Caffe2Compatible,
+    Caffe2FastRCNNOutputsInference,
+    Caffe2KeypointRCNNInference,
+    Caffe2MaskRCNNInference,
+    Caffe2ROIPooler,
+    Caffe2RPN,
+)
+
+
+class GenericMixin(object):
+    pass
+
+
+class Caffe2CompatibleConverter(object):
+    """
+    A GenericUpdater which implements the `create_from` interface, by modifying
+    module object and assign it with another class replaceCls.
+    """
+
+    def __init__(self, replaceCls):
+        self.replaceCls = replaceCls
+
+    def create_from(self, module):
+        # update module's class to the new class
+        assert isinstance(module, torch.nn.Module)
+        if issubclass(self.replaceCls, GenericMixin):
+            # replaceCls should act as mixin, create a new class on-the-fly
+            new_class = type(
+                "{}MixedWith{}".format(self.replaceCls.__name__, module.__class__.__name__),
+                (self.replaceCls, module.__class__),
+                {},  # {"new_method": lambda self: ...},
+            )
+            module.__class__ = new_class
+        else:
+            # replaceCls is complete class, this allow arbitrary class swap
+            module.__class__ = self.replaceCls
+
+        # initialize Caffe2Compatible
+        if isinstance(module, Caffe2Compatible):
+            module.tensor_mode = False
+
+        return module
+
+
+def patch(model, target, updater, *args, **kwargs):
+    """
+    recursively (post-order) update all modules with the target type and its
+    subclasses, make a initialization/composition/inheritance/... via the
+    updater.create_from.
+    """
+    for name, module in model.named_children():
+        model._modules[name] = patch(module, target, updater, *args, **kwargs)
+    if isinstance(model, target):
+        return updater.create_from(model, *args, **kwargs)
+    return model
+
+
+def patch_generalized_rcnn(model):
+    ccc = Caffe2CompatibleConverter
+    model = patch(model, rpn.RPN, ccc(Caffe2RPN))
+    model = patch(model, poolers.ROIPooler, ccc(Caffe2ROIPooler))
+
+    return model
+
+
+@contextlib.contextmanager
+def mock_fastrcnn_outputs_inference(
+    tensor_mode, check=True, box_predictor_type=FastRCNNOutputLayers
+):
+    with mock.patch.object(
+        box_predictor_type,
+        "inference",
+        autospec=True,
+        side_effect=Caffe2FastRCNNOutputsInference(tensor_mode),
+    ) as mocked_func:
+        yield
+    if check:
+        assert mocked_func.call_count > 0
+
+
+@contextlib.contextmanager
+def mock_mask_rcnn_inference(tensor_mode, patched_module, check=True):
+    with mock.patch(
+        "{}.mask_rcnn_inference".format(patched_module), side_effect=Caffe2MaskRCNNInference()
+    ) as mocked_func:
+        yield
+    if check:
+        assert mocked_func.call_count > 0
+
+
+@contextlib.contextmanager
+def mock_keypoint_rcnn_inference(tensor_mode, patched_module, use_heatmap_max_keypoint, check=True):
+    with mock.patch(
+        "{}.keypoint_rcnn_inference".format(patched_module),
+        side_effect=Caffe2KeypointRCNNInference(use_heatmap_max_keypoint),
+    ) as mocked_func:
+        yield
+    if check:
+        assert mocked_func.call_count > 0
+
+
+class ROIHeadsPatcher:
+    def __init__(self, cfg, heads):
+        self.heads = heads
+
+        self.use_heatmap_max_keypoint = cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT
+
+    @contextlib.contextmanager
+    def mock_roi_heads(self, tensor_mode=True):
+        """
+        Patching several inference functions inside ROIHeads and its subclasses
+
+        Args:
+            tensor_mode (bool): whether the inputs/outputs are caffe2's tensor
+                format or not. Default to True.
+        """
+        # NOTE: this requries the `keypoint_rcnn_inference` and `mask_rcnn_inference`
+        # are called inside the same file as BaseXxxHead due to using mock.patch.
+        kpt_heads_mod = keypoint_head.BaseKeypointRCNNHead.__module__
+        mask_head_mod = mask_head.BaseMaskRCNNHead.__module__
+
+        mock_ctx_managers = [
+            mock_fastrcnn_outputs_inference(
+                tensor_mode=tensor_mode,
+                check=True,
+                box_predictor_type=type(self.heads.box_predictor),
+            )
+        ]
+        if getattr(self.heads, "keypoint_on", False):
+            mock_ctx_managers += [
+                mock_keypoint_rcnn_inference(
+                    tensor_mode, kpt_heads_mod, self.use_heatmap_max_keypoint
+                )
+            ]
+        if getattr(self.heads, "mask_on", False):
+            mock_ctx_managers += [mock_mask_rcnn_inference(tensor_mode, mask_head_mod)]
+
+        with contextlib.ExitStack() as stack:  # python 3.3+
+            for mgr in mock_ctx_managers:
+                stack.enter_context(mgr)
+            yield
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/shared.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/shared.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb7ffeb098f21178660572830164126fab63e0e1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/export/shared.py
@@ -0,0 +1,1034 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import collections
+import contextlib
+import copy
+import functools
+import logging
+import mock
+import numpy as np
+import os
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import caffe2.python.utils as putils
+import torch
+import torch.nn.functional as F
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core, net_drawer, workspace
+from torch.nn.functional import interpolate as interp
+
+logger = logging.getLogger(__name__)
+
+
+# ==== torch/utils_toffee/cast.py =======================================
+
+
+def to_device(t, device_str):
+    """
+    This function is a replacement of .to(another_device) such that it allows the
+    casting to be traced properly by explicitly calling the underlying copy ops.
+    It also avoids introducing unncessary op when casting to the same device.
+    """
+    src = t.device
+    dst = torch.device(device_str)
+
+    if src == dst:
+        return t
+    elif src.type == "cuda" and dst.type == "cpu":
+        return torch.ops._caffe2.CopyGPUToCPU(t)
+    elif src.type == "cpu" and dst.type == "cuda":
+        return torch.ops._caffe2.CopyCPUToGPU(t)
+    else:
+        raise RuntimeError("Can't cast tensor from device {} to device {}".format(src, dst))
+
+
+# ==== torch/utils_toffee/interpolate.py =======================================
+
+
+# Note: borrowed from vision/detection/fair/detectron/detectron/modeling/detector.py
+def BilinearInterpolation(tensor_in, up_scale):
+    assert up_scale % 2 == 0, "Scale should be even"
+
+    def upsample_filt(size):
+        factor = (size + 1) // 2
+        if size % 2 == 1:
+            center = factor - 1
+        else:
+            center = factor - 0.5
+
+        og = np.ogrid[:size, :size]
+        return (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor)
+
+    kernel_size = int(up_scale) * 2
+    bil_filt = upsample_filt(kernel_size)
+
+    dim = int(tensor_in.shape[1])
+    kernel = np.zeros((dim, dim, kernel_size, kernel_size), dtype=np.float32)
+    kernel[range(dim), range(dim), :, :] = bil_filt
+
+    tensor_out = F.conv_transpose2d(
+        tensor_in,
+        weight=to_device(torch.Tensor(kernel), tensor_in.device),
+        bias=None,
+        stride=int(up_scale),
+        padding=int(up_scale / 2),
+    )
+
+    return tensor_out
+
+
+# NOTE: ONNX is incompatible with traced torch.nn.functional.interpolate if
+# using dynamic `scale_factor` rather than static `size`. (T43166860)
+# NOTE: Caffe2 Int8 conversion might not be able to quantize `size` properly.
+def onnx_compatibale_interpolate(
+    input, size=None, scale_factor=None, mode="nearest", align_corners=None
+):
+    # NOTE: The input dimensions are interpreted in the form:
+    # `mini-batch x channels x [optional depth] x [optional height] x width`.
+    if size is None and scale_factor is not None:
+        if input.dim() == 4:
+            if isinstance(scale_factor, (int, float)):
+                height_scale, width_scale = (scale_factor, scale_factor)
+            else:
+                assert isinstance(scale_factor, (tuple, list))
+                assert len(scale_factor) == 2
+                height_scale, width_scale = scale_factor
+
+            assert not align_corners, "No matching C2 op for align_corners == True"
+            if mode == "nearest":
+                return torch.ops._caffe2.ResizeNearest(
+                    input, order="NCHW", width_scale=width_scale, height_scale=height_scale
+                )
+            elif mode == "bilinear":
+                logger.warning(
+                    "Use F.conv_transpose2d for bilinear interpolate"
+                    " because there's no such C2 op, this may cause significant"
+                    " slowdown and the boundary pixels won't be as same as"
+                    " using F.interpolate due to padding."
+                )
+                assert height_scale == width_scale
+                return BilinearInterpolation(input, up_scale=height_scale)
+        logger.warning("Output size is not static, it might cause ONNX conversion issue")
+
+    return interp(input, size, scale_factor, mode, align_corners)
+
+
+@contextlib.contextmanager
+def mock_torch_nn_functional_interpolate():
+    if torch.onnx.is_in_onnx_export():
+        with mock.patch(
+            "torch.nn.functional.interpolate", side_effect=onnx_compatibale_interpolate
+        ):
+            yield
+    else:
+        yield
+
+
+# ==== torch/utils_caffe2/ws_utils.py ==========================================
+
+
+class ScopedWS(object):
+    def __init__(self, ws_name, is_reset, is_cleanup=False):
+        self.ws_name = ws_name
+        self.is_reset = is_reset
+        self.is_cleanup = is_cleanup
+        self.org_ws = ""
+
+    def __enter__(self):
+        self.org_ws = workspace.CurrentWorkspace()
+        if self.ws_name is not None:
+            workspace.SwitchWorkspace(self.ws_name, True)
+        if self.is_reset:
+            workspace.ResetWorkspace()
+
+        return workspace
+
+    def __exit__(self, *args):
+        if self.is_cleanup:
+            workspace.ResetWorkspace()
+        if self.ws_name is not None:
+            workspace.SwitchWorkspace(self.org_ws)
+
+
+def fetch_any_blob(name):
+    bb = None
+    try:
+        bb = workspace.FetchBlob(name)
+    except TypeError:
+        bb = workspace.FetchInt8Blob(name)
+    except Exception as e:
+        logger.error("Get blob {} error: {}".format(name, e))
+
+    return bb
+
+
+# ==== torch/utils_caffe2/protobuf.py ==========================================
+
+
+def get_pb_arg(pb, arg_name):
+    for x in pb.arg:
+        if x.name == arg_name:
+            return x
+    return None
+
+
+def get_pb_arg_valf(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return arg.f if arg is not None else default_val
+
+
+def get_pb_arg_floats(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return list(map(float, arg.floats)) if arg is not None else default_val
+
+
+def get_pb_arg_ints(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return list(map(int, arg.ints)) if arg is not None else default_val
+
+
+def get_pb_arg_vali(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return arg.i if arg is not None else default_val
+
+
+def get_pb_arg_vals(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return arg.s if arg is not None else default_val
+
+
+def get_pb_arg_valstrings(pb, arg_name, default_val):
+    arg = get_pb_arg(pb, arg_name)
+    return list(arg.strings) if arg is not None else default_val
+
+
+def check_set_pb_arg(pb, arg_name, arg_attr, arg_value, allow_override=False):
+    arg = get_pb_arg(pb, arg_name)
+    if arg is None:
+        arg = putils.MakeArgument(arg_name, arg_value)
+        assert hasattr(arg, arg_attr)
+        pb.arg.extend([arg])
+    if allow_override and getattr(arg, arg_attr) != arg_value:
+        logger.warning(
+            "Override argument {}: {} -> {}".format(arg_name, getattr(arg, arg_attr), arg_value)
+        )
+        setattr(arg, arg_attr, arg_value)
+    else:
+        assert arg is not None
+        assert getattr(arg, arg_attr) == arg_value, "Existing value {}, new value {}".format(
+            getattr(arg, arg_attr), arg_value
+        )
+
+
+def _create_const_fill_op_from_numpy(name, tensor, device_option=None):
+    assert type(tensor) == np.ndarray
+    kTypeNameMapper = {
+        np.dtype("float32"): "GivenTensorFill",
+        np.dtype("int32"): "GivenTensorIntFill",
+        np.dtype("int64"): "GivenTensorInt64Fill",
+        np.dtype("uint8"): "GivenTensorStringFill",
+    }
+
+    args_dict = {}
+    if tensor.dtype == np.dtype("uint8"):
+        args_dict.update({"values": [str(tensor.data)], "shape": [1]})
+    else:
+        args_dict.update({"values": tensor, "shape": tensor.shape})
+
+    if device_option is not None:
+        args_dict["device_option"] = device_option
+
+    return core.CreateOperator(kTypeNameMapper[tensor.dtype], [], [name], **args_dict)
+
+
+def _create_const_fill_op_from_c2_int8_tensor(name, int8_tensor):
+    assert type(int8_tensor) == workspace.Int8Tensor
+    kTypeNameMapper = {
+        np.dtype("int32"): "Int8GivenIntTensorFill",
+        np.dtype("uint8"): "Int8GivenTensorFill",
+    }
+
+    tensor = int8_tensor.data
+    assert tensor.dtype in [np.dtype("uint8"), np.dtype("int32")]
+    values = tensor.tobytes() if tensor.dtype == np.dtype("uint8") else tensor
+
+    return core.CreateOperator(
+        kTypeNameMapper[tensor.dtype],
+        [],
+        [name],
+        values=values,
+        shape=tensor.shape,
+        Y_scale=int8_tensor.scale,
+        Y_zero_point=int8_tensor.zero_point,
+    )
+
+
+def create_const_fill_op(
+    name: str,
+    blob: Union[np.ndarray, workspace.Int8Tensor],
+    device_option: Optional[caffe2_pb2.DeviceOption] = None,
+) -> caffe2_pb2.OperatorDef:
+    """
+    Given a blob object, return the Caffe2 operator that creates this blob
+    as constant. Currently support NumPy tensor and Caffe2 Int8Tensor.
+    """
+
+    tensor_type = type(blob)
+    assert tensor_type in [
+        np.ndarray,
+        workspace.Int8Tensor,
+    ], 'Error when creating const fill op for "{}", unsupported blob type: {}'.format(
+        name, type(blob)
+    )
+
+    if tensor_type == np.ndarray:
+        return _create_const_fill_op_from_numpy(name, blob, device_option)
+    elif tensor_type == workspace.Int8Tensor:
+        assert device_option is None
+        return _create_const_fill_op_from_c2_int8_tensor(name, blob)
+
+
+def construct_init_net_from_params(
+    params: Dict[str, Any], device_options: Optional[Dict[str, caffe2_pb2.DeviceOption]] = None
+) -> caffe2_pb2.NetDef:
+    """
+    Construct the init_net from params dictionary
+    """
+    init_net = caffe2_pb2.NetDef()
+    device_options = device_options or {}
+    for name, blob in params.items():
+        if isinstance(blob, str):
+            logger.warning(
+                (
+                    "Blob {} with type {} is not supported in generating init net,"
+                    " skipped.".format(name, type(blob))
+                )
+            )
+            continue
+        init_net.op.extend(
+            [create_const_fill_op(name, blob, device_option=device_options.get(name, None))]
+        )
+        init_net.external_output.append(name)
+    return init_net
+
+
+def get_producer_map(ssa):
+    """
+    Return dict from versioned blob to (i, j),
+        where i is index of producer op, j is the index of output of that op.
+    """
+    producer_map = {}
+    for i in range(len(ssa)):
+        outputs = ssa[i][1]
+        for j, outp in enumerate(outputs):
+            producer_map[outp] = (i, j)
+    return producer_map
+
+
+def get_consumer_map(ssa):
+    """
+    Return dict from versioned blob to list of (i, j),
+        where i is index of consumer op, j is the index of input of that op.
+    """
+    consumer_map = collections.defaultdict(list)
+    for i in range(len(ssa)):
+        inputs = ssa[i][0]
+        for j, inp in enumerate(inputs):
+            consumer_map[inp].append((i, j))
+    return consumer_map
+
+
+def get_params_from_init_net(
+    init_net: caffe2_pb2.NetDef,
+) -> [Dict[str, Any], Dict[str, caffe2_pb2.DeviceOption]]:
+    """
+    Take the output blobs from init_net by running it.
+    Outputs:
+        params: dict from blob name to numpy array
+        device_options: dict from blob name to the device option of its creating op
+    """
+    # NOTE: this assumes that the params is determined by producer op with the
+    # only exception be CopyGPUToCPU which is CUDA op but returns CPU tensor.
+    def _get_device_option(producer_op):
+        if producer_op.type == "CopyGPUToCPU":
+            return caffe2_pb2.DeviceOption()
+        else:
+            return producer_op.device_option
+
+    with ScopedWS("__get_params_from_init_net__", is_reset=True, is_cleanup=True) as ws:
+        ws.RunNetOnce(init_net)
+        params = {b: fetch_any_blob(b) for b in init_net.external_output}
+    ssa, versions = core.get_ssa(init_net)
+    producer_map = get_producer_map(ssa)
+    device_options = {
+        b: _get_device_option(init_net.op[producer_map[(b, versions[b])][0]])
+        for b in init_net.external_output
+    }
+    return params, device_options
+
+
+def _updater_raise(op, input_types, output_types):
+    raise RuntimeError(
+        "Failed to apply updater for op {} given input_types {} and"
+        " output_types {}".format(op, input_types, output_types)
+    )
+
+
+def _generic_status_identifier(
+    predict_net: caffe2_pb2.NetDef,
+    status_updater: Callable,
+    known_status: Dict[Tuple[str, int], Any],
+) -> Dict[Tuple[str, int], Any]:
+    """
+    Statically infer the status of each blob, the status can be such as device type
+        (CPU/GPU), layout (NCHW/NHWC), data type (float32/int8), etc. "Blob" here
+        is versioned blob (Tuple[str, int]) in the format compatible with ssa.
+    Inputs:
+        predict_net: the caffe2 network
+        status_updater: a callable, given an op and the status of its input/output,
+            it returns the updated status of input/output. `None` is used for
+            representing unknown status.
+        known_status: a dict containing known status, used as initialization.
+    Outputs:
+        A dict mapping from versioned blob to its status
+    """
+    ssa, versions = core.get_ssa(predict_net)
+    versioned_ext_input = [(b, 0) for b in predict_net.external_input]
+    versioned_ext_output = [(b, versions[b]) for b in predict_net.external_output]
+    all_versioned_blobs = set().union(*[set(x[0] + x[1]) for x in ssa])
+
+    allowed_vbs = all_versioned_blobs.union(versioned_ext_input).union(versioned_ext_output)
+    assert all(k in allowed_vbs for k in known_status)
+    assert all(v is not None for v in known_status.values())
+    _known_status = copy.deepcopy(known_status)
+
+    def _check_and_update(key, value):
+        assert value is not None
+        if key in _known_status:
+            if not _known_status[key] == value:
+                raise RuntimeError(
+                    "Confilict status for {}, existing status {}, new status {}".format(
+                        key, _known_status[key], value
+                    )
+                )
+        _known_status[key] = value
+
+    def _update_i(op, ssa_i):
+        versioned_inputs = ssa_i[0]
+        versioned_outputs = ssa_i[1]
+
+        inputs_status = [_known_status.get(b, None) for b in versioned_inputs]
+        outputs_status = [_known_status.get(b, None) for b in versioned_outputs]
+
+        new_inputs_status, new_outputs_status = status_updater(op, inputs_status, outputs_status)
+
+        for versioned_blob, status in zip(
+            versioned_inputs + versioned_outputs, new_inputs_status + new_outputs_status
+        ):
+            if status is not None:
+                _check_and_update(versioned_blob, status)
+
+    for op, ssa_i in zip(predict_net.op, ssa):
+        _update_i(op, ssa_i)
+    for op, ssa_i in zip(reversed(predict_net.op), reversed(ssa)):
+        _update_i(op, ssa_i)
+
+    # NOTE: This strictly checks all the blob from predict_net must be assgined
+    # a known status. However sometimes it's impossible (eg. having deadend op),
+    # we may relax this constraint if
+    for k in all_versioned_blobs:
+        if k not in _known_status:
+            raise NotImplementedError(
+                "Can not infer the status for {}. Currently only support the case where"
+                " a single forward and backward pass can identify status for all blobs.".format(k)
+            )
+
+    return _known_status
+
+
+def infer_device_type(
+    predict_net: caffe2_pb2.NetDef,
+    known_status: Dict[Tuple[str, int], Any],
+    device_name_style: str = "caffe2",
+) -> Dict[Tuple[str, int], str]:
+    """ Return the device type ("cpu" or "gpu"/"cuda") of each (versioned) blob """
+
+    assert device_name_style in ["caffe2", "pytorch"]
+    _CPU_STR = "cpu"
+    _GPU_STR = "gpu" if device_name_style == "caffe2" else "cuda"
+
+    def _copy_cpu_to_gpu_updater(op, input_types, output_types):
+        if input_types[0] == _GPU_STR or output_types[0] == _CPU_STR:
+            _updater_raise(op, input_types, output_types)
+        return ([_CPU_STR], [_GPU_STR])
+
+    def _copy_gpu_to_cpu_updater(op, input_types, output_types):
+        if input_types[0] == _CPU_STR or output_types[0] == _GPU_STR:
+            _updater_raise(op, input_types, output_types)
+        return ([_GPU_STR], [_CPU_STR])
+
+    def _other_ops_updater(op, input_types, output_types):
+        non_none_types = [x for x in input_types + output_types if x is not None]
+        if len(non_none_types) > 0:
+            the_type = non_none_types[0]
+            if not all(x == the_type for x in non_none_types):
+                _updater_raise(op, input_types, output_types)
+        else:
+            the_type = None
+        return ([the_type for _ in op.input], [the_type for _ in op.output])
+
+    def _device_updater(op, *args, **kwargs):
+        return {
+            "CopyCPUToGPU": _copy_cpu_to_gpu_updater,
+            "CopyGPUToCPU": _copy_gpu_to_cpu_updater,
+        }.get(op.type, _other_ops_updater)(op, *args, **kwargs)
+
+    return _generic_status_identifier(predict_net, _device_updater, known_status)
+
+
+# ==== torch/utils_caffe2/vis.py ===============================================
+
+
+def _modify_blob_names(ops, blob_rename_f):
+    ret = []
+
+    def _replace_list(blob_list, replaced_list):
+        del blob_list[:]
+        blob_list.extend(replaced_list)
+
+    for x in ops:
+        cur = copy.deepcopy(x)
+        _replace_list(cur.input, list(map(blob_rename_f, cur.input)))
+        _replace_list(cur.output, list(map(blob_rename_f, cur.output)))
+        ret.append(cur)
+
+    return ret
+
+
+def _rename_blob(name, blob_sizes, blob_ranges):
+    def _list_to_str(bsize):
+        ret = ", ".join([str(x) for x in bsize])
+        ret = "[" + ret + "]"
+        return ret
+
+    ret = name
+    if blob_sizes is not None and name in blob_sizes:
+        ret += "\n" + _list_to_str(blob_sizes[name])
+    if blob_ranges is not None and name in blob_ranges:
+        ret += "\n" + _list_to_str(blob_ranges[name])
+
+    return ret
+
+
+# graph_name could not contain word 'graph'
+def save_graph(net, file_name, graph_name="net", op_only=True, blob_sizes=None, blob_ranges=None):
+    blob_rename_f = functools.partial(_rename_blob, blob_sizes=blob_sizes, blob_ranges=blob_ranges)
+    return save_graph_base(net, file_name, graph_name, op_only, blob_rename_f)
+
+
+def save_graph_base(net, file_name, graph_name="net", op_only=True, blob_rename_func=None):
+    graph = None
+    ops = net.op
+    if blob_rename_func is not None:
+        ops = _modify_blob_names(ops, blob_rename_func)
+    if not op_only:
+        graph = net_drawer.GetPydotGraph(ops, graph_name, rankdir="TB")
+    else:
+        graph = net_drawer.GetPydotGraphMinimal(
+            ops, graph_name, rankdir="TB", minimal_dependency=True
+        )
+
+    try:
+        par_dir = os.path.dirname(file_name)
+        if not os.path.exists(par_dir):
+            os.makedirs(par_dir)
+
+        format = os.path.splitext(os.path.basename(file_name))[-1]
+        if format == ".png":
+            graph.write_png(file_name)
+        elif format == ".pdf":
+            graph.write_pdf(file_name)
+        elif format == ".svg":
+            graph.write_svg(file_name)
+        else:
+            print("Incorrect format {}".format(format))
+    except Exception as e:
+        print("Error when writing graph to image {}".format(e))
+
+    return graph
+
+
+# ==== torch/utils_toffee/aten_to_caffe2.py ====================================
+
+
+def group_norm_replace_aten_with_caffe2(predict_net: caffe2_pb2.NetDef):
+    """
+    For ONNX exported model, GroupNorm will be represented as ATen op,
+        this can be a drop in replacement from ATen to GroupNorm
+    """
+    count = 0
+    for op in predict_net.op:
+        if op.type == "ATen":
+            op_name = get_pb_arg_vals(op, "operator", None)  # return byte in py3
+            if op_name and op_name.decode() == "group_norm":
+                op.arg.remove(get_pb_arg(op, "operator"))
+
+                if get_pb_arg_vali(op, "cudnn_enabled", None):
+                    op.arg.remove(get_pb_arg(op, "cudnn_enabled"))
+
+                num_groups = get_pb_arg_vali(op, "num_groups", None)
+                if num_groups is not None:
+                    op.arg.remove(get_pb_arg(op, "num_groups"))
+                    check_set_pb_arg(op, "group", "i", num_groups)
+
+                op.type = "GroupNorm"
+                count += 1
+    if count > 1:
+        logger.info("Replaced {} ATen operator to GroupNormOp".format(count))
+
+
+# ==== torch/utils_toffee/alias.py =============================================
+
+
+def alias(x, name, is_backward=False):
+    if not torch.onnx.is_in_onnx_export():
+        return x
+    assert isinstance(x, torch.Tensor)
+    return torch.ops._caffe2.AliasWithName(x, name, is_backward=is_backward)
+
+
+def fuse_alias_placeholder(predict_net, init_net):
+    """ Remove AliasWithName placeholder and rename the input/output of it """
+    # First we finish all the re-naming
+    for i, op in enumerate(predict_net.op):
+        if op.type == "AliasWithName":
+            assert len(op.input) == 1
+            assert len(op.output) == 1
+            name = get_pb_arg_vals(op, "name", None).decode()
+            is_backward = bool(get_pb_arg_vali(op, "is_backward", 0))
+            rename_op_input(predict_net, init_net, i, 0, name, from_producer=is_backward)
+            rename_op_output(predict_net, i, 0, name)
+
+    # Remove AliasWithName, should be very safe since it's a non-op
+    new_ops = []
+    for op in predict_net.op:
+        if op.type != "AliasWithName":
+            new_ops.append(op)
+        else:
+            # safety check
+            assert op.input == op.output
+            assert op.input[0] == op.arg[0].s.decode()
+    del predict_net.op[:]
+    predict_net.op.extend(new_ops)
+
+
+# ==== torch/utils_caffe2/graph_transform.py ===================================
+
+
+class IllegalGraphTransformError(ValueError):
+    """ When a graph transform function call can't be executed. """
+
+
+def _rename_versioned_blob_in_proto(
+    proto: caffe2_pb2.NetDef,
+    old_name: str,
+    new_name: str,
+    version: int,
+    ssa: List[Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]],
+    start_versions: Dict[str, int],
+    end_versions: Dict[str, int],
+):
+    """ In given proto, rename all blobs with matched version """
+    # Operater list
+    for op, i_th_ssa in zip(proto.op, ssa):
+        versioned_inputs, versioned_outputs = i_th_ssa
+        for i in range(len(op.input)):
+            if versioned_inputs[i] == (old_name, version):
+                op.input[i] = new_name
+        for i in range(len(op.output)):
+            if versioned_outputs[i] == (old_name, version):
+                op.output[i] = new_name
+    # external_input
+    if start_versions.get(old_name, 0) == version:
+        for i in range(len(proto.external_input)):
+            if proto.external_input[i] == old_name:
+                proto.external_input[i] = new_name
+    # external_output
+    if end_versions.get(old_name, 0) == version:
+        for i in range(len(proto.external_output)):
+            if proto.external_output[i] == old_name:
+                proto.external_output[i] = new_name
+
+
+def rename_op_input(
+    predict_net: caffe2_pb2.NetDef,
+    init_net: caffe2_pb2.NetDef,
+    op_id: int,
+    input_id: int,
+    new_name: str,
+    from_producer: bool = False,
+):
+    """
+    Rename the op_id-th operator in predict_net, change it's input_id-th input's
+        name to the new_name. It also does automatic re-route and change
+        external_input and init_net if necessary.
+    - It requires the input is only consumed by this op.
+    - This function modifies predict_net and init_net in-place.
+    - When from_producer is enable, this also updates other operators that consumes
+        the same input. Be cautious because may trigger unintended behavior.
+    """
+    assert isinstance(predict_net, caffe2_pb2.NetDef)
+    assert isinstance(init_net, caffe2_pb2.NetDef)
+
+    init_net_ssa, init_net_versions = core.get_ssa(init_net)
+    predict_net_ssa, predict_net_versions = core.get_ssa(
+        predict_net, copy.deepcopy(init_net_versions)
+    )
+
+    versioned_inputs, versioned_outputs = predict_net_ssa[op_id]
+    old_name, version = versioned_inputs[input_id]
+
+    if from_producer:
+        producer_map = get_producer_map(predict_net_ssa)
+        if not (old_name, version) in producer_map:
+            raise NotImplementedError(
+                "Can't find producer, the input {} is probably from"
+                " init_net, this is not supported yet.".format(old_name)
+            )
+        producer = producer_map[(old_name, version)]
+        rename_op_output(predict_net, producer[0], producer[1], new_name)
+        return
+
+    def contain_targets(op_ssa):
+        return (old_name, version) in op_ssa[0]
+
+    is_consumer = [contain_targets(op_ssa) for op_ssa in predict_net_ssa]
+    if sum(is_consumer) > 1:
+        raise IllegalGraphTransformError(
+            (
+                "Input '{}' of operator(#{}) are consumed by other ops, please use"
+                + " rename_op_output on the producer instead. Offending op: \n{}"
+            ).format(old_name, op_id, predict_net.op[op_id])
+        )
+
+    # update init_net
+    _rename_versioned_blob_in_proto(
+        init_net, old_name, new_name, version, init_net_ssa, {}, init_net_versions
+    )
+    # update predict_net
+    _rename_versioned_blob_in_proto(
+        predict_net,
+        old_name,
+        new_name,
+        version,
+        predict_net_ssa,
+        init_net_versions,
+        predict_net_versions,
+    )
+
+
+def rename_op_output(predict_net: caffe2_pb2.NetDef, op_id: int, output_id: int, new_name: str):
+    """
+    Rename the op_id-th operator in predict_net, change it's output_id-th input's
+        name to the new_name. It also does automatic re-route and change
+        external_output and if necessary.
+    - It allows multiple consumers of its output.
+    - This function modifies predict_net in-place, doesn't need init_net.
+    """
+    assert isinstance(predict_net, caffe2_pb2.NetDef)
+
+    ssa, blob_versions = core.get_ssa(predict_net)
+
+    versioned_inputs, versioned_outputs = ssa[op_id]
+    old_name, version = versioned_outputs[output_id]
+
+    # update predict_net
+    _rename_versioned_blob_in_proto(
+        predict_net, old_name, new_name, version, ssa, {}, blob_versions
+    )
+
+
+def get_sub_graph_external_input_output(
+    predict_net: caffe2_pb2.NetDef, sub_graph_op_indices: List[int]
+) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
+    """
+    Return the list of external input/output of sub-graph,
+    each element is tuple of the name and corresponding version in predict_net.
+
+    external input/output is defined the same way as caffe2 NetDef.
+    """
+    ssa, versions = core.get_ssa(predict_net)
+
+    all_inputs = []
+    all_outputs = []
+    for op_id in sub_graph_op_indices:
+        all_inputs += [inp for inp in ssa[op_id][0] if inp not in all_inputs]
+        all_outputs += list(ssa[op_id][1])  # ssa output won't repeat
+
+    # for versioned blobs, external inputs are just those blob in all_inputs
+    # but not in all_outputs
+    ext_inputs = [inp for inp in all_inputs if inp not in all_outputs]
+
+    # external outputs are essentially outputs of this subgraph that are used
+    # outside of this sub-graph (including predict_net.external_output)
+    all_other_inputs = sum(
+        (ssa[i][0] for i in range(len(ssa)) if i not in sub_graph_op_indices),
+        [(outp, versions[outp]) for outp in predict_net.external_output],
+    )
+    ext_outputs = [outp for outp in all_outputs if outp in set(all_other_inputs)]
+
+    return ext_inputs, ext_outputs
+
+
+class DiGraph:
+    """ A DAG representation of caffe2 graph, each vertice is a versioned blob. """
+
+    def __init__(self):
+        self.vertices = set()
+        self.graph = collections.defaultdict(list)
+
+    def add_edge(self, u, v):
+        self.graph[u].append(v)
+        self.vertices.add(u)
+        self.vertices.add(v)
+
+    # grab from https://www.geeksforgeeks.org/find-paths-given-source-destination/
+    def get_all_paths(self, s, d):
+        visited = {k: False for k in self.vertices}
+        path = []
+        all_paths = []
+
+        def _get_all_paths_util(graph, u, d, visited, path):
+            visited[u] = True
+            path.append(u)
+            if u == d:
+                all_paths.append(copy.deepcopy(path))
+            else:
+                for i in graph[u]:
+                    if not visited[i]:
+                        _get_all_paths_util(graph, i, d, visited, path)
+            path.pop()
+            visited[u] = False
+
+        _get_all_paths_util(self.graph, s, d, visited, path)
+        return all_paths
+
+    @staticmethod
+    def from_ssa(ssa):
+        graph = DiGraph()
+        for op_id in range(len(ssa)):
+            for inp in ssa[op_id][0]:
+                for outp in ssa[op_id][1]:
+                    graph.add_edge(inp, outp)
+        return graph
+
+
+def _get_dependency_chain(ssa, versioned_target, versioned_source):
+    """
+    Return the index list of relevant operator to produce target blob from source blob,
+        if there's no dependency, return empty list.
+    """
+
+    # finding all paths between nodes can be O(N!), thus we can only search
+    # in the subgraph using the op starting from the first consumer of source blob
+    # to the producer of the target blob.
+    consumer_map = get_consumer_map(ssa)
+    producer_map = get_producer_map(ssa)
+    start_op = min(x[0] for x in consumer_map[versioned_source]) - 15
+    end_op = (
+        producer_map[versioned_target][0] + 15 if versioned_target in producer_map else start_op
+    )
+    sub_graph_ssa = ssa[start_op : end_op + 1]
+    if len(sub_graph_ssa) > 30:
+        logger.warning(
+            "Subgraph bebetween {} and {} is large (from op#{} to op#{}), it"
+            " might take non-trival time to find all paths between them.".format(
+                versioned_source, versioned_target, start_op, end_op
+            )
+        )
+
+    dag = DiGraph.from_ssa(sub_graph_ssa)
+    paths = dag.get_all_paths(versioned_source, versioned_target)  # include two ends
+    ops_in_paths = [[producer_map[blob][0] for blob in path[1:]] for path in paths]
+    return sorted(set().union(*[set(ops) for ops in ops_in_paths]))
+
+
+def identify_reshape_sub_graph(predict_net: caffe2_pb2.NetDef) -> List[List[int]]:
+    """
+    Idenfity the reshape sub-graph in a protobuf.
+    The reshape sub-graph is defined as matching the following pattern:
+
+    (input_blob) -> Op_1 -> ... -> Op_N -> (new_shape) -─┐
+        └-------------------------------------------> Reshape -> (output_blob)
+
+    Return:
+        List of sub-graphs, each sub-graph is represented as a list of indices
+        of the relavent ops, [Op_1, Op_2, ..., Op_N, Reshape]
+    """
+
+    ssa, _ = core.get_ssa(predict_net)
+
+    ret = []
+    for i, op in enumerate(predict_net.op):
+        if op.type == "Reshape":
+            assert len(op.input) == 2
+            input_ssa = ssa[i][0]
+            data_source = input_ssa[0]
+            shape_source = input_ssa[1]
+            op_indices = _get_dependency_chain(ssa, shape_source, data_source)
+            ret.append(op_indices + [i])
+    return ret
+
+
+def remove_reshape_for_fc(predict_net, params):
+    """
+    In PyTorch nn.Linear has to take 2D tensor, this often leads to reshape
+        a 4D tensor to 2D by calling .view(). However this (dynamic) reshaping
+        doesn't work well with ONNX and Int8 tools, and cause using extra
+        ops (eg. ExpandDims) that might not be available on mobile.
+    Luckily Caffe2 supports 4D tensor for FC, so we can remove those reshape
+        after exporting ONNX model.
+    """
+    from caffe2.python import core
+
+    # find all reshape sub-graph that can be removed, which is now all Reshape
+    # sub-graph whose output is only consumed by FC.
+    # TODO: to make it safer, we may need the actually value to better determine
+    # if a Reshape before FC is removable.
+    reshape_sub_graphs = identify_reshape_sub_graph(predict_net)
+    sub_graphs_to_remove = []
+    for reshape_sub_graph in reshape_sub_graphs:
+        reshape_op_id = reshape_sub_graph[-1]
+        assert predict_net.op[reshape_op_id].type == "Reshape"
+        ssa, _ = core.get_ssa(predict_net)
+        reshape_output = ssa[reshape_op_id][1][0]
+        consumers = [i for i in range(len(ssa)) if reshape_output in ssa[i][0]]
+        if all(predict_net.op[consumer].type == "FC" for consumer in consumers):
+            # safety check if the sub-graph is isolated, for this reshape sub-graph,
+            # it means it has one non-param external input and one external output.
+            ext_inputs, ext_outputs = get_sub_graph_external_input_output(
+                predict_net, reshape_sub_graph
+            )
+            non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
+            if len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1:
+                sub_graphs_to_remove.append(reshape_sub_graph)
+
+    # perform removing subgraph by:
+    # 1: rename the Reshape's output to its input, then the graph can be
+    #   seen as in-place itentify, meaning whose external input/output are the same.
+    # 2: simply remove those ops.
+    remove_op_ids = []
+    params_to_remove = []
+    for sub_graph in sub_graphs_to_remove:
+        logger.info(
+            "Remove Reshape sub-graph:\n{}".format(
+                "".join(["(#{:>4})\n{}".format(i, predict_net.op[i]) for i in sub_graph])
+            )
+        )
+        reshape_op_id = sub_graph[-1]
+        new_reshap_output = predict_net.op[reshape_op_id].input[0]
+        rename_op_output(predict_net, reshape_op_id, 0, new_reshap_output)
+        ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, sub_graph)
+        non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
+        params_ext_inputs = [inp for inp in ext_inputs if inp[1] == 0]
+        assert len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1
+        assert ext_outputs[0][0] == non_params_ext_inputs[0][0]
+        assert ext_outputs[0][1] == non_params_ext_inputs[0][1] + 1
+        remove_op_ids.extend(sub_graph)
+        params_to_remove.extend(params_ext_inputs)
+
+    predict_net = copy.deepcopy(predict_net)
+    new_ops = [op for i, op in enumerate(predict_net.op) if i not in remove_op_ids]
+    del predict_net.op[:]
+    predict_net.op.extend(new_ops)
+    for versioned_params in params_to_remove:
+        name = versioned_params[0]
+        logger.info("Remove params: {} from init_net and predict_net.external_input".format(name))
+        del params[name]
+        predict_net.external_input.remove(name)
+
+    return predict_net, params
+
+
+def fuse_copy_between_cpu_and_gpu(predict_net: caffe2_pb2.NetDef):
+    """
+    In-place fuse extra copy ops between cpu/gpu for the following case:
+        a -CopyAToB-> b -CopyBToA> c1 -NextOp1-> d1
+                        -CopyBToA> c2 -NextOp2-> d2
+    The fused network will look like:
+        a -NextOp1-> d1
+          -NextOp2-> d2
+    """
+
+    _COPY_OPS = ["CopyCPUToGPU", "CopyGPUToCPU"]
+
+    def _fuse_once(predict_net):
+        ssa, blob_versions = core.get_ssa(predict_net)
+        consumer_map = get_consumer_map(ssa)
+        versioned_external_output = [
+            (name, blob_versions[name]) for name in predict_net.external_output
+        ]
+
+        for op_id, op in enumerate(predict_net.op):
+            if op.type in _COPY_OPS:
+                fw_copy_versioned_output = ssa[op_id][1][0]
+                consumer_ids = [x[0] for x in consumer_map[fw_copy_versioned_output]]
+                reverse_op_type = _COPY_OPS[1 - _COPY_OPS.index(op.type)]
+
+                is_fusable = (
+                    len(consumer_ids) > 0
+                    and fw_copy_versioned_output not in versioned_external_output
+                    and all(
+                        predict_net.op[_op_id].type == reverse_op_type
+                        and ssa[_op_id][1][0] not in versioned_external_output
+                        for _op_id in consumer_ids
+                    )
+                )
+
+                if is_fusable:
+                    for rv_copy_op_id in consumer_ids:
+                        # making each NextOp uses "a" directly and removing Copy ops
+                        rs_copy_versioned_output = ssa[rv_copy_op_id][1][0]
+                        next_op_id, inp_id = consumer_map[rs_copy_versioned_output][0]
+                        predict_net.op[next_op_id].input[inp_id] = op.input[0]
+                    # remove CopyOps
+                    new_ops = [
+                        op
+                        for i, op in enumerate(predict_net.op)
+                        if i != op_id and i not in consumer_ids
+                    ]
+                    del predict_net.op[:]
+                    predict_net.op.extend(new_ops)
+                    return True
+
+        return False
+
+    # _fuse_once returns False is nothing can be fused
+    while _fuse_once(predict_net):
+        pass
+
+
+def remove_dead_end_ops(net_def: caffe2_pb2.NetDef):
+    """ remove ops if its output is not used or not in external_output """
+    ssa, versions = core.get_ssa(net_def)
+    versioned_external_output = [(name, versions[name]) for name in net_def.external_output]
+    consumer_map = get_consumer_map(ssa)
+    removed_op_ids = set()
+
+    def _is_dead_end(versioned_blob):
+        return not (
+            versioned_blob in versioned_external_output
+            or (
+                len(consumer_map[versioned_blob]) > 0
+                and all(x[0] not in removed_op_ids for x in consumer_map[versioned_blob])
+            )
+        )
+
+    for i, ssa_i in reversed(list(enumerate(ssa))):
+        versioned_outputs = ssa_i[1]
+        if all(_is_dead_end(outp) for outp in versioned_outputs):
+            removed_op_ids.add(i)
+
+    # simply removing those deadend ops should have no effect to external_output
+    new_ops = [op for i, op in enumerate(net_def.op) if i not in removed_op_ids]
+    del net_def.op[:]
+    net_def.op.extend(new_ops)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2753739a03659dff5bc5b87f8c8417056d319842
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm
+from .deform_conv import DeformConv, ModulatedDeformConv
+from .mask_ops import paste_masks_in_image
+from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated
+from .roi_align import ROIAlign, roi_align
+from .roi_align_rotated import ROIAlignRotated, roi_align_rotated
+from .shape_spec import ShapeSpec
+from .wrappers import BatchNorm2d, Conv2d, ConvTranspose2d, cat, interpolate, Linear
+from .blocks import CNNBlockBase
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/batch_norm.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/batch_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1339c6eaedfbc65c9604043234b738382d07fd40
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/batch_norm.py
@@ -0,0 +1,242 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import torch
+import torch.distributed as dist
+from torch import nn
+from torch.autograd.function import Function
+from torch.nn import functional as F
+
+from detectron2.utils import comm
+
+from .wrappers import BatchNorm2d
+
+TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2])
+
+
+class FrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    It contains non-trainable buffers called
+    "weight" and "bias", "running_mean", "running_var",
+    initialized to perform identity transformation.
+
+    The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
+    which are computed from the original four parameters of BN.
+    The affine transform `x * weight + bias` will perform the equivalent
+    computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
+    When loading a backbone model from Caffe2, "running_mean" and "running_var"
+    will be left unchanged as identity transformation.
+
+    Other pre-trained backbone models may contain all 4 parameters.
+
+    The forward is implemented by `F.batch_norm(..., training=False)`.
+    """
+
+    _version = 3
+
+    def __init__(self, num_features, eps=1e-5):
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.register_buffer("weight", torch.ones(num_features))
+        self.register_buffer("bias", torch.zeros(num_features))
+        self.register_buffer("running_mean", torch.zeros(num_features))
+        self.register_buffer("running_var", torch.ones(num_features) - eps)
+
+    def forward(self, x):
+        if x.requires_grad:
+            # When gradients are needed, F.batch_norm will use extra memory
+            # because its backward op computes gradients for weight/bias as well.
+            scale = self.weight * (self.running_var + self.eps).rsqrt()
+            bias = self.bias - self.running_mean * scale
+            scale = scale.reshape(1, -1, 1, 1)
+            bias = bias.reshape(1, -1, 1, 1)
+            return x * scale + bias
+        else:
+            # When gradients are not needed, F.batch_norm is a single fused op
+            # and provide more optimization opportunities.
+            return F.batch_norm(
+                x,
+                self.running_mean,
+                self.running_var,
+                self.weight,
+                self.bias,
+                training=False,
+                eps=self.eps,
+            )
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 2:
+            # No running_mean/var in early versions
+            # This will silent the warnings
+            if prefix + "running_mean" not in state_dict:
+                state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
+            if prefix + "running_var" not in state_dict:
+                state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
+
+        if version is not None and version < 3:
+            logger = logging.getLogger(__name__)
+            logger.info("FrozenBatchNorm {} is upgraded to version 3.".format(prefix.rstrip(".")))
+            # In version < 3, running_var are used without +eps.
+            state_dict[prefix + "running_var"] -= self.eps
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def __repr__(self):
+        return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
+
+    @classmethod
+    def convert_frozen_batchnorm(cls, module):
+        """
+        Convert BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
+
+        Args:
+            module (torch.nn.Module):
+
+        Returns:
+            If module is BatchNorm/SyncBatchNorm, returns a new module.
+            Otherwise, in-place convert module and return it.
+
+        Similar to convert_sync_batchnorm in
+        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
+        """
+        bn_module = nn.modules.batchnorm
+        bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
+        res = module
+        if isinstance(module, bn_module):
+            res = cls(module.num_features)
+            if module.affine:
+                res.weight.data = module.weight.data.clone().detach()
+                res.bias.data = module.bias.data.clone().detach()
+            res.running_mean.data = module.running_mean.data
+            res.running_var.data = module.running_var.data
+            res.eps = module.eps
+        else:
+            for name, child in module.named_children():
+                new_child = cls.convert_frozen_batchnorm(child)
+                if new_child is not child:
+                    res.add_module(name, new_child)
+        return res
+
+
+def get_norm(norm, out_channels):
+    """
+    Args:
+        norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
+            or a callable that takes a channel number and returns
+            the normalization layer as a nn.Module.
+
+    Returns:
+        nn.Module or None: the normalization layer
+    """
+    if isinstance(norm, str):
+        if len(norm) == 0:
+            return None
+        norm = {
+            "BN": BatchNorm2d,
+            # Fixed in https://github.com/pytorch/pytorch/pull/36382
+            "SyncBN": NaiveSyncBatchNorm if TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm,
+            "FrozenBN": FrozenBatchNorm2d,
+            "GN": lambda channels: nn.GroupNorm(32, channels),
+            # for debugging:
+            "nnSyncBN": nn.SyncBatchNorm,
+            "naiveSyncBN": NaiveSyncBatchNorm,
+        }[norm]
+    return norm(out_channels)
+
+
+class AllReduce(Function):
+    @staticmethod
+    def forward(ctx, input):
+        input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())]
+        # Use allgather instead of allreduce since I don't trust in-place operations ..
+        dist.all_gather(input_list, input, async_op=False)
+        inputs = torch.stack(input_list, dim=0)
+        return torch.sum(inputs, dim=0)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        dist.all_reduce(grad_output, async_op=False)
+        return grad_output
+
+
+class NaiveSyncBatchNorm(BatchNorm2d):
+    """
+    In PyTorch<=1.5, `nn.SyncBatchNorm` has incorrect gradient
+    when the batch size on each worker is different.
+    (e.g., when scale augmentation is used, or when it is applied to mask head).
+
+    This is a slower but correct alternative to `nn.SyncBatchNorm`.
+
+    Note:
+        There isn't a single definition of Sync BatchNorm.
+
+        When ``stats_mode==""``, this module computes overall statistics by using
+        statistics of each worker with equal weight.  The result is true statistics
+        of all samples (as if they are all on one worker) only when all workers
+        have the same (N, H, W). This mode does not support inputs with zero batch size.
+
+        When ``stats_mode=="N"``, this module computes overall statistics by weighting
+        the statistics of each worker by their ``N``. The result is true statistics
+        of all samples (as if they are all on one worker) only when all workers
+        have the same (H, W). It is slower than ``stats_mode==""``.
+
+        Even though the result of this module may not be the true statistics of all samples,
+        it may still be reasonable because it might be preferrable to assign equal weights
+        to all workers, regardless of their (H, W) dimension, instead of putting larger weight
+        on larger images. From preliminary experiments, little difference is found between such
+        a simplified implementation and an accurate computation of overall mean & variance.
+    """
+
+    def __init__(self, *args, stats_mode="", **kwargs):
+        super().__init__(*args, **kwargs)
+        assert stats_mode in ["", "N"]
+        self._stats_mode = stats_mode
+
+    def forward(self, input):
+        if comm.get_world_size() == 1 or not self.training:
+            return super().forward(input)
+
+        B, C = input.shape[0], input.shape[1]
+
+        mean = torch.mean(input, dim=[0, 2, 3])
+        meansqr = torch.mean(input * input, dim=[0, 2, 3])
+
+        if self._stats_mode == "":
+            assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
+            vec = torch.cat([mean, meansqr], dim=0)
+            vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
+            mean, meansqr = torch.split(vec, C)
+            momentum = self.momentum
+        else:
+            if B == 0:
+                vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
+                vec = vec + input.sum()  # make sure there is gradient w.r.t input
+            else:
+                vec = torch.cat(
+                    [mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype)], dim=0
+                )
+            vec = AllReduce.apply(vec * B)
+
+            total_batch = vec[-1].detach()
+            momentum = total_batch.clamp(max=1) * self.momentum  # no update if total_batch is 0
+            total_batch = torch.max(total_batch, torch.ones_like(total_batch))  # avoid div-by-zero
+            mean, meansqr, _ = torch.split(vec / total_batch, C)
+
+        var = meansqr - mean * mean
+        invstd = torch.rsqrt(var + self.eps)
+        scale = self.weight * invstd
+        bias = self.bias - mean * scale
+        scale = scale.reshape(1, -1, 1, 1)
+        bias = bias.reshape(1, -1, 1, 1)
+
+        self.running_mean += momentum * (mean.detach() - self.running_mean)
+        self.running_var += momentum * (var.detach() - self.running_var)
+        return input * scale + bias
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/blocks.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d06fec22e472febbc960c49f747acddd2ab7208
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/blocks.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from torch import nn
+
+from .batch_norm import FrozenBatchNorm2d
+
+
+class CNNBlockBase(nn.Module):
+    """
+    A CNN block is assumed to have input channels, output channels and a stride.
+    The input and output of `forward()` method must be NCHW tensors.
+    The method can perform arbitrary computation but must match the given
+    channels and stride specification.
+
+    Attribute:
+        in_channels (int):
+        out_channels (int):
+        stride (int):
+    """
+
+    def __init__(self, in_channels, out_channels, stride):
+        """
+        The `__init__` method of any subclass should also contain these arguments.
+
+        Args:
+            in_channels (int):
+            out_channels (int):
+            stride (int):
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = stride
+
+    def freeze(self):
+        """
+        Make this block not trainable.
+        This method sets all parameters to `requires_grad=False`,
+        and convert all BatchNorm layers to FrozenBatchNorm
+
+        Returns:
+            the block itself
+        """
+        for p in self.parameters():
+            p.requires_grad = False
+        FrozenBatchNorm2d.convert_frozen_batchnorm(self)
+        return self
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/README.md b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..778ed3da0bae89820831bcd8a72ff7b9cad8d4dd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/README.md
@@ -0,0 +1,7 @@
+
+
+To add a new Op:
+
+1. Create a new directory
+2. Implement new ops there
+3. Delcare its Python interface in `vision.cpp`.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign.h b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d95eac6e29d5e5624afbc6c545776d78ebc709c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign.h
@@ -0,0 +1,130 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor ROIAlign_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned);
+
+at::Tensor ROIAlign_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned);
+
+#ifdef WITH_CUDA
+at::Tensor ROIAlign_forward_cuda(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned);
+
+at::Tensor ROIAlign_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned);
+#endif
+
+// Interface for Python
+inline at::Tensor ROIAlign_forward(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned) {
+  if (input.is_cuda()) {
+#ifdef WITH_CUDA
+    return ROIAlign_forward_cuda(
+        input,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        aligned);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return ROIAlign_forward_cpu(
+      input,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      sampling_ratio,
+      aligned);
+}
+
+inline at::Tensor ROIAlign_backward(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned) {
+  if (grad.is_cuda()) {
+#ifdef WITH_CUDA
+    return ROIAlign_backward_cuda(
+        grad,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        batch_size,
+        channels,
+        height,
+        width,
+        sampling_ratio,
+        aligned);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return ROIAlign_backward_cpu(
+      grad,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      batch_size,
+      channels,
+      height,
+      width,
+      sampling_ratio,
+      aligned);
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..52fc83f8140b29de7b2ad3cb490b8cb672959e16
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp
@@ -0,0 +1,508 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/TensorUtils.h>
+#include "ROIAlign.h"
+
+namespace {
+
+// implementation taken from Caffe2
+template <typename T>
+struct PreCalc {
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
+  T w1;
+  T w2;
+  T w3;
+  T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int iy_upper,
+    const int ix_upper,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          T x = xx;
+          T y = yy;
+          // deal with: inverse elements are out of feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = (int)y;
+          int x_low = (int)x;
+          int y_high;
+          int x_high;
+
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indices
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void ROIAlignForward(
+    const int nthreads,
+    const T* input,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* rois,
+    T* output,
+    bool aligned) {
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+  // (n, c, ph, pw) is an element in the pooled output
+  // can be parallelized using omp
+  // #pragma omp parallel for num_threads(32)
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T offset = aligned ? (T)0.5 : (T)0.0;
+    T roi_start_w = offset_rois[1] * spatial_scale - offset;
+    T roi_start_h = offset_rois[2] * spatial_scale - offset;
+    T roi_end_w = offset_rois[3] * spatial_scale - offset;
+    T roi_end_h = offset_rois[4] * spatial_scale - offset;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (aligned) {
+      AT_ASSERTM(
+          roi_width >= 0 && roi_height >= 0,
+          "ROIs in ROIAlign cannot have non-negative size!");
+    } else { // for backward-compatibility only
+      roi_width = std::max(roi_width, (T)1.);
+      roi_height = std::max(roi_height, (T)1.);
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    // When the grid is empty, output zeros == 0/1, instead of NaN.
+    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<PreCalc<T>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+    pre_calc_for_bilinear_interpolate(
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_start_h,
+        roi_start_w,
+        bin_size_h,
+        bin_size_w,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        pre_calc);
+
+    for (int c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_input =
+          input + (roi_batch_ind * channels + c) * height * width;
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          T output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_input[pc.pos1] +
+                  pc.w2 * offset_input[pc.pos2] +
+                  pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
+
+              pre_calc_index += 1;
+            }
+          }
+          output_val /= count;
+
+          output[index] = output_val;
+        } // for pw
+      } // for ph
+    } // for c
+  } // for n
+}
+
+template <typename T>
+void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high,
+    const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y <= 0)
+    y = 0;
+  if (x <= 0)
+    x = 0;
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = input[y_low * width + x_low];
+  // T v2 = input[y_low * width + x_high];
+  // T v3 = input[y_high * width + x_low];
+  // T v4 = input[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+template <class T>
+inline void add(T* address, const T& val) {
+  *address += val;
+}
+
+template <typename T>
+void ROIAlignBackward(
+    const int nthreads,
+    // may not be contiguous, and should be indexed using n_stride, etc
+    const T* grad_output,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* grad_input,
+    const T* rois,
+    const int n_stride,
+    const int c_stride,
+    const int h_stride,
+    const int w_stride,
+    bool aligned) {
+  for (int index = 0; index < nthreads; index++) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T offset = aligned ? (T)0.5 : (T)0.0;
+    T roi_start_w = offset_rois[1] * spatial_scale - offset;
+    T roi_start_h = offset_rois[2] * spatial_scale - offset;
+    T roi_end_w = offset_rois[3] * spatial_scale - offset;
+    T roi_end_h = offset_rois[4] * spatial_scale - offset;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (aligned) {
+      AT_ASSERTM(
+          roi_width >= 0 && roi_height >= 0,
+          "ROIs in ROIAlign do not have non-negative size!");
+    } else { // for backward-compatibility only
+      roi_width = std::max(roi_width, (T)1.);
+      roi_height = std::max(roi_height, (T)1.);
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_grad_input =
+        grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+    int output_offset = n * n_stride + c * c_stride;
+    const T* offset_grad_output = grad_output + output_offset;
+    const T grad_output_this_bin =
+        offset_grad_output[ph * h_stride + pw * w_stride];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height,
+            width,
+            y,
+            x,
+            w1,
+            w2,
+            w3,
+            w4,
+            x_low,
+            x_high,
+            y_low,
+            y_high,
+            index);
+
+        T g1 = grad_output_this_bin * w1 / count;
+        T g2 = grad_output_this_bin * w2 / count;
+        T g3 = grad_output_this_bin * w3 / count;
+        T g4 = grad_output_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          // atomic add is not needed for now since it is single threaded
+          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
+          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
+          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
+          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // for
+} // ROIAlignBackward
+
+} // namespace
+
+namespace detectron2 {
+
+at::Tensor ROIAlign_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned) {
+  AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_forward_cpu";
+  at::checkAllSameType(c, {input_t, rois_t});
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  at::Tensor output = at::zeros(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+  if (output.numel() == 0)
+    return output;
+
+  auto input_ = input.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      input.scalar_type(), "ROIAlign_forward", [&] {
+        ROIAlignForward<scalar_t>(
+            output_size,
+            input_.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            rois_.data_ptr<scalar_t>(),
+            output.data_ptr<scalar_t>(),
+            aligned);
+      });
+  return output;
+}
+
+at::Tensor ROIAlign_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned) {
+  AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_backward_cpu";
+  at::checkAllSameType(c, {grad_t, rois_t});
+
+  at::Tensor grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    return grad_input;
+  }
+
+  // get stride values to ensure indexing into gradients is correct.
+  int n_stride = grad.stride(0);
+  int c_stride = grad.stride(1);
+  int h_stride = grad.stride(2);
+  int w_stride = grad.stride(3);
+
+  auto rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      grad.scalar_type(), "ROIAlign_forward", [&] {
+        ROIAlignBackward<scalar_t>(
+            grad.numel(),
+            grad.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            grad_input.data_ptr<scalar_t>(),
+            rois_.data_ptr<scalar_t>(),
+            n_stride,
+            c_stride,
+            h_stride,
+            w_stride,
+            aligned);
+      });
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2e05953b03089203d29bc304726afbca7ee5d464
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu
@@ -0,0 +1,430 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__device__ T bilinear_interpolate(
+    const T* bottom_data,
+    const int height,
+    const int width,
+    T y,
+    T x,
+    const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    return 0;
+  }
+
+  if (y <= 0)
+    y = 0;
+  if (x <= 0)
+    x = 0;
+
+  int y_low = (int)y;
+  int x_low = (int)x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  // do bilinear interpolation
+  T v1 = bottom_data[y_low * width + x_low];
+  T v2 = bottom_data[y_low * width + x_high];
+  T v3 = bottom_data[y_high * width + x_low];
+  T v4 = bottom_data[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  return val;
+}
+
+template <typename T>
+__global__ void RoIAlignForward(
+    const int nthreads,
+    const T* bottom_data,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* bottom_rois,
+    T* top_data,
+    bool aligned) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T offset = aligned ? (T)0.5 : (T)0.0;
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (!aligned) { // for backward-compatibility only
+      roi_width = max(roi_width, (T)1.);
+      roi_height = max(roi_height, (T)1.);
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_bottom_data =
+        bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    // When the grid is empty, output zeros == 0/1, instead of NaN.
+    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T val = bilinear_interpolate(
+            offset_bottom_data, height, width, y, x, index);
+        output_val += val;
+      }
+    }
+    output_val /= count;
+
+    top_data[index] = output_val;
+  }
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high,
+    const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y <= 0)
+    y = 0;
+  if (x <= 0)
+    x = 0;
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = bottom_data[y_low * width + x_low];
+  // T v2 = bottom_data[y_low * width + x_high];
+  // T v3 = bottom_data[y_high * width + x_low];
+  // T v4 = bottom_data[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+template <typename T>
+__global__ void RoIAlignBackwardFeature(
+    const int nthreads,
+    const T* top_diff,
+    const int num_rois,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* bottom_diff,
+    const T* bottom_rois,
+    bool aligned) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T offset = aligned ? (T)0.5 : (T)0.0;
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (!aligned) { // for backward-compatibility only
+      roi_width = max(roi_width, (T)1.);
+      roi_height = max(roi_height, (T)1.);
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height,
+            width,
+            y,
+            x,
+            w1,
+            w2,
+            w3,
+            w4,
+            x_low,
+            x_high,
+            y_low,
+            y_high,
+            index);
+
+        T g1 = top_diff_this_bin * w1 / count;
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignBackward
+
+namespace detectron2 {
+
+at::Tensor ROIAlign_forward_cuda(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned) {
+  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_forward_cuda";
+  at::checkAllSameGPU(c, {input_t, rois_t});
+  at::checkAllSameType(c, {input_t, rois_t});
+  at::cuda::CUDAGuard device_guard(input.device());
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  auto output = at::empty(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(
+          static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
+      static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  if (output.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return output;
+  }
+
+  auto input_ = input.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+    RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
+        output_size,
+        input_.data_ptr<scalar_t>(),
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        rois_.data_ptr<scalar_t>(),
+        output.data_ptr<scalar_t>(),
+        aligned);
+  });
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+  return output;
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+at::Tensor ROIAlign_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned) {
+  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+  at::CheckedFrom c = "ROIAlign_backward_cuda";
+  at::checkAllSameGPU(c, {grad_t, rois_t});
+  at::checkAllSameType(c, {grad_t, rois_t});
+  at::cuda::CUDAGuard device_guard(grad.device());
+
+  auto num_rois = rois.size(0);
+  auto grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(
+          static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
+      static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_input;
+  }
+
+  auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
+    RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+        grad.numel(),
+        grad_.data_ptr<scalar_t>(),
+        num_rois,
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        grad_input.data_ptr<scalar_t>(),
+        rois_.data_ptr<scalar_t>(),
+        aligned);
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
new file mode 100644
index 0000000000000000000000000000000000000000..a99c8ebddaa4936e26437b42d62e2b8355c655aa
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
@@ -0,0 +1,115 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor ROIAlignRotated_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio);
+
+at::Tensor ROIAlignRotated_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio);
+
+#ifdef WITH_CUDA
+at::Tensor ROIAlignRotated_forward_cuda(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio);
+
+at::Tensor ROIAlignRotated_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio);
+#endif
+
+// Interface for Python
+inline at::Tensor ROIAlignRotated_forward(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio) {
+  if (input.is_cuda()) {
+#ifdef WITH_CUDA
+    return ROIAlignRotated_forward_cuda(
+        input,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        sampling_ratio);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return ROIAlignRotated_forward_cpu(
+      input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+}
+
+inline at::Tensor ROIAlignRotated_backward(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio) {
+  if (grad.is_cuda()) {
+#ifdef WITH_CUDA
+    return ROIAlignRotated_backward_cuda(
+        grad,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        batch_size,
+        channels,
+        height,
+        width,
+        sampling_ratio);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return ROIAlignRotated_backward_cpu(
+      grad,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      batch_size,
+      channels,
+      height,
+      width,
+      sampling_ratio);
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7e5e1ffdccd0e2ced15fa34b4906388d371bffe2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
@@ -0,0 +1,522 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/TensorUtils.h>
+#include "ROIAlignRotated.h"
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those ones is
+// we don't do "legacy offset" in this version, as there aren't many previous
+// works, if any, using the "legacy" ROIAlignRotated Op.
+// This would make the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+template <typename T>
+struct PreCalc {
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
+  T w1;
+  T w2;
+  T w3;
+  T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int iy_upper,
+    const int ix_upper,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    T roi_center_h,
+    T roi_center_w,
+    T cos_theta,
+    T sin_theta,
+    std::vector<PreCalc<T>>& pre_calc) {
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          // Rotate by theta around the center and translate
+          // In image space, (y, x) is the order for Right Handed System,
+          // and this is essentially multiplying the point by a rotation matrix
+          // to rotate it counterclockwise through angle theta.
+          T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+          T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+          // deal with: inverse elements are out of feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y < 0) {
+            y = 0;
+          }
+          if (x < 0) {
+            x = 0;
+          }
+
+          int y_low = (int)y;
+          int x_low = (int)x;
+          int y_high;
+          int x_high;
+
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indices
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y < 0) {
+    y = 0;
+  }
+
+  if (x < 0) {
+    x = 0;
+  }
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = input[y_low * width + x_low];
+  // T v2 = input[y_low * width + x_high];
+  // T v3 = input[y_high * width + x_low];
+  // T v4 = input[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+template <class T>
+inline void add(T* address, const T& val) {
+  *address += val;
+}
+
+} // namespace
+
+template <typename T>
+void ROIAlignRotatedForward(
+    const int nthreads,
+    const T* input,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* rois,
+    T* output) {
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+  // (n, c, ph, pw) is an element in the pooled output
+  // can be parallelized using omp
+  // #pragma omp parallel for num_threads(32)
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    AT_ASSERTM(
+        roi_width >= 0 && roi_height >= 0,
+        "ROIs in ROIAlignRotated do not have non-negative size!");
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<PreCalc<T>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    pre_calc_for_bilinear_interpolate(
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_start_h,
+        roi_start_w,
+        bin_size_h,
+        bin_size_w,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_center_h,
+        roi_center_w,
+        cos_theta,
+        sin_theta,
+        pre_calc);
+
+    for (int c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_input =
+          input + (roi_batch_ind * channels + c) * height * width;
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          T output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_input[pc.pos1] +
+                  pc.w2 * offset_input[pc.pos2] +
+                  pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
+
+              pre_calc_index += 1;
+            }
+          }
+          output_val /= count;
+
+          output[index] = output_val;
+        } // for pw
+      } // for ph
+    } // for c
+  } // for n
+}
+
+template <typename T>
+void ROIAlignRotatedBackward(
+    const int nthreads,
+    // may not be contiguous. should index using n_stride, etc
+    const T* grad_output,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* grad_input,
+    const T* rois,
+    const int n_stride,
+    const int c_stride,
+    const int h_stride,
+    const int w_stride) {
+  for (int index = 0; index < nthreads; index++) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    AT_ASSERTM(
+        roi_width >= 0 && roi_height >= 0,
+        "ROIs in ROIAlignRotated do not have non-negative size!");
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_grad_input =
+        grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+    int output_offset = n * n_stride + c * c_stride;
+    const T* offset_grad_output = grad_output + output_offset;
+    const T grad_output_this_bin =
+        offset_grad_output[ph * h_stride + pw * w_stride];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        // Rotate by theta around the center and translate
+        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+        T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+        T g1 = grad_output_this_bin * w1 / count;
+        T g2 = grad_output_this_bin * w2 / count;
+        T g3 = grad_output_this_bin * w3 / count;
+        T g4 = grad_output_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          // atomic add is not needed for now since it is single threaded
+          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
+          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
+          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
+          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // for
+} // ROIAlignRotatedBackward
+
+at::Tensor ROIAlignRotated_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio) {
+  AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_forward_cpu";
+  at::checkAllSameType(c, {input_t, rois_t});
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  at::Tensor output = at::zeros(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+  if (output.numel() == 0) {
+    return output;
+  }
+
+  auto input_ = input.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      input.scalar_type(), "ROIAlignRotated_forward", [&] {
+        ROIAlignRotatedForward<scalar_t>(
+            output_size,
+            input_.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            rois_.data_ptr<scalar_t>(),
+            output.data_ptr<scalar_t>());
+      });
+  return output;
+}
+
+at::Tensor ROIAlignRotated_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio) {
+  AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlignRotated_backward_cpu";
+  at::checkAllSameType(c, {grad_t, rois_t});
+
+  at::Tensor grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    return grad_input;
+  }
+
+  // get stride values to ensure indexing into gradients is correct.
+  int n_stride = grad.stride(0);
+  int c_stride = grad.stride(1);
+  int h_stride = grad.stride(2);
+  int w_stride = grad.stride(3);
+
+  auto rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      grad.scalar_type(), "ROIAlignRotated_forward", [&] {
+        ROIAlignRotatedBackward<scalar_t>(
+            grad.numel(),
+            grad.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            grad_input.data_ptr<scalar_t>(),
+            rois_.data_ptr<scalar_t>(),
+            n_stride,
+            c_stride,
+            h_stride,
+            w_stride);
+      });
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9c376fc6973b75b34967faf870a9f85a3ee430be
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
@@ -0,0 +1,443 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those ones is
+// we don't do "legacy offset" in this version, as there aren't many previous
+// works, if any, using the "legacy" ROIAlignRotated Op.
+// This would make the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+
+template <typename T>
+__device__ T bilinear_interpolate(
+    const T* input,
+    const int height,
+    const int width,
+    T y,
+    T x) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    return 0;
+  }
+
+  if (y < 0) {
+    y = 0;
+  }
+
+  if (x < 0) {
+    x = 0;
+  }
+
+  int y_low = (int)y;
+  int x_low = (int)x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  // do bilinear interpolation
+  T v1 = input[y_low * width + x_low];
+  T v2 = input[y_low * width + x_high];
+  T v3 = input[y_high * width + x_low];
+  T v4 = input[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  return val;
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y < 0) {
+    y = 0;
+  }
+
+  if (x < 0) {
+    x = 0;
+  }
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = input[y_low * width + x_low];
+  // T v2 = input[y_low * width + x_high];
+  // T v3 = input[y_high * width + x_low];
+  // T v4 = input[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+} // namespace
+
+template <typename T>
+__global__ void RoIAlignRotatedForward(
+    const int nthreads,
+    const T* input,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* rois,
+    T* top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_input =
+        input + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // We do average (inte  gral) pooling inside a bin
+    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        // Rotate by theta around the center and translate
+        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+        T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+        T val = bilinear_interpolate(offset_input, height, width, y, x);
+        output_val += val;
+      }
+    }
+    output_val /= count;
+
+    top_data[index] = output_val;
+  }
+}
+
+template <typename T>
+__global__ void RoIAlignRotatedBackwardFeature(
+    const int nthreads,
+    const T* top_diff,
+    const int num_rois,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* bottom_diff,
+    const T* rois) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        // Rotate by theta around the center and translate
+        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+        T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+        T g1 = top_diff_this_bin * w1 / count;
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignRotatedBackward
+
+at::Tensor ROIAlignRotated_forward_cuda(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio) {
+  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
+  at::checkAllSameGPU(c, {input_t, rois_t});
+  at::checkAllSameType(c, {input_t, rois_t});
+  at::cuda::CUDAGuard device_guard(input.device());
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  auto output = at::empty(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(
+          static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
+      static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  if (output.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return output;
+  }
+
+  auto input_ = input.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "ROIAlignRotated_forward", [&] {
+        RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
+            output_size,
+            input_.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            rois_.data_ptr<scalar_t>(),
+            output.data_ptr<scalar_t>());
+      });
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+  return output;
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+at::Tensor ROIAlignRotated_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio) {
+  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+  at::CheckedFrom c = "ROIAlign_backward_cuda";
+  at::checkAllSameGPU(c, {grad_t, rois_t});
+  at::checkAllSameType(c, {grad_t, rois_t});
+  at::cuda::CUDAGuard device_guard(grad.device());
+
+  auto num_rois = rois.size(0);
+  auto grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(
+      at::cuda::ATenCeilDiv(
+          static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
+      static_cast<int64_t>(4096)));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_input;
+  }
+
+  auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_FLOATING_TYPES(
+      grad.scalar_type(), "ROIAlignRotated_backward", [&] {
+        RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+            grad.numel(),
+            grad_.data_ptr<scalar_t>(),
+            num_rois,
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            grad_input.data_ptr<scalar_t>(),
+            rois_.data_ptr<scalar_t>());
+      });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c389c6cbdbefdfb623296b0918c27c634d621bb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
@@ -0,0 +1,35 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor box_iou_rotated_cpu(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2);
+
+#ifdef WITH_CUDA
+at::Tensor box_iou_rotated_cuda(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2);
+#endif
+
+// Interface for Python
+// inline is needed to prevent multiple function definitions when this header is
+// included by different cpps
+inline at::Tensor box_iou_rotated(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2) {
+  assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
+  if (boxes1.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous());
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+
+  return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous());
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f2b02d171077d96fcaf29b585fa6a678af1f2842
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
@@ -0,0 +1,39 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include "box_iou_rotated.h"
+#include "box_iou_rotated_utils.h"
+
+namespace detectron2 {
+
+template <typename T>
+void box_iou_rotated_cpu_kernel(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2,
+    at::Tensor& ious) {
+  auto num_boxes1 = boxes1.size(0);
+  auto num_boxes2 = boxes2.size(0);
+
+  for (int i = 0; i < num_boxes1; i++) {
+    for (int j = 0; j < num_boxes2; j++) {
+      ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
+          boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>());
+    }
+  }
+}
+
+at::Tensor box_iou_rotated_cpu(
+    // input must be contiguous:
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2) {
+  auto num_boxes1 = boxes1.size(0);
+  auto num_boxes2 = boxes2.size(0);
+  at::Tensor ious =
+      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
+
+  box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious);
+
+  // reshape from 1d array to 2d array
+  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
+  return ious.reshape(shape);
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e3403c11796cb313771b8b6350c793b9fbdfbcaa
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
@@ -0,0 +1,130 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include "box_iou_rotated_utils.h"
+
+namespace detectron2 {
+
+// 2D block with 32 * 16 = 512 threads per block
+const int BLOCK_DIM_X = 32;
+const int BLOCK_DIM_Y = 16;
+
+template <typename T>
+__global__ void box_iou_rotated_cuda_kernel(
+    const int n_boxes1,
+    const int n_boxes2,
+    const T* dev_boxes1,
+    const T* dev_boxes2,
+    T* dev_ious) {
+  const int row_start = blockIdx.x * blockDim.x;
+  const int col_start = blockIdx.y * blockDim.y;
+
+  const int row_size = min(n_boxes1 - row_start, blockDim.x);
+  const int col_size = min(n_boxes2 - col_start, blockDim.y);
+
+  __shared__ float block_boxes1[BLOCK_DIM_X * 5];
+  __shared__ float block_boxes2[BLOCK_DIM_Y * 5];
+
+  // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
+  if (threadIdx.x < row_size && threadIdx.y == 0) {
+    block_boxes1[threadIdx.x * 5 + 0] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 0];
+    block_boxes1[threadIdx.x * 5 + 1] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 1];
+    block_boxes1[threadIdx.x * 5 + 2] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 2];
+    block_boxes1[threadIdx.x * 5 + 3] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 3];
+    block_boxes1[threadIdx.x * 5 + 4] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 4];
+  }
+
+  if (threadIdx.x < col_size && threadIdx.y == 0) {
+    block_boxes2[threadIdx.x * 5 + 0] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 0];
+    block_boxes2[threadIdx.x * 5 + 1] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 1];
+    block_boxes2[threadIdx.x * 5 + 2] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 2];
+    block_boxes2[threadIdx.x * 5 + 3] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 3];
+    block_boxes2[threadIdx.x * 5 + 4] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size && threadIdx.y < col_size) {
+    int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y;
+    dev_ious[offset] = single_box_iou_rotated<T>(
+        block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
+  }
+}
+
+at::Tensor box_iou_rotated_cuda(
+    // input must be contiguous
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2) {
+  using scalar_t = float;
+  AT_ASSERTM(
+      boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor");
+  AT_ASSERTM(
+      boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor");
+  AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
+  AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");
+  at::cuda::CUDAGuard device_guard(boxes1.device());
+
+  auto num_boxes1 = boxes1.size(0);
+  auto num_boxes2 = boxes2.size(0);
+
+  at::Tensor ious =
+      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
+
+  bool transpose = false;
+  if (num_boxes1 > 0 && num_boxes2 > 0) {
+    scalar_t *data1 = boxes1.data_ptr<scalar_t>(),
+             *data2 = boxes2.data_ptr<scalar_t>();
+
+    if (num_boxes2 > 65535 * BLOCK_DIM_Y) {
+      AT_ASSERTM(
+          num_boxes1 <= 65535 * BLOCK_DIM_Y,
+          "Too many boxes for box_iou_rotated_cuda!");
+      // x dim is allowed to be large, but y dim cannot,
+      // so we transpose the two to avoid "invalid configuration argument"
+      // error. We assume one of them is small. Otherwise the result is hard to
+      // fit in memory anyway.
+      std::swap(num_boxes1, num_boxes2);
+      std::swap(data1, data2);
+      transpose = true;
+    }
+
+    const int blocks_x =
+        at::cuda::ATenCeilDiv(static_cast<int>(num_boxes1), BLOCK_DIM_X);
+    const int blocks_y =
+        at::cuda::ATenCeilDiv(static_cast<int>(num_boxes2), BLOCK_DIM_Y);
+
+    dim3 blocks(blocks_x, blocks_y);
+    dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+    box_iou_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        num_boxes1,
+        num_boxes2,
+        data1,
+        data2,
+        (scalar_t*)ious.data_ptr<scalar_t>());
+
+    AT_CUDA_CHECK(cudaGetLastError());
+  }
+
+  // reshape from 1d array to 2d array
+  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
+  if (transpose) {
+    return ious.view(shape).t();
+  } else {
+    return ious.view(shape);
+  }
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8757ec376e8703e1edc5f76bf5ef214620bd69f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
@@ -0,0 +1,363 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+
+#include <cassert>
+#include <cmath>
+
+#ifdef __CUDACC__
+// Designates functions callable from the host (CPU) and the device (GPU)
+#define HOST_DEVICE __host__ __device__
+#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
+#else
+#include <algorithm>
+#define HOST_DEVICE
+#define HOST_DEVICE_INLINE HOST_DEVICE inline
+#endif
+
+namespace detectron2 {
+
+namespace {
+
+template <typename T>
+struct RotatedBox {
+  T x_ctr, y_ctr, w, h, a;
+};
+
+template <typename T>
+struct Point {
+  T x, y;
+  HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
+  HOST_DEVICE_INLINE Point operator+(const Point& p) const {
+    return Point(x + p.x, y + p.y);
+  }
+  HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
+    x += p.x;
+    y += p.y;
+    return *this;
+  }
+  HOST_DEVICE_INLINE Point operator-(const Point& p) const {
+    return Point(x - p.x, y - p.y);
+  }
+  HOST_DEVICE_INLINE Point operator*(const T coeff) const {
+    return Point(x * coeff, y * coeff);
+  }
+};
+
+template <typename T>
+HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
+  return A.x * B.x + A.y * B.y;
+}
+
+// R: result type. can be different from input type
+template <typename T, typename R = T>
+HOST_DEVICE_INLINE R cross_2d(const Point<T>& A, const Point<T>& B) {
+  return static_cast<R>(A.x) * static_cast<R>(B.y) -
+      static_cast<R>(B.x) * static_cast<R>(A.y);
+}
+
+template <typename T>
+HOST_DEVICE_INLINE void get_rotated_vertices(
+    const RotatedBox<T>& box,
+    Point<T> (&pts)[4]) {
+  // M_PI / 180. == 0.01745329251
+  double theta = box.a * 0.01745329251;
+  T cosTheta2 = (T)cos(theta) * 0.5f;
+  T sinTheta2 = (T)sin(theta) * 0.5f;
+
+  // y: top --> down; x: left --> right
+  pts[0].x = box.x_ctr + sinTheta2 * box.h + cosTheta2 * box.w;
+  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
+  pts[1].x = box.x_ctr - sinTheta2 * box.h + cosTheta2 * box.w;
+  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
+  pts[2].x = 2 * box.x_ctr - pts[0].x;
+  pts[2].y = 2 * box.y_ctr - pts[0].y;
+  pts[3].x = 2 * box.x_ctr - pts[1].x;
+  pts[3].y = 2 * box.y_ctr - pts[1].y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int get_intersection_points(
+    const Point<T> (&pts1)[4],
+    const Point<T> (&pts2)[4],
+    Point<T> (&intersections)[24]) {
+  // Line vector
+  // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
+  Point<T> vec1[4], vec2[4];
+  for (int i = 0; i < 4; i++) {
+    vec1[i] = pts1[(i + 1) % 4] - pts1[i];
+    vec2[i] = pts2[(i + 1) % 4] - pts2[i];
+  }
+
+  // Line test - test all line combos for intersection
+  int num = 0; // number of intersections
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      // Solve for 2x2 Ax=b
+      T det = cross_2d<T>(vec2[j], vec1[i]);
+
+      // This takes care of parallel lines
+      if (fabs(det) <= 1e-14) {
+        continue;
+      }
+
+      auto vec12 = pts2[j] - pts1[i];
+
+      T t1 = cross_2d<T>(vec2[j], vec12) / det;
+      T t2 = cross_2d<T>(vec1[i], vec12) / det;
+
+      if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
+        intersections[num++] = pts1[i] + vec1[i] * t1;
+      }
+    }
+  }
+
+  // Check for vertices of rect1 inside rect2
+  {
+    const auto& AB = vec2[0];
+    const auto& DA = vec2[3];
+    auto ABdotAB = dot_2d<T>(AB, AB);
+    auto ADdotAD = dot_2d<T>(DA, DA);
+    for (int i = 0; i < 4; i++) {
+      // assume ABCD is the rectangle, and P is the point to be judged
+      // P is inside ABCD iff. P's projection on AB lies within AB
+      // and P's projection on AD lies within AD
+
+      auto AP = pts1[i] - pts2[0];
+
+      auto APdotAB = dot_2d<T>(AP, AB);
+      auto APdotAD = -dot_2d<T>(AP, DA);
+
+      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+          (APdotAD <= ADdotAD)) {
+        intersections[num++] = pts1[i];
+      }
+    }
+  }
+
+  // Reverse the check - check for vertices of rect2 inside rect1
+  {
+    const auto& AB = vec1[0];
+    const auto& DA = vec1[3];
+    auto ABdotAB = dot_2d<T>(AB, AB);
+    auto ADdotAD = dot_2d<T>(DA, DA);
+    for (int i = 0; i < 4; i++) {
+      auto AP = pts2[i] - pts1[0];
+
+      auto APdotAB = dot_2d<T>(AP, AB);
+      auto APdotAD = -dot_2d<T>(AP, DA);
+
+      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+          (APdotAD <= ADdotAD)) {
+        intersections[num++] = pts2[i];
+      }
+    }
+  }
+
+  return num;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int convex_hull_graham(
+    const Point<T> (&p)[24],
+    const int& num_in,
+    Point<T> (&q)[24],
+    bool shift_to_zero = false) {
+  assert(num_in >= 2);
+
+  // Step 1:
+  // Find point with minimum y
+  // if more than 1 points have the same minimum y,
+  // pick the one with the minimum x.
+  int t = 0;
+  for (int i = 1; i < num_in; i++) {
+    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
+      t = i;
+    }
+  }
+  auto& start = p[t]; // starting point
+
+  // Step 2:
+  // Subtract starting point from every points (for sorting in the next step)
+  for (int i = 0; i < num_in; i++) {
+    q[i] = p[i] - start;
+  }
+
+  // Swap the starting point to position 0
+  auto tmp = q[0];
+  q[0] = q[t];
+  q[t] = tmp;
+
+  // Step 3:
+  // Sort point 1 ~ num_in according to their relative cross-product values
+  // (essentially sorting according to angles)
+  // If the angles are the same, sort according to their distance to origin
+  T dist[24];
+#ifdef __CUDACC__
+  // compute distance to origin before sort, and sort them together with the
+  // points
+  for (int i = 0; i < num_in; i++) {
+    dist[i] = dot_2d<T>(q[i], q[i]);
+  }
+
+  // CUDA version
+  // In the future, we can potentially use thrust
+  // for sorting here to improve speed (though not guaranteed)
+  for (int i = 1; i < num_in - 1; i++) {
+    for (int j = i + 1; j < num_in; j++) {
+      T crossProduct = cross_2d<T>(q[i], q[j]);
+      if ((crossProduct < -1e-6) ||
+          (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
+        auto q_tmp = q[i];
+        q[i] = q[j];
+        q[j] = q_tmp;
+        auto dist_tmp = dist[i];
+        dist[i] = dist[j];
+        dist[j] = dist_tmp;
+      }
+    }
+  }
+#else
+  // CPU version
+  std::sort(
+      q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
+        T temp = cross_2d<T>(A, B);
+        if (fabs(temp) < 1e-6) {
+          return dot_2d<T>(A, A) < dot_2d<T>(B, B);
+        } else {
+          return temp > 0;
+        }
+      });
+  // compute distance to origin after sort, since the points are now different.
+  for (int i = 0; i < num_in; i++) {
+    dist[i] = dot_2d<T>(q[i], q[i]);
+  }
+#endif
+
+  // Step 4:
+  // Make sure there are at least 2 points (that don't overlap with each other)
+  // in the stack
+  int k; // index of the non-overlapped second point
+  for (k = 1; k < num_in; k++) {
+    if (dist[k] > 1e-8) {
+      break;
+    }
+  }
+  if (k == num_in) {
+    // We reach the end, which means the convex hull is just one point
+    q[0] = p[t];
+    return 1;
+  }
+  q[1] = q[k];
+  int m = 2; // 2 points in the stack
+  // Step 5:
+  // Finally we can start the scanning process.
+  // When a non-convex relationship between the 3 points is found
+  // (either concave shape or duplicated points),
+  // we pop the previous point from the stack
+  // until the 3-point relationship is convex again, or
+  // until the stack only contains two points
+  for (int i = k + 1; i < num_in; i++) {
+    while (m > 1) {
+      auto q1 = q[i] - q[m - 2], q2 = q[m - 1] - q[m - 2];
+      // cross_2d() uses FMA and therefore computes round(round(q1.x*q2.y) -
+      // q2.x*q1.y) So it may not return 0 even when q1==q2. Therefore we
+      // compare round(q1.x*q2.y) and round(q2.x*q1.y) directly. (round means
+      // round to nearest floating point).
+      if (q1.x * q2.y >= q2.x * q1.y)
+        m--;
+      else
+        break;
+    }
+    // Using double also helps, but float can solve the issue for now.
+    // while (m > 1 && cross_2d<T, double>(q[i] - q[m - 2], q[m - 1] - q[m - 2])
+    // >= 0) {
+    //     m--;
+    // }
+    q[m++] = q[i];
+  }
+
+  // Step 6 (Optional):
+  // In general sense we need the original coordinates, so we
+  // need to shift the points back (reverting Step 2)
+  // But if we're only interested in getting the area/perimeter of the shape
+  // We can simply return.
+  if (!shift_to_zero) {
+    for (int i = 0; i < m; i++) {
+      q[i] += start;
+    }
+  }
+
+  return m;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
+  if (m <= 2) {
+    return 0;
+  }
+
+  T area = 0;
+  for (int i = 1; i < m - 1; i++) {
+    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
+  }
+
+  return area / 2.0;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T rotated_boxes_intersection(
+    const RotatedBox<T>& box1,
+    const RotatedBox<T>& box2) {
+  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
+  // from rotated_rect_intersection_pts
+  Point<T> intersectPts[24], orderedPts[24];
+
+  Point<T> pts1[4];
+  Point<T> pts2[4];
+  get_rotated_vertices<T>(box1, pts1);
+  get_rotated_vertices<T>(box2, pts2);
+
+  int num = get_intersection_points<T>(pts1, pts2, intersectPts);
+
+  if (num <= 2) {
+    return 0.0;
+  }
+
+  // Convex Hull to order the intersection points in clockwise order and find
+  // the contour area.
+  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
+  return polygon_area<T>(orderedPts, num_convex);
+}
+
+} // namespace
+
+template <typename T>
+HOST_DEVICE_INLINE T
+single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw) {
+  // shift center to the middle point to achieve higher precision in result
+  RotatedBox<T> box1, box2;
+  auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
+  auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
+  box1.x_ctr = box1_raw[0] - center_shift_x;
+  box1.y_ctr = box1_raw[1] - center_shift_y;
+  box1.w = box1_raw[2];
+  box1.h = box1_raw[3];
+  box1.a = box1_raw[4];
+  box2.x_ctr = box2_raw[0] - center_shift_x;
+  box2.y_ctr = box2_raw[1] - center_shift_y;
+  box2.w = box2_raw[2];
+  box2.h = box2_raw[3];
+  box2.a = box2_raw[4];
+
+  T area1 = box1.w * box1.h;
+  T area2 = box2.w * box2.h;
+  if (area1 < 1e-14 || area2 < 1e-14) {
+    return 0.f;
+  }
+
+  T intersection = rotated_boxes_intersection<T>(box1, box2);
+  T iou = intersection / (area1 + area2 - intersection);
+  return iou;
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/cuda_version.cu b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/cuda_version.cu
new file mode 100644
index 0000000000000000000000000000000000000000..af088e7572f6f27b9d653b4d7178f4e03de6befc
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/cuda_version.cu
@@ -0,0 +1,9 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <cuda_runtime_api.h>
+
+namespace detectron2 {
+int get_cudart_version() {
+  return CUDART_VERSION;
+}
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv.h b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv.h
new file mode 100644
index 0000000000000000000000000000000000000000..49ccd868ace8fd79f6fcbde6fe41f2b95873c414
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv.h
@@ -0,0 +1,377 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+#ifdef WITH_CUDA
+int deform_conv_forward_cuda(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor offset,
+    at::Tensor output,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step);
+
+int deform_conv_backward_input_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradInput,
+    at::Tensor gradOffset,
+    at::Tensor weight,
+    at::Tensor columns,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step);
+
+int deform_conv_backward_parameters_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    float scale,
+    int im2col_step);
+
+void modulated_deform_conv_cuda_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor output,
+    at::Tensor columns,
+    int kernel_h,
+    int kernel_w,
+    const int stride_h,
+    const int stride_w,
+    const int pad_h,
+    const int pad_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int group,
+    const int deformable_group,
+    const bool with_bias);
+
+void modulated_deform_conv_cuda_backward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor columns,
+    at::Tensor grad_input,
+    at::Tensor grad_weight,
+    at::Tensor grad_bias,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask,
+    at::Tensor grad_output,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_h,
+    int pad_w,
+    int dilation_h,
+    int dilation_w,
+    int group,
+    int deformable_group,
+    const bool with_bias);
+
+#endif
+
+inline int deform_conv_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor offset,
+    at::Tensor output,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  if (input.is_cuda()) {
+#ifdef WITH_CUDA
+    TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return deform_conv_forward_cuda(
+        input,
+        weight,
+        offset,
+        output,
+        columns,
+        ones,
+        kW,
+        kH,
+        dW,
+        dH,
+        padW,
+        padH,
+        dilationW,
+        dilationH,
+        group,
+        deformable_group,
+        im2col_step);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU");
+}
+
+inline int deform_conv_backward_input(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradInput,
+    at::Tensor gradOffset,
+    at::Tensor weight,
+    at::Tensor columns,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  if (gradOutput.is_cuda()) {
+#ifdef WITH_CUDA
+    TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
+    TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return deform_conv_backward_input_cuda(
+        input,
+        offset,
+        gradOutput,
+        gradInput,
+        gradOffset,
+        weight,
+        columns,
+        kW,
+        kH,
+        dW,
+        dH,
+        padW,
+        padH,
+        dilationW,
+        dilationH,
+        group,
+        deformable_group,
+        im2col_step);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU");
+}
+
+inline int deform_conv_backward_filter(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    float scale,
+    int im2col_step) {
+  if (gradOutput.is_cuda()) {
+#ifdef WITH_CUDA
+    TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return deform_conv_backward_parameters_cuda(
+        input,
+        offset,
+        gradOutput,
+        gradWeight,
+        columns,
+        ones,
+        kW,
+        kH,
+        dW,
+        dH,
+        padW,
+        padH,
+        dilationW,
+        dilationH,
+        group,
+        deformable_group,
+        scale,
+        im2col_step);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU");
+}
+
+inline void modulated_deform_conv_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor output,
+    at::Tensor columns,
+    int kernel_h,
+    int kernel_w,
+    const int stride_h,
+    const int stride_w,
+    const int pad_h,
+    const int pad_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int group,
+    const int deformable_group,
+    const bool with_bias) {
+  if (input.is_cuda()) {
+#ifdef WITH_CUDA
+    TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
+    TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return modulated_deform_conv_cuda_forward(
+        input,
+        weight,
+        bias,
+        ones,
+        offset,
+        mask,
+        output,
+        columns,
+        kernel_h,
+        kernel_w,
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        dilation_h,
+        dilation_w,
+        group,
+        deformable_group,
+        with_bias);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU");
+}
+
+inline void modulated_deform_conv_backward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor columns,
+    at::Tensor grad_input,
+    at::Tensor grad_weight,
+    at::Tensor grad_bias,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask,
+    at::Tensor grad_output,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_h,
+    int pad_w,
+    int dilation_h,
+    int dilation_w,
+    int group,
+    int deformable_group,
+    const bool with_bias) {
+  if (grad_output.is_cuda()) {
+#ifdef WITH_CUDA
+    TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!");
+    TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!");
+    TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!");
+    TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!");
+    return modulated_deform_conv_cuda_backward(
+        input,
+        weight,
+        bias,
+        ones,
+        offset,
+        mask,
+        columns,
+        grad_input,
+        grad_weight,
+        grad_bias,
+        grad_offset,
+        grad_mask,
+        grad_output,
+        kernel_h,
+        kernel_w,
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        dilation_h,
+        dilation_w,
+        group,
+        deformable_group,
+        with_bias);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU");
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv_cuda.cu b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5376db0cc4d93e245cfc9fea0f3b5715a1f88db2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv_cuda.cu
@@ -0,0 +1,1131 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+// modified from
+// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
+// Original license: Apache 2.0
+
+// modify from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
+// Original license: Apache 2.0
+
+#include <torch/types.h>
+
+#include "deform_conv.h"
+
+#include <cmath>
+#include <vector>
+
+namespace detectron2 {
+
+void deformable_im2col(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor data_col);
+
+void deformable_col2im(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_im);
+
+void deformable_col2im_coord(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_offset);
+
+void modulated_deformable_im2col_cuda(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kenerl_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor data_col);
+
+void modulated_deformable_col2im_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kenerl_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_im);
+
+void modulated_deformable_col2im_coord_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kenerl_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask);
+
+void shape_check(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor* gradOutput,
+    at::Tensor weight,
+    int kH,
+    int kW,
+    int dH,
+    int dW,
+    int padH,
+    int padW,
+    int dilationH,
+    int dilationW,
+    int group,
+    int deformable_group) {
+  TORCH_CHECK(
+      weight.ndimension() == 4,
+      "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+      "but got: %s",
+      weight.ndimension());
+
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  TORCH_CHECK(
+      kW > 0 && kH > 0,
+      "kernel size should be greater than zero, but got kH: %d kW: %d",
+      kH,
+      kW);
+
+  TORCH_CHECK(
+      (weight.size(2) == kH && weight.size(3) == kW),
+      "kernel size should be consistent with weight, ",
+      "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d",
+      kH,
+      kW,
+      weight.size(2),
+      weight.size(3));
+
+  TORCH_CHECK(
+      dW > 0 && dH > 0,
+      "stride should be greater than zero, but got dH: %d dW: %d",
+      dH,
+      dW);
+
+  TORCH_CHECK(
+      dilationW > 0 && dilationH > 0,
+      "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
+      dilationH,
+      dilationW);
+
+  int ndim = input.ndimension();
+  int dimf = 0;
+  int dimh = 1;
+  int dimw = 2;
+
+  if (ndim == 4) {
+    dimf++;
+    dimh++;
+    dimw++;
+  }
+
+  TORCH_CHECK(
+      ndim == 3 || ndim == 4,
+      "3D or 4D input tensor expected but got: %s",
+      ndim);
+
+  long nInputPlane = weight.size(1) * group;
+  long inputHeight = input.size(dimh);
+  long inputWidth = input.size(dimw);
+  long nOutputPlane = weight.size(0);
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+  TORCH_CHECK(
+      nInputPlane % deformable_group == 0,
+      "input channels must divide deformable group size");
+
+  if (outputWidth < 1 || outputHeight < 1)
+    AT_ERROR(
+        "Given input size: (%ld x %ld x %ld). "
+        "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        nOutputPlane,
+        outputHeight,
+        outputWidth);
+
+  TORCH_CHECK(
+      input.size(1) == nInputPlane,
+      "invalid number of input planes, expected: %d, but got: %d",
+      nInputPlane,
+      input.size(1));
+
+  TORCH_CHECK(
+      (inputHeight >= kH && inputWidth >= kW),
+      "input image is smaller than kernel");
+
+  TORCH_CHECK(
+      (offset.size(2) == outputHeight && offset.size(3) == outputWidth),
+      "invalid spatial size of offset, expected height: %d width: %d, but "
+      "got height: %d width: %d",
+      outputHeight,
+      outputWidth,
+      offset.size(2),
+      offset.size(3));
+
+  TORCH_CHECK(
+      (offset.size(1) == deformable_group * 2 * kH * kW),
+      "invalid number of channels of offset");
+
+  if (gradOutput != NULL) {
+    TORCH_CHECK(
+        gradOutput->size(dimf) == nOutputPlane,
+        "invalid number of gradOutput planes, expected: %d, but got: %d",
+        nOutputPlane,
+        gradOutput->size(dimf));
+
+    TORCH_CHECK(
+        (gradOutput->size(dimh) == outputHeight &&
+         gradOutput->size(dimw) == outputWidth),
+        "invalid size of gradOutput, expected height: %d width: %d , but "
+        "got height: %d width: %d",
+        outputHeight,
+        outputWidth,
+        gradOutput->size(dimh),
+        gradOutput->size(dimw));
+  }
+}
+
+int deform_conv_forward_cuda(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor offset,
+    at::Tensor output,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  // todo: resize columns to include im2col: done
+  // todo: add im2col_step as input
+  // todo: add new output buffer and transpose it to output (or directly
+  // transpose output) todo: possibly change data indexing because of
+  // parallel_imgs
+
+  shape_check(
+      input,
+      offset,
+      NULL,
+      weight,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      dilationH,
+      dilationW,
+      group,
+      deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  weight = weight.contiguous();
+
+  int batch = 1;
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input.unsqueeze_(0);
+    offset.unsqueeze_(0);
+  }
+
+  // todo: assert batchsize dividable by im2col_step
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+  output = output.view({batchSize / im2col_step,
+                        im2col_step,
+                        nOutputPlane,
+                        outputHeight,
+                        outputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
+    ones = at::ones({outputHeight, outputWidth}, input.options());
+  }
+
+  input = input.view({batchSize / im2col_step,
+                      im2col_step,
+                      nInputPlane,
+                      inputHeight,
+                      inputWidth});
+  offset = offset.view({batchSize / im2col_step,
+                        im2col_step,
+                        deformable_group * 2 * kH * kW,
+                        outputHeight,
+                        outputWidth});
+
+  at::Tensor output_buffer = at::zeros(
+      {batchSize / im2col_step,
+       nOutputPlane,
+       im2col_step * outputHeight,
+       outputWidth},
+      output.options());
+
+  output_buffer = output_buffer.view({output_buffer.size(0),
+                                      group,
+                                      output_buffer.size(1) / group,
+                                      output_buffer.size(2),
+                                      output_buffer.size(3)});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    deformable_im2col(
+        input[elt],
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        columns);
+
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group,
+                          weight.size(0) / group,
+                          weight.size(1),
+                          weight.size(2),
+                          weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      output_buffer[elt][g] = output_buffer[elt][g]
+                                  .flatten(1)
+                                  .addmm_(weight[g].flatten(1), columns[g])
+                                  .view_as(output_buffer[elt][g]);
+    }
+  }
+
+  output_buffer =
+      output_buffer.view({output_buffer.size(0),
+                          output_buffer.size(1) * output_buffer.size(2),
+                          output_buffer.size(3),
+                          output_buffer.size(4)});
+
+  output_buffer = output_buffer.view({batchSize / im2col_step,
+                                      nOutputPlane,
+                                      im2col_step,
+                                      outputHeight,
+                                      outputWidth});
+  output_buffer.transpose_(1, 2);
+  output.copy_(output_buffer);
+  output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    output = output.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+  }
+
+  return 1;
+}
+
+int deform_conv_backward_input_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradInput,
+    at::Tensor gradOffset,
+    at::Tensor weight,
+    at::Tensor columns,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  shape_check(
+      input,
+      offset,
+      &gradOutput,
+      weight,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      dilationH,
+      dilationW,
+      group,
+      deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+  weight = weight.contiguous();
+
+  int batch = 1;
+
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input = input.view({1, input.size(0), input.size(1), input.size(2)});
+    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  // change order of grad output
+  gradOutput = gradOutput.view({batchSize / im2col_step,
+                                im2col_step,
+                                nOutputPlane,
+                                outputHeight,
+                                outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  gradInput = gradInput.view({batchSize / im2col_step,
+                              im2col_step,
+                              nInputPlane,
+                              inputHeight,
+                              inputWidth});
+  input = input.view({batchSize / im2col_step,
+                      im2col_step,
+                      nInputPlane,
+                      inputHeight,
+                      inputWidth});
+  gradOffset = gradOffset.view({batchSize / im2col_step,
+                                im2col_step,
+                                deformable_group * 2 * kH * kW,
+                                outputHeight,
+                                outputWidth});
+  offset = offset.view({batchSize / im2col_step,
+                        im2col_step,
+                        deformable_group * 2 * kH * kW,
+                        outputHeight,
+                        outputWidth});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    // divide into groups
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group,
+                          weight.size(0) / group,
+                          weight.size(1),
+                          weight.size(2),
+                          weight.size(3)});
+    gradOutput = gradOutput.view({gradOutput.size(0),
+                                  group,
+                                  gradOutput.size(1) / group,
+                                  gradOutput.size(2),
+                                  gradOutput.size(3),
+                                  gradOutput.size(4)});
+
+    for (int g = 0; g < group; g++) {
+      columns[g] = columns[g].addmm_(
+          weight[g].flatten(1).transpose(0, 1),
+          gradOutput[elt][g].flatten(1),
+          0.0f,
+          1.0f);
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    gradOutput = gradOutput.view({gradOutput.size(0),
+                                  gradOutput.size(1) * gradOutput.size(2),
+                                  gradOutput.size(3),
+                                  gradOutput.size(4),
+                                  gradOutput.size(5)});
+
+    deformable_col2im_coord(
+        columns,
+        input[elt],
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        gradOffset[elt]);
+
+    deformable_col2im(
+        columns,
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        gradInput[elt]);
+  }
+
+  gradOutput.transpose_(1, 2);
+  gradOutput =
+      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  gradOffset = gradOffset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+    gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
+    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+    gradOffset =
+        gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
+  }
+
+  return 1;
+}
+
+int deform_conv_backward_parameters_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    float scale,
+    int im2col_step) {
+  // todo: transpose and reshape outGrad
+  // todo: reshape columns
+  // todo: add im2col_step as input
+
+  shape_check(
+      input,
+      offset,
+      &gradOutput,
+      gradWeight,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      dilationH,
+      dilationW,
+      group,
+      deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+
+  int batch = 1;
+
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input = input.view(
+        at::IntList({1, input.size(0), input.size(1), input.size(2)}));
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = gradWeight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  gradOutput = gradOutput.view({batchSize / im2col_step,
+                                im2col_step,
+                                nOutputPlane,
+                                outputHeight,
+                                outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
+  gradOutputBuffer = gradOutputBuffer.view({batchSize / im2col_step,
+                                            nOutputPlane,
+                                            im2col_step,
+                                            outputHeight,
+                                            outputWidth});
+  gradOutputBuffer.copy_(gradOutput);
+  // gradOutput is not contiguous, so we do reshape (instead of view) next
+  gradOutputBuffer = gradOutputBuffer.reshape({batchSize / im2col_step,
+                                               nOutputPlane,
+                                               im2col_step * outputHeight,
+                                               outputWidth});
+
+  gradOutput.transpose_(1, 2);
+  gradOutput =
+      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  input = input.view({batchSize / im2col_step,
+                      im2col_step,
+                      nInputPlane,
+                      inputHeight,
+                      inputWidth});
+  offset = offset.view({batchSize / im2col_step,
+                        im2col_step,
+                        deformable_group * 2 * kH * kW,
+                        outputHeight,
+                        outputWidth});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    deformable_im2col(
+        input[elt],
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        columns);
+
+    // divide into group
+    gradOutputBuffer = gradOutputBuffer.view({gradOutputBuffer.size(0),
+                                              group,
+                                              gradOutputBuffer.size(1) / group,
+                                              gradOutputBuffer.size(2),
+                                              gradOutputBuffer.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    gradWeight = gradWeight.view({group,
+                                  gradWeight.size(0) / group,
+                                  gradWeight.size(1),
+                                  gradWeight.size(2),
+                                  gradWeight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      gradWeight[g] = gradWeight[g]
+                          .flatten(1)
+                          .addmm_(
+                              gradOutputBuffer[elt][g].flatten(1),
+                              columns[g].transpose(1, 0),
+                              1.0,
+                              scale)
+                          .view_as(gradWeight[g]);
+    }
+    gradOutputBuffer = gradOutputBuffer.view(
+        {gradOutputBuffer.size(0),
+         gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
+         gradOutputBuffer.size(3),
+         gradOutputBuffer.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
+                                  gradWeight.size(2),
+                                  gradWeight.size(3),
+                                  gradWeight.size(4)});
+  }
+
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+  }
+
+  return 1;
+}
+
+void modulated_deform_conv_cuda_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor output,
+    at::Tensor columns,
+    int kernel_h,
+    int kernel_w,
+    const int stride_h,
+    const int stride_w,
+    const int pad_h,
+    const int pad_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int group,
+    const int deformable_group,
+    const bool with_bias) {
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_out = weight.size(0);
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+
+  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+    AT_ERROR(
+        "Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
+        kernel_h_,
+        kernel_w,
+        kernel_h_,
+        kernel_w_);
+  if (channels != channels_kernel * group)
+    AT_ERROR(
+        "Input shape and kernel channels wont match: (%d vs %d).",
+        channels,
+        channels_kernel * group);
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  // resize output
+  output = output.view({batch, channels_out, height_out, width_out}).zero_();
+  // resize temporary columns
+  columns = at::zeros(
+      {channels * kernel_h * kernel_w, 1 * height_out * width_out},
+      input.options());
+
+  output = output.view({output.size(0),
+                        group,
+                        output.size(1) / group,
+                        output.size(2),
+                        output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    modulated_deformable_im2col_cuda(
+        input[b],
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        columns);
+
+    // divide into group
+    weight = weight.view({group,
+                          weight.size(0) / group,
+                          weight.size(1),
+                          weight.size(2),
+                          weight.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+
+    for (int g = 0; g < group; g++) {
+      output[b][g] = output[b][g]
+                         .flatten(1)
+                         .addmm_(weight[g].flatten(1), columns[g])
+                         .view_as(output[b][g]);
+    }
+
+    weight = weight.view({weight.size(0) * weight.size(1),
+                          weight.size(2),
+                          weight.size(3),
+                          weight.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+  }
+
+  output = output.view({output.size(0),
+                        output.size(1) * output.size(2),
+                        output.size(3),
+                        output.size(4)});
+
+  if (with_bias) {
+    output += bias.view({1, bias.size(0), 1, 1});
+  }
+}
+
+void modulated_deform_conv_cuda_backward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor columns,
+    at::Tensor grad_input,
+    at::Tensor grad_weight,
+    at::Tensor grad_bias,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask,
+    at::Tensor grad_output,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_h,
+    int pad_w,
+    int dilation_h,
+    int dilation_w,
+    int group,
+    int deformable_group,
+    const bool with_bias) {
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+    AT_ERROR(
+        "Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
+        kernel_h_,
+        kernel_w,
+        kernel_h_,
+        kernel_w_);
+  if (channels != channels_kernel * group)
+    AT_ERROR(
+        "Input shape and kernel channels wont match: (%d vs %d).",
+        channels,
+        channels_kernel * group);
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  grad_input = grad_input.view({batch, channels, height, width});
+  columns = at::zeros(
+      {channels * kernel_h * kernel_w, height_out * width_out},
+      input.options());
+
+  grad_output = grad_output.view({grad_output.size(0),
+                                  group,
+                                  grad_output.size(1) / group,
+                                  grad_output.size(2),
+                                  grad_output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    // divide int group
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group,
+                          weight.size(0) / group,
+                          weight.size(1),
+                          weight.size(2),
+                          weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      columns[g].addmm_(
+          weight[g].flatten(1).transpose(0, 1),
+          grad_output[b][g].flatten(1),
+          0.0f,
+          1.0f);
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view({weight.size(0) * weight.size(1),
+                          weight.size(2),
+                          weight.size(3),
+                          weight.size(4)});
+
+    // gradient w.r.t. input coordinate data
+    modulated_deformable_col2im_coord_cuda(
+        columns,
+        input[b],
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        grad_offset[b],
+        grad_mask[b]);
+    // gradient w.r.t. input data
+    modulated_deformable_col2im_cuda(
+        columns,
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        grad_input[b]);
+
+    // gradient w.r.t. weight, dWeight should accumulate across the batch and
+    // group
+    modulated_deformable_im2col_cuda(
+        input[b],
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        columns);
+
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    grad_weight = grad_weight.view({group,
+                                    grad_weight.size(0) / group,
+                                    grad_weight.size(1),
+                                    grad_weight.size(2),
+                                    grad_weight.size(3)});
+    if (with_bias)
+      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
+
+    for (int g = 0; g < group; g++) {
+      grad_weight[g] =
+          grad_weight[g]
+              .flatten(1)
+              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
+              .view_as(grad_weight[g]);
+      if (with_bias) {
+        grad_bias[g] =
+            grad_bias[g]
+                .view({-1, 1})
+                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
+                .view(-1);
+      }
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
+                                    grad_weight.size(2),
+                                    grad_weight.size(3),
+                                    grad_weight.size(4)});
+    if (with_bias)
+      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
+  }
+  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
+                                  grad_output.size(2),
+                                  grad_output.size(3),
+                                  grad_output.size(4)});
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..841f3166c902e7f1c17fe58137d42a58e4f66d69
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
@@ -0,0 +1,1288 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+// modified from
+// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+// Original license: Apache 2.0
+// clang-format off
+
+// modify from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer *****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer *********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <THC/THCAtomics.cuh>
+
+using namespace at;
+
+#define CUDA_KERNEL_LOOP(i, n)                                 \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+
+namespace {
+
+const int CUDA_NUM_THREADS = 1024;
+const int kMaxGridNum = 65535;
+
+inline int GET_BLOCKS(const int N) {
+  return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
+}
+
+}
+
+template <typename scalar_t>
+__device__ scalar_t deformable_im2col_bilinear(
+    const scalar_t* bottom_data,
+    const int data_width,
+    const int height,
+    const int width,
+    scalar_t h,
+    scalar_t w) {
+  int h_low = floor(h);
+  int w_low = floor(w);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  scalar_t lh = h - h_low;
+  scalar_t lw = w - w_low;
+  scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+    v1 = bottom_data[h_low * data_width + w_low];
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+    v2 = bottom_data[h_low * data_width + w_high];
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+    v3 = bottom_data[h_high * data_width + w_low];
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+    v4 = bottom_data[h_high * data_width + w_high];
+
+  scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+  scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <typename scalar_t>
+__device__ scalar_t get_gradient_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int h,
+    const int w,
+    const int height,
+    const int width) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+  if (h == argmax_h_low && w == argmax_w_low)
+    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+  if (h == argmax_h_low && w == argmax_w_high)
+    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+  if (h == argmax_h_high && w == argmax_w_low)
+    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+  if (h == argmax_h_high && w == argmax_w_high)
+    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+  return weight;
+}
+
+template <typename scalar_t>
+__device__ scalar_t get_coordinate_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int height,
+    const int width,
+    const scalar_t* im_data,
+    const int data_width,
+    const int bp_dir) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+
+  if (bp_dir == 0) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += -1 * (argmax_w - argmax_w_low) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_w - argmax_w_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  } else if (bp_dir == 1) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+
+  return weight;
+}
+
+template <typename scalar_t>
+__global__ void deformable_im2col_gpu_kernel(
+    const int n,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // index index of output matrix
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col) % batch_size;
+    const int c_im = (index / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+    scalar_t* data_col_ptr = data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    // const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
+    // height + h_in) * width + w_in;
+    const scalar_t* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+        const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+        scalar_t val = static_cast<scalar_t>(0);
+        const scalar_t h_im = h_in + i * dilation_h + offset_h;
+        const scalar_t w_im = w_in + j * dilation_w + offset_w;
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
+          // const scalar_t map_h = i * dilation_h + offset_h;
+          // const scalar_t map_w = j * dilation_w + offset_w;
+          // const int cur_height = height - h_in;
+          // const int cur_width = width - w_in;
+          // val = deformable_im2col_bilinear(data_im_ptr, width, cur_height,
+          // cur_width, map_h, map_w);
+          val = deformable_im2col_bilinear(
+              data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val;
+        data_col_ptr += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>
+__global__ void deformable_col2im_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_im) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const int j = (index / width_col / height_col / batch_size) % kernel_w;
+    const int i =
+        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+    const int c =
+        index / width_col / height_col / batch_size / kernel_w / kernel_h;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / channel_per_deformable_group;
+
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int b = (index / width_col / height_col) % batch_size;
+    int w_in = w_out * stride_w - pad_w;
+    int h_in = h_out * stride_h - pad_h;
+
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const int data_offset_h_ptr =
+        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+    const int data_offset_w_ptr =
+        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+    const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+    const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+    const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+    const scalar_t cur_top_grad = data_col[index];
+    const int cur_h = (int)cur_inv_h_data;
+    const int cur_w = (int)cur_inv_w_data;
+    for (int dy = -2; dy <= 2; dy++) {
+      for (int dx = -2; dx <= 2; dx++) {
+        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+          int cur_bottom_grad_pos =
+              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+          scalar_t weight = get_gradient_weight(
+              cur_inv_h_data,
+              cur_inv_w_data,
+              cur_h + dy,
+              cur_w + dx,
+              height,
+              width);
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+        }
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>
+__global__ void deformable_col2im_coord_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int offset_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_offset) {
+  CUDA_KERNEL_LOOP(index, n) {
+    scalar_t val = 0;
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int col_step = kernel_h * kernel_w;
+    int cnt = 0;
+    const scalar_t* data_col_ptr = data_col +
+        deformable_group_index * channel_per_deformable_group * batch_size *
+            width_col * height_col;
+    const scalar_t* data_im_ptr = data_im +
+        (b * deformable_group + deformable_group_index) *
+            channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+         col_c += col_step) {
+      const int col_pos =
+          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      const int bp_dir = offset_c % 2;
+
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i =
+          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr =
+          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr =
+          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+           w_out);
+      const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+      const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+      scalar_t inv_h = h_in + i * dilation_h + offset_h;
+      scalar_t inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
+        inv_h = inv_w = -2;
+      }
+      const scalar_t weight = get_coordinate_weight(
+          inv_h,
+          inv_w,
+          height,
+          width,
+          data_im_ptr + cnt * height * width,
+          width,
+          bp_dir);
+      val += weight * data_col_ptr[col_pos];
+      cnt += 1;
+    }
+
+    grad_offset[index] = val;
+  }
+}
+
+
+namespace detectron2 {
+
+void deformable_im2col(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor data_col) {
+  // num_axes should be smaller than block size
+  // todo: check parallel_imgs is correctly passed in
+  int height_col =
+      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col =
+      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = channels * height_col * width_col * parallel_imgs;
+  int channel_per_deformable_group = channels / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_im.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+
+        deformable_im2col_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_im_,
+            data_offset_,
+            height,
+            width,
+            ksize_h,
+            ksize_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            parallel_imgs,
+            channels,
+            deformable_group,
+            height_col,
+            width_col,
+            data_col_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
+  }
+}
+
+
+void deformable_col2im(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_im) {
+  // todo: make sure parallel_imgs is passed in correctly
+  int height_col =
+      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col =
+      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels =
+      channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
+  int channel_per_deformable_group = channels / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
+        const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t* grad_im_ = grad_im.data_ptr<scalar_t>();
+
+        deformable_col2im_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_col_,
+            data_offset_,
+            channels,
+            height,
+            width,
+            ksize_h,
+            ksize_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            parallel_imgs,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_im_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
+  }
+}
+
+
+void deformable_col2im_coord(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_offset) {
+  int height_col =
+      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col =
+      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
+      deformable_group * parallel_imgs;
+  int channel_per_deformable_group =
+      channels * ksize_h * ksize_w / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
+        const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t* grad_offset_ = grad_offset.data_ptr<scalar_t>();
+
+        deformable_col2im_coord_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_col_,
+            data_im_,
+            data_offset_,
+            channels,
+            height,
+            width,
+            ksize_h,
+            ksize_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            parallel_imgs,
+            2 * ksize_h * ksize_w * deformable_group,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_offset_);
+      }));
+}
+
+} // namespace detectron2
+
+
+template <typename scalar_t>
+__device__ scalar_t dmcn_im2col_bilinear(
+    const scalar_t* bottom_data,
+    const int data_width,
+    const int height,
+    const int width,
+    scalar_t h,
+    scalar_t w) {
+  int h_low = floor(h);
+  int w_low = floor(w);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  scalar_t lh = h - h_low;
+  scalar_t lw = w - w_low;
+  scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+    v1 = bottom_data[h_low * data_width + w_low];
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+    v2 = bottom_data[h_low * data_width + w_high];
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+    v3 = bottom_data[h_high * data_width + w_low];
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+    v4 = bottom_data[h_high * data_width + w_high];
+
+  scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+  scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_gradient_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int h,
+    const int w,
+    const int height,
+    const int width) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+  if (h == argmax_h_low && w == argmax_w_low)
+    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+  if (h == argmax_h_low && w == argmax_w_high)
+    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+  if (h == argmax_h_high && w == argmax_w_low)
+    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+  if (h == argmax_h_high && w == argmax_w_high)
+    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+  return weight;
+}
+
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_coordinate_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int height,
+    const int width,
+    const scalar_t* im_data,
+    const int data_width,
+    const int bp_dir) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+
+  if (bp_dir == 0) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += -1 * (argmax_w - argmax_w_low) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_w - argmax_w_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  } else if (bp_dir == 1) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+
+  return weight;
+}
+
+template <typename scalar_t>
+__global__ void modulated_deformable_im2col_gpu_kernel(
+    const int n,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const scalar_t* data_mask,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // index index of output matrix
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col) % batch_size;
+    const int c_im = (index / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+
+    scalar_t* data_col_ptr = data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    // const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
+    // height + h_in) * width + w_in;
+    const scalar_t* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    const scalar_t* data_mask_ptr = data_mask +
+        (b_col * deformable_group + deformable_group_index) * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const int data_mask_hw_ptr =
+            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+        const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+        const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+        const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+        scalar_t val = static_cast<scalar_t>(0);
+        const scalar_t h_im = h_in + i * dilation_h + offset_h;
+        const scalar_t w_im = w_in + j * dilation_w + offset_w;
+        // if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
+          // const float map_h = i * dilation_h + offset_h;
+          // const float map_w = j * dilation_w + offset_w;
+          // const int cur_height = height - h_in;
+          // const int cur_width = width - w_in;
+          // val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height,
+          // cur_width, map_h, map_w);
+          val = dmcn_im2col_bilinear(
+              data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val * mask;
+        data_col_ptr += batch_size * height_col * width_col;
+        // data_col_ptr += height_col * width_col;
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_offset,
+    const scalar_t* data_mask,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_im) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const int j = (index / width_col / height_col / batch_size) % kernel_w;
+    const int i =
+        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+    const int c =
+        index / width_col / height_col / batch_size / kernel_w / kernel_h;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / channel_per_deformable_group;
+
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int b = (index / width_col / height_col) % batch_size;
+    int w_in = w_out * stride_w - pad_w;
+    int h_in = h_out * stride_h - pad_h;
+
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const scalar_t* data_mask_ptr = data_mask +
+        (b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
+            height_col * width_col;
+    const int data_offset_h_ptr =
+        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+    const int data_offset_w_ptr =
+        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+    const int data_mask_hw_ptr =
+        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
+    const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+    const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+    const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+    const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+    const scalar_t cur_top_grad = data_col[index] * mask;
+    const int cur_h = (int)cur_inv_h_data;
+    const int cur_w = (int)cur_inv_w_data;
+    for (int dy = -2; dy <= 2; dy++) {
+      for (int dx = -2; dx <= 2; dx++) {
+        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+          int cur_bottom_grad_pos =
+              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+          scalar_t weight = dmcn_get_gradient_weight(
+              cur_inv_h_data,
+              cur_inv_w_data,
+              cur_h + dy,
+              cur_w + dx,
+              height,
+              width);
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+        }
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_coord_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const scalar_t* data_mask,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int offset_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_offset,
+    scalar_t* grad_mask) {
+  CUDA_KERNEL_LOOP(index, n) {
+    scalar_t val = 0, mval = 0;
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int col_step = kernel_h * kernel_w;
+    int cnt = 0;
+    const scalar_t* data_col_ptr = data_col +
+        deformable_group_index * channel_per_deformable_group * batch_size *
+            width_col * height_col;
+    const scalar_t* data_im_ptr = data_im +
+        (b * deformable_group + deformable_group_index) *
+            channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const scalar_t* data_mask_ptr = data_mask +
+        (b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
+            height_col * width_col;
+
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+         col_c += col_step) {
+      const int col_pos =
+          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      const int bp_dir = offset_c % 2;
+
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i =
+          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr =
+          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr =
+          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+           w_out);
+      const int data_mask_hw_ptr =
+          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
+      const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+      const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+      const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+      scalar_t inv_h = h_in + i * dilation_h + offset_h;
+      scalar_t inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
+        inv_h = inv_w = -2;
+      } else {
+        mval += data_col_ptr[col_pos] *
+            dmcn_im2col_bilinear(
+                    data_im_ptr + cnt * height * width,
+                    width,
+                    height,
+                    width,
+                    inv_h,
+                    inv_w);
+      }
+      const scalar_t weight = dmcn_get_coordinate_weight(
+          inv_h,
+          inv_w,
+          height,
+          width,
+          data_im_ptr + cnt * height * width,
+          width,
+          bp_dir);
+      val += weight * data_col_ptr[col_pos] * mask;
+      cnt += 1;
+    }
+    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
+    grad_offset[index] = val;
+    if (offset_c % 2 == 0)
+      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
+      // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
+      // height_col + h) * width_col + w], mask_req, mval);
+      grad_mask
+          [(((b * deformable_group + deformable_group_index) * kernel_h *
+                 kernel_w +
+             offset_c / 2) *
+                height_col +
+            h) *
+               width_col +
+           w] = mval;
+  }
+}
+
+
+namespace detectron2 {
+
+void modulated_deformable_im2col_cuda(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kenerl_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor data_col) {
+  // num_axes should be smaller than block size
+  const int channel_per_deformable_group = channels / deformable_group;
+  const int num_kernels = channels * batch_size * height_col * width_col;
+
+  at::cuda::CUDAGuard device_guard(data_im.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+
+        modulated_deformable_im2col_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_im_,
+            data_offset_,
+            data_mask_,
+            height_im,
+            width_im,
+            kernel_h,
+            kenerl_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            batch_size,
+            channels,
+            deformable_group,
+            height_col,
+            width_col,
+            data_col_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf(
+        "error in modulated_deformable_im2col_cuda: %s\n",
+        cudaGetErrorString(err));
+  }
+}
+
+void modulated_deformable_col2im_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_im) {
+  const int channel_per_deformable_group = channels / deformable_group;
+  const int num_kernels =
+      channels * kernel_h * kernel_w * batch_size * height_col * width_col;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
+        const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t* grad_im_ = grad_im.data_ptr<scalar_t>();
+
+        modulated_deformable_col2im_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_col_,
+            data_offset_,
+            data_mask_,
+            channels,
+            height_im,
+            width_im,
+            kernel_h,
+            kernel_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            batch_size,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_im_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf(
+        "error in modulated_deformable_col2im_cuda: %s\n",
+        cudaGetErrorString(err));
+  }
+}
+
+void modulated_deformable_col2im_coord_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask) {
+  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
+      kernel_w * deformable_group;
+  const int channel_per_deformable_group =
+      channels * kernel_h * kernel_w / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
+        const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t* grad_offset_ = grad_offset.data_ptr<scalar_t>();
+        scalar_t* grad_mask_ = grad_mask.data_ptr<scalar_t>();
+
+        modulated_deformable_col2im_coord_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_col_,
+            data_im_,
+            data_offset_,
+            data_mask_,
+            channels,
+            height_im,
+            width_im,
+            kernel_h,
+            kernel_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            batch_size,
+            2 * kernel_h * kernel_w * deformable_group,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_offset_,
+            grad_mask_);
+      }));
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf(
+        "error in modulated_deformable_col2im_coord_cuda: %s\n",
+        cudaGetErrorString(err));
+  }
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated.h b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c86c8d55cd24fb5322657b9d2f676fc3e1373ba
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated.h
@@ -0,0 +1,39 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor nms_rotated_cpu(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold);
+
+#ifdef WITH_CUDA
+at::Tensor nms_rotated_cuda(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold);
+#endif
+
+// Interface for Python
+// inline is needed to prevent multiple function definitions when this header is
+// included by different cpps
+inline at::Tensor nms_rotated(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold) {
+  assert(dets.device().is_cuda() == scores.device().is_cuda());
+  if (dets.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return nms_rotated_cuda(
+        dets.contiguous(), scores.contiguous(), iou_threshold);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+
+  return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold);
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0658e388df005748c358dcbf3a1ad2a59da6cac8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
@@ -0,0 +1,75 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include "../box_iou_rotated/box_iou_rotated_utils.h"
+#include "nms_rotated.h"
+
+namespace detectron2 {
+
+template <typename scalar_t>
+at::Tensor nms_rotated_cpu_kernel(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold) {
+  // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
+  // however, the code in this function is much shorter because
+  // we delegate the IoU computation for rotated boxes to
+  // the single_box_iou_rotated function in box_iou_rotated_utils.h
+  AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor");
+  AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor");
+  AT_ASSERTM(
+      dets.scalar_type() == scores.scalar_type(),
+      "dets should have the same type as scores");
+
+  if (dets.numel() == 0) {
+    return at::empty({0}, dets.options().dtype(at::kLong));
+  }
+
+  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+
+  auto ndets = dets.size(0);
+  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
+  at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
+
+  auto suppressed = suppressed_t.data_ptr<uint8_t>();
+  auto keep = keep_t.data_ptr<int64_t>();
+  auto order = order_t.data_ptr<int64_t>();
+
+  int64_t num_to_keep = 0;
+
+  for (int64_t _i = 0; _i < ndets; _i++) {
+    auto i = order[_i];
+    if (suppressed[i] == 1) {
+      continue;
+    }
+
+    keep[num_to_keep++] = i;
+
+    for (int64_t _j = _i + 1; _j < ndets; _j++) {
+      auto j = order[_j];
+      if (suppressed[j] == 1) {
+        continue;
+      }
+
+      auto ovr = single_box_iou_rotated<scalar_t>(
+          dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>());
+      if (ovr >= iou_threshold) {
+        suppressed[j] = 1;
+      }
+    }
+  }
+  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
+}
+
+at::Tensor nms_rotated_cpu(
+    // input must be contiguous
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold) {
+  auto result = at::empty({0}, dets.options());
+
+  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
+    result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
+  });
+  return result;
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..40977a0da1761fe807205fbcf8029d56bf75786c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
@@ -0,0 +1,139 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include "../box_iou_rotated/box_iou_rotated_utils.h"
+
+using namespace detectron2;
+
+namespace {
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+}
+
+template <typename T>
+__global__ void nms_rotated_cuda_kernel(
+    const int n_boxes,
+    const float iou_threshold,
+    const T* dev_boxes,
+    unsigned long long* dev_mask) {
+  // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel
+
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  const int row_size =
+      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  // Compared to nms_cuda_kernel, where each box is represented with 4 values
+  // (x1, y1, x2, y2), each rotated box is represented with 5 values
+  // (x_center, y_center, width, height, angle_degrees) here.
+  __shared__ T block_boxes[threadsPerBlock * 5];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 5 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+    block_boxes[threadIdx.x * 5 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+    block_boxes[threadIdx.x * 5 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+    block_boxes[threadIdx.x * 5 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+    block_boxes[threadIdx.x * 5 + 4] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const T* cur_box = dev_boxes + cur_box_idx * 5;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      // Instead of devIoU used by original horizontal nms, here
+      // we use the single_box_iou_rotated function from box_iou_rotated_utils.h
+      if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5) >
+          iou_threshold) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+namespace detectron2 {
+
+at::Tensor nms_rotated_cuda(
+    // input must be contiguous
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    float iou_threshold) {
+  // using scalar_t = float;
+  AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
+  AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
+  at::cuda::CUDAGuard device_guard(dets.device());
+
+  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+  auto dets_sorted = dets.index_select(0, order_t);
+
+  auto dets_num = dets.size(0);
+
+  const int col_blocks =
+      at::cuda::ATenCeilDiv(static_cast<int>(dets_num), threadsPerBlock);
+
+  at::Tensor mask =
+      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
+
+  dim3 blocks(col_blocks, col_blocks);
+  dim3 threads(threadsPerBlock);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES(
+      dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
+        nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+            dets_num,
+            iou_threshold,
+            dets_sorted.data_ptr<scalar_t>(),
+            (unsigned long long*)mask.data_ptr<int64_t>());
+      });
+
+  at::Tensor mask_cpu = mask.to(at::kCPU);
+  unsigned long long* mask_host =
+      (unsigned long long*)mask_cpu.data_ptr<int64_t>();
+
+  std::vector<unsigned long long> remv(col_blocks);
+  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+  at::Tensor keep =
+      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
+  int64_t* keep_out = keep.data_ptr<int64_t>();
+
+  int num_to_keep = 0;
+  for (int i = 0; i < dets_num; i++) {
+    int nblock = i / threadsPerBlock;
+    int inblock = i % threadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      keep_out[num_to_keep++] = i;
+      unsigned long long* p = mask_host + i * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+    }
+  }
+
+  AT_CUDA_CHECK(cudaGetLastError());
+  return order_t.index(
+      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
+           .to(order_t.device(), keep.scalar_type())});
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/vision.cpp b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/vision.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fa7942e881af704d33a79e8b2ecd1ac5b6f3a7ef
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/csrc/vision.cpp
@@ -0,0 +1,102 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+#include <torch/extension.h>
+#include "ROIAlign/ROIAlign.h"
+#include "ROIAlignRotated/ROIAlignRotated.h"
+#include "box_iou_rotated/box_iou_rotated.h"
+#include "deformable/deform_conv.h"
+#include "nms_rotated/nms_rotated.h"
+
+namespace detectron2 {
+
+#ifdef WITH_CUDA
+extern int get_cudart_version();
+#endif
+
+std::string get_cuda_version() {
+#ifdef WITH_CUDA
+  std::ostringstream oss;
+
+  // copied from
+  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
+  auto printCudaStyleVersion = [&](int v) {
+    oss << (v / 1000) << "." << (v / 10 % 100);
+    if (v % 10 != 0) {
+      oss << "." << (v % 10);
+    }
+  };
+  printCudaStyleVersion(get_cudart_version());
+  return oss.str();
+#else
+  return std::string("not available");
+#endif
+}
+
+// similar to
+// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
+std::string get_compiler_version() {
+  std::ostringstream ss;
+#if defined(__GNUC__)
+#ifndef __clang__
+
+#if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8))
+#error "GCC >= 4.9 is required!"
+#endif
+
+  { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
+#endif
+#endif
+
+#if defined(__clang_major__)
+  {
+    ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
+       << __clang_patchlevel__;
+  }
+#endif
+
+#if defined(_MSC_VER)
+  { ss << "MSVC " << _MSC_FULL_VER; }
+#endif
+  return ss.str();
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
+  m.def("get_cuda_version", &get_cuda_version, "get_cuda_version");
+
+  m.def("box_iou_rotated", &box_iou_rotated, "IoU for rotated boxes");
+
+  m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
+  m.def(
+      "deform_conv_backward_input",
+      &deform_conv_backward_input,
+      "deform_conv_backward_input");
+  m.def(
+      "deform_conv_backward_filter",
+      &deform_conv_backward_filter,
+      "deform_conv_backward_filter");
+  m.def(
+      "modulated_deform_conv_forward",
+      &modulated_deform_conv_forward,
+      "modulated_deform_conv_forward");
+  m.def(
+      "modulated_deform_conv_backward",
+      &modulated_deform_conv_backward,
+      "modulated_deform_conv_backward");
+
+  m.def("nms_rotated", &nms_rotated, "NMS for rotated boxes");
+
+  m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward");
+  m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward");
+
+  m.def(
+      "roi_align_rotated_forward",
+      &ROIAlignRotated_forward,
+      "Forward pass for Rotated ROI-Align Operator");
+  m.def(
+      "roi_align_rotated_backward",
+      &ROIAlignRotated_backward,
+      "Backward pass for Rotated ROI-Align Operator");
+}
+
+} // namespace detectron2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/deform_conv.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/deform_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba8c6498ffdfffa281e1f02037d40cbbb6e66164
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/deform_conv.py
@@ -0,0 +1,494 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+from functools import lru_cache
+import torch
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from detectron2 import _C
+
+from .wrappers import _NewEmptyTensorOp
+
+
+class _DeformConv(Function):
+    @staticmethod
+    def forward(
+        ctx,
+        input,
+        offset,
+        weight,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+        im2col_step=64,
+    ):
+        if input is not None and input.dim() != 4:
+            raise ValueError(
+                "Expected 4D tensor as input, got {}D tensor instead.".format(input.dim())
+            )
+        ctx.stride = _pair(stride)
+        ctx.padding = _pair(padding)
+        ctx.dilation = _pair(dilation)
+        ctx.groups = groups
+        ctx.deformable_groups = deformable_groups
+        ctx.im2col_step = im2col_step
+
+        ctx.save_for_backward(input, offset, weight)
+
+        output = input.new_empty(
+            _DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride)
+        )
+
+        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones
+
+        if not input.is_cuda:
+            raise NotImplementedError
+        else:
+            cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+            assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+            _C.deform_conv_forward(
+                input,
+                weight,
+                offset,
+                output,
+                ctx.bufs_[0],
+                ctx.bufs_[1],
+                weight.size(3),
+                weight.size(2),
+                ctx.stride[1],
+                ctx.stride[0],
+                ctx.padding[1],
+                ctx.padding[0],
+                ctx.dilation[1],
+                ctx.dilation[0],
+                ctx.groups,
+                ctx.deformable_groups,
+                cur_im2col_step,
+            )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        input, offset, weight = ctx.saved_tensors
+
+        grad_input = grad_offset = grad_weight = None
+
+        if not grad_output.is_cuda:
+            raise NotImplementedError
+        else:
+            cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+            assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
+                grad_input = torch.zeros_like(input)
+                grad_offset = torch.zeros_like(offset)
+                _C.deform_conv_backward_input(
+                    input,
+                    offset,
+                    grad_output,
+                    grad_input,
+                    grad_offset,
+                    weight,
+                    ctx.bufs_[0],
+                    weight.size(3),
+                    weight.size(2),
+                    ctx.stride[1],
+                    ctx.stride[0],
+                    ctx.padding[1],
+                    ctx.padding[0],
+                    ctx.dilation[1],
+                    ctx.dilation[0],
+                    ctx.groups,
+                    ctx.deformable_groups,
+                    cur_im2col_step,
+                )
+
+            if ctx.needs_input_grad[2]:
+                grad_weight = torch.zeros_like(weight)
+                _C.deform_conv_backward_filter(
+                    input,
+                    offset,
+                    grad_output,
+                    grad_weight,
+                    ctx.bufs_[0],
+                    ctx.bufs_[1],
+                    weight.size(3),
+                    weight.size(2),
+                    ctx.stride[1],
+                    ctx.stride[0],
+                    ctx.padding[1],
+                    ctx.padding[0],
+                    ctx.dilation[1],
+                    ctx.dilation[0],
+                    ctx.groups,
+                    ctx.deformable_groups,
+                    1,
+                    cur_im2col_step,
+                )
+
+        return grad_input, grad_offset, grad_weight, None, None, None, None, None, None
+
+    @staticmethod
+    def _output_size(input, weight, padding, dilation, stride):
+        channels = weight.size(0)
+        output_size = (input.size(0), channels)
+        for d in range(input.dim() - 2):
+            in_size = input.size(d + 2)
+            pad = padding[d]
+            kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
+            stride_ = stride[d]
+            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,)
+        if not all(map(lambda s: s > 0, output_size)):
+            raise ValueError(
+                "convolution input is too small (output would be {})".format(
+                    "x".join(map(str, output_size))
+                )
+            )
+        return output_size
+
+    @staticmethod
+    @lru_cache(maxsize=128)
+    def _cal_im2col_step(input_size, default_size):
+        """
+        Calculate proper im2col step size, which should be divisible by input_size and not larger
+        than prefer_size. Meanwhile the step size should be as large as possible to be more
+        efficient. So we choose the largest one among all divisors of input_size which are smaller
+        than prefer_size.
+        :param input_size: input batch size .
+        :param default_size: default preferred im2col step size.
+        :return: the largest proper step size.
+        """
+        if input_size <= default_size:
+            return input_size
+        best_step = 1
+        for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)):
+            if input_size % step == 0:
+                if input_size // step <= default_size:
+                    return input_size // step
+                best_step = step
+
+        return best_step
+
+
+class _ModulatedDeformConv(Function):
+    @staticmethod
+    def forward(
+        ctx,
+        input,
+        offset,
+        mask,
+        weight,
+        bias=None,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+    ):
+        ctx.stride = stride
+        ctx.padding = padding
+        ctx.dilation = dilation
+        ctx.groups = groups
+        ctx.deformable_groups = deformable_groups
+        ctx.with_bias = bias is not None
+        if not ctx.with_bias:
+            bias = input.new_empty(1)  # fake tensor
+        if not input.is_cuda:
+            raise NotImplementedError
+        if (
+            weight.requires_grad
+            or mask.requires_grad
+            or offset.requires_grad
+            or input.requires_grad
+        ):
+            ctx.save_for_backward(input, offset, mask, weight, bias)
+        output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight))
+        ctx._bufs = [input.new_empty(0), input.new_empty(0)]
+        _C.modulated_deform_conv_forward(
+            input,
+            weight,
+            bias,
+            ctx._bufs[0],
+            offset,
+            mask,
+            output,
+            ctx._bufs[1],
+            weight.shape[2],
+            weight.shape[3],
+            ctx.stride,
+            ctx.stride,
+            ctx.padding,
+            ctx.padding,
+            ctx.dilation,
+            ctx.dilation,
+            ctx.groups,
+            ctx.deformable_groups,
+            ctx.with_bias,
+        )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        if not grad_output.is_cuda:
+            raise NotImplementedError
+        input, offset, mask, weight, bias = ctx.saved_tensors
+        grad_input = torch.zeros_like(input)
+        grad_offset = torch.zeros_like(offset)
+        grad_mask = torch.zeros_like(mask)
+        grad_weight = torch.zeros_like(weight)
+        grad_bias = torch.zeros_like(bias)
+        _C.modulated_deform_conv_backward(
+            input,
+            weight,
+            bias,
+            ctx._bufs[0],
+            offset,
+            mask,
+            ctx._bufs[1],
+            grad_input,
+            grad_weight,
+            grad_bias,
+            grad_offset,
+            grad_mask,
+            grad_output,
+            weight.shape[2],
+            weight.shape[3],
+            ctx.stride,
+            ctx.stride,
+            ctx.padding,
+            ctx.padding,
+            ctx.dilation,
+            ctx.dilation,
+            ctx.groups,
+            ctx.deformable_groups,
+            ctx.with_bias,
+        )
+        if not ctx.with_bias:
+            grad_bias = None
+
+        return (
+            grad_input,
+            grad_offset,
+            grad_mask,
+            grad_weight,
+            grad_bias,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+    @staticmethod
+    def _infer_shape(ctx, input, weight):
+        n = input.size(0)
+        channels_out = weight.size(0)
+        height, width = input.shape[2:4]
+        kernel_h, kernel_w = weight.shape[2:4]
+        height_out = (
+            height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)
+        ) // ctx.stride + 1
+        width_out = (
+            width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)
+        ) // ctx.stride + 1
+        return n, channels_out, height_out, width_out
+
+
+deform_conv = _DeformConv.apply
+modulated_deform_conv = _ModulatedDeformConv.apply
+
+
+class DeformConv(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+        bias=False,
+        norm=None,
+        activation=None,
+    ):
+        """
+        Deformable convolution from :paper:`deformconv`.
+
+        Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+        Args:
+            deformable_groups (int): number of groups used in deformable convolution.
+            norm (nn.Module, optional): a normalization layer
+            activation (callable(Tensor) -> Tensor): a callable activation function
+        """
+        super(DeformConv, self).__init__()
+
+        assert not bias
+        assert in_channels % groups == 0, "in_channels {} cannot be divisible by groups {}".format(
+            in_channels, groups
+        )
+        assert (
+            out_channels % groups == 0
+        ), "out_channels {} cannot be divisible by groups {}".format(out_channels, groups)
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+        self.deformable_groups = deformable_groups
+        self.norm = norm
+        self.activation = activation
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)
+        )
+        self.bias = None
+
+        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+
+    def forward(self, x, offset):
+        if x.numel() == 0:
+            # When input is empty, we want to return a empty tensor with "correct" shape,
+            # So that the following operations will not panic
+            # if they check for the shape of the tensor.
+            # This computes the height and width of the output tensor
+            output_shape = [
+                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                for i, p, di, k, s in zip(
+                    x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                )
+            ]
+            output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
+            return _NewEmptyTensorOp.apply(x, output_shape)
+
+        x = deform_conv(
+            x,
+            offset,
+            self.weight,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.deformable_groups,
+        )
+        if self.norm is not None:
+            x = self.norm(x)
+        if self.activation is not None:
+            x = self.activation(x)
+        return x
+
+    def extra_repr(self):
+        tmpstr = "in_channels=" + str(self.in_channels)
+        tmpstr += ", out_channels=" + str(self.out_channels)
+        tmpstr += ", kernel_size=" + str(self.kernel_size)
+        tmpstr += ", stride=" + str(self.stride)
+        tmpstr += ", padding=" + str(self.padding)
+        tmpstr += ", dilation=" + str(self.dilation)
+        tmpstr += ", groups=" + str(self.groups)
+        tmpstr += ", deformable_groups=" + str(self.deformable_groups)
+        tmpstr += ", bias=False"
+        return tmpstr
+
+
+class ModulatedDeformConv(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+        bias=True,
+        norm=None,
+        activation=None,
+    ):
+        """
+        Modulated deformable convolution from :paper:`deformconv2`.
+
+        Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+        Args:
+            deformable_groups (int): number of groups used in deformable convolution.
+            norm (nn.Module, optional): a normalization layer
+            activation (callable(Tensor) -> Tensor): a callable activation function
+        """
+        super(ModulatedDeformConv, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.groups = groups
+        self.deformable_groups = deformable_groups
+        self.with_bias = bias
+        self.norm = norm
+        self.activation = activation
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+        )
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.bias = None
+
+        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0)
+
+    def forward(self, x, offset, mask):
+        if x.numel() == 0:
+            output_shape = [
+                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                for i, p, di, k, s in zip(
+                    x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                )
+            ]
+            output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
+            return _NewEmptyTensorOp.apply(x, output_shape)
+
+        x = modulated_deform_conv(
+            x,
+            offset,
+            mask,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.deformable_groups,
+        )
+        if self.norm is not None:
+            x = self.norm(x)
+        if self.activation is not None:
+            x = self.activation(x)
+        return x
+
+    def extra_repr(self):
+        tmpstr = "in_channels=" + str(self.in_channels)
+        tmpstr += ", out_channels=" + str(self.out_channels)
+        tmpstr += ", kernel_size=" + str(self.kernel_size)
+        tmpstr += ", stride=" + str(self.stride)
+        tmpstr += ", padding=" + str(self.padding)
+        tmpstr += ", dilation=" + str(self.dilation)
+        tmpstr += ", groups=" + str(self.groups)
+        tmpstr += ", deformable_groups=" + str(self.deformable_groups)
+        tmpstr += ", bias=" + str(self.with_bias)
+        return tmpstr
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/mask_ops.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/mask_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fe115dbbe15c354575c67d7d10f055eab0bdf91
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/mask_ops.py
@@ -0,0 +1,248 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import torch
+from PIL import Image
+from torch.nn import functional as F
+
+__all__ = ["paste_masks_in_image"]
+
+
+BYTES_PER_FLOAT = 4
+# TODO: This memory limit may be too much or too little. It would be better to
+# determine it based on available resources.
+GPU_MEM_LIMIT = 1024 ** 3  # 1 GB memory limit
+
+
+def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True):
+    """
+    Args:
+        masks: N, 1, H, W
+        boxes: N, 4
+        img_h, img_w (int):
+        skip_empty (bool): only paste masks within the region that
+            tightly bound all boxes, and returns the results this region only.
+            An important optimization for CPU.
+
+    Returns:
+        if skip_empty == False, a mask of shape (N, img_h, img_w)
+        if skip_empty == True, a mask of shape (N, h', w'), and the slice
+            object for the corresponding region.
+    """
+    # On GPU, paste all masks together (up to chunk size)
+    # by using the entire image to sample the masks
+    # Compared to pasting them one by one,
+    # this has more operations but is faster on COCO-scale dataset.
+    device = masks.device
+    if skip_empty:
+        x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to(
+            dtype=torch.int32
+        )
+        x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
+        y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
+    else:
+        x0_int, y0_int = 0, 0
+        x1_int, y1_int = img_w, img_h
+    x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1
+
+    N = masks.shape[0]
+
+    img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5
+    img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5
+    img_y = (img_y - y0) / (y1 - y0) * 2 - 1
+    img_x = (img_x - x0) / (x1 - x0) * 2 - 1
+    # img_x, img_y have shapes (N, w), (N, h)
+
+    gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
+    gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
+    grid = torch.stack([gx, gy], dim=3)
+
+    img_masks = F.grid_sample(masks.to(dtype=torch.float32), grid, align_corners=False)
+
+    if skip_empty:
+        return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
+    else:
+        return img_masks[:, 0], ()
+
+
+def paste_masks_in_image(masks, boxes, image_shape, threshold=0.5):
+    """
+    Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image.
+    The location, height, and width for pasting each mask is determined by their
+    corresponding bounding boxes in boxes.
+
+    Note:
+        This is a complicated but more accurate implementation. In actual deployment, it is
+        often enough to use a faster but less accurate implementation.
+        See :func:`paste_mask_in_image_old` in this file for an alternative implementation.
+
+    Args:
+        masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of
+            detected object instances in the image and Hmask, Wmask are the mask width and mask
+            height of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1].
+        boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4).
+            boxes[i] and masks[i] correspond to the same object instance.
+        image_shape (tuple): height, width
+        threshold (float): A threshold in [0, 1] for converting the (soft) masks to
+            binary masks.
+
+    Returns:
+        img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
+        number of detected object instances and Himage, Wimage are the image width
+        and height. img_masks[i] is a binary mask for object instance i.
+    """
+
+    assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported"
+    N = len(masks)
+    if N == 0:
+        return masks.new_empty((0,) + image_shape, dtype=torch.uint8)
+    if not isinstance(boxes, torch.Tensor):
+        boxes = boxes.tensor
+    device = boxes.device
+    assert len(boxes) == N, boxes.shape
+
+    img_h, img_w = image_shape
+
+    # The actual implementation split the input into chunks,
+    # and paste them chunk by chunk.
+    if device.type == "cpu":
+        # CPU is most efficient when they are pasted one by one with skip_empty=True
+        # so that it performs minimal number of operations.
+        num_chunks = N
+    else:
+        # GPU benefits from parallelism for larger chunks, but may have memory issue
+        # int(img_h) because shape may be tensors in tracing
+        num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
+        assert (
+            num_chunks <= N
+        ), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it"
+    chunks = torch.chunk(torch.arange(N, device=device), num_chunks)
+
+    img_masks = torch.zeros(
+        N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8
+    )
+    for inds in chunks:
+        masks_chunk, spatial_inds = _do_paste_mask(
+            masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu"
+        )
+
+        if threshold >= 0:
+            masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
+        else:
+            # for visualization and debugging
+            masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
+
+        img_masks[(inds,) + spatial_inds] = masks_chunk
+    return img_masks
+
+
+# The below are the original paste function (from Detectron1) which has
+# larger quantization error.
+# It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample.
+
+
+def paste_mask_in_image_old(mask, box, img_h, img_w, threshold):
+    """
+    Paste a single mask in an image.
+    This is a per-box implementation of :func:`paste_masks_in_image`.
+    This function has larger quantization error due to incorrect pixel
+    modeling and is not used any more.
+
+    Args:
+        mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single
+            object instance. Values are in [0, 1].
+        box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners
+            of the object instance.
+        img_h, img_w (int): Image height and width.
+        threshold (float): Mask binarization threshold in [0, 1].
+
+    Returns:
+        im_mask (Tensor):
+            The resized and binarized object mask pasted into the original
+            image plane (a tensor of shape (img_h, img_w)).
+    """
+    # Conversion from continuous box coordinates to discrete pixel coordinates
+    # via truncation (cast to int32). This determines which pixels to paste the
+    # mask onto.
+    box = box.to(dtype=torch.int32)  # Continuous to discrete coordinate conversion
+    # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to
+    # a discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1
+    # pixels (not x1 - x0 pixels).
+    samples_w = box[2] - box[0] + 1  # Number of pixel samples, *not* geometric width
+    samples_h = box[3] - box[1] + 1  # Number of pixel samples, *not* geometric height
+
+    # Resample the mask from it's original grid to the new samples_w x samples_h grid
+    mask = Image.fromarray(mask.cpu().numpy())
+    mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR)
+    mask = np.array(mask, copy=False)
+
+    if threshold >= 0:
+        mask = np.array(mask > threshold, dtype=np.uint8)
+        mask = torch.from_numpy(mask)
+    else:
+        # for visualization and debugging, we also
+        # allow it to return an unmodified mask
+        mask = torch.from_numpy(mask * 255).to(torch.uint8)
+
+    im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8)
+    x_0 = max(box[0], 0)
+    x_1 = min(box[2] + 1, img_w)
+    y_0 = max(box[1], 0)
+    y_1 = min(box[3] + 1, img_h)
+
+    im_mask[y_0:y_1, x_0:x_1] = mask[
+        (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
+    ]
+    return im_mask
+
+
+# Our pixel modeling requires extrapolation for any continuous
+# coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks,
+# we would like this extrapolation to be an interpolation between boundary values and zero,
+# instead of using absolute zero or boundary values.
+# Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this:
+# masks, scale = pad_masks(masks[:, 0, :, :], 1)
+# boxes = scale_boxes(boxes.tensor, scale)
+
+
+def pad_masks(masks, padding):
+    """
+    Args:
+        masks (tensor): A tensor of shape (B, M, M) representing B masks.
+        padding (int): Number of cells to pad on all sides.
+
+    Returns:
+        The padded masks and the scale factor of the padding size / original size.
+    """
+    B = masks.shape[0]
+    M = masks.shape[-1]
+    pad2 = 2 * padding
+    scale = float(M + pad2) / M
+    padded_masks = masks.new_zeros((B, M + pad2, M + pad2))
+    padded_masks[:, padding:-padding, padding:-padding] = masks
+    return padded_masks, scale
+
+
+def scale_boxes(boxes, scale):
+    """
+    Args:
+        boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4
+            coords representing the corners x0, y0, x1, y1,
+        scale (float): The box scaling factor.
+
+    Returns:
+        Scaled boxes.
+    """
+    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
+    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
+    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
+    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
+
+    w_half *= scale
+    h_half *= scale
+
+    scaled_boxes = torch.zeros_like(boxes)
+    scaled_boxes[:, 0] = x_c - w_half
+    scaled_boxes[:, 2] = x_c + w_half
+    scaled_boxes[:, 1] = y_c - h_half
+    scaled_boxes[:, 3] = y_c + h_half
+    return scaled_boxes
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/nms.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..aafe29b3aa551caeeda769dd17b8834b08c7f11c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/nms.py
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import torch
+from torchvision.ops import boxes as box_ops
+from torchvision.ops import nms  # BC-compat
+
+
+def batched_nms(boxes, scores, idxs, iou_threshold):
+    """
+    Same as torchvision.ops.boxes.batched_nms, but safer.
+    """
+    assert boxes.shape[-1] == 4
+    # TODO may need better strategy.
+    # Investigate after having a fully-cuda NMS op.
+    if len(boxes) < 40000:
+        return box_ops.batched_nms(boxes, scores, idxs, iou_threshold)
+
+    result_mask = scores.new_zeros(scores.size(), dtype=torch.bool)
+    for id in torch.unique(idxs).cpu().tolist():
+        mask = (idxs == id).nonzero().view(-1)
+        keep = nms(boxes[mask], scores[mask], iou_threshold)
+        result_mask[mask[keep]] = True
+    keep = result_mask.nonzero().view(-1)
+    keep = keep[scores[keep].argsort(descending=True)]
+    return keep
+
+
+# Note: this function (nms_rotated) might be moved into
+# torchvision/ops/boxes.py in the future
+def nms_rotated(boxes, scores, iou_threshold):
+    """
+    Performs non-maximum suppression (NMS) on the rotated boxes according
+    to their intersection-over-union (IoU).
+
+    Rotated NMS iteratively removes lower scoring rotated boxes which have an
+    IoU greater than iou_threshold with another (higher scoring) rotated box.
+
+    Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as
+    RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they
+    can be representing completely different objects in certain tasks, e.g., OCR.
+
+    As for the question of whether rotated-NMS should treat them as faraway boxes
+    even though their IOU is 1, it depends on the application and/or ground truth annotation.
+
+    As an extreme example, consider a single character v and the square box around it.
+
+    If the angle is 0 degree, the object (text) would be read as 'v';
+
+    If the angle is 90 degrees, the object (text) would become '>';
+
+    If the angle is 180 degrees, the object (text) would become '^';
+
+    If the angle is 270/-90 degrees, the object (text) would become '<'
+
+    All of these cases have IoU of 1 to each other, and rotated NMS that only
+    uses IoU as criterion would only keep one of them with the highest score -
+    which, practically, still makes sense in most cases because typically
+    only one of theses orientations is the correct one. Also, it does not matter
+    as much if the box is only used to classify the object (instead of transcribing
+    them with a sequential OCR recognition model) later.
+
+    On the other hand, when we use IoU to filter proposals that are close to the
+    ground truth during training, we should definitely take the angle into account if
+    we know the ground truth is labeled with the strictly correct orientation (as in,
+    upside-down words are annotated with -180 degrees even though they can be covered
+    with a 0/90/-90 degree box, etc.)
+
+    The way the original dataset is annotated also matters. For example, if the dataset
+    is a 4-point polygon dataset that does not enforce ordering of vertices/orientation,
+    we can estimate a minimum rotated bounding box to this polygon, but there's no way
+    we can tell the correct angle with 100% confidence (as shown above, there could be 4 different
+    rotated boxes, with angles differed by 90 degrees to each other, covering the exactly
+    same region). In that case we have to just use IoU to determine the box
+    proximity (as many detection benchmarks (even for text) do) unless there're other
+    assumptions we can make (like width is always larger than height, or the object is not
+    rotated by more than 90 degrees CCW/CW, etc.)
+
+    In summary, not considering angles in rotated NMS seems to be a good option for now,
+    but we should be aware of its implications.
+
+    Args:
+        boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in
+           (x_center, y_center, width, height, angle_degrees) format.
+        scores (Tensor[N]): Scores for each one of the rotated boxes
+        iou_threshold (float): Discards all overlapping rotated boxes with IoU < iou_threshold
+
+    Returns:
+        keep (Tensor): int64 tensor with the indices of the elements that have been kept
+        by Rotated NMS, sorted in decreasing order of scores
+    """
+    from detectron2 import _C
+
+    return _C.nms_rotated(boxes, scores, iou_threshold)
+
+
+# Note: this function (batched_nms_rotated) might be moved into
+# torchvision/ops/boxes.py in the future
+def batched_nms_rotated(boxes, scores, idxs, iou_threshold):
+    """
+    Performs non-maximum suppression in a batched fashion.
+
+    Each index value correspond to a category, and NMS
+    will not be applied between elements of different categories.
+
+    Args:
+        boxes (Tensor[N, 5]):
+           boxes where NMS will be performed. They
+           are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format
+        scores (Tensor[N]):
+           scores for each one of the boxes
+        idxs (Tensor[N]):
+           indices of the categories for each one of the boxes.
+        iou_threshold (float):
+           discards all overlapping boxes
+           with IoU < iou_threshold
+
+    Returns:
+        Tensor:
+            int64 tensor with the indices of the elements that have been kept
+            by NMS, sorted in decreasing order of scores
+    """
+    assert boxes.shape[-1] == 5
+
+    if boxes.numel() == 0:
+        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
+    # Strategy: in order to perform NMS independently per class,
+    # we add an offset to all the boxes. The offset is dependent
+    # only on the class idx, and is large enough so that boxes
+    # from different classes do not overlap
+
+    # Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate,
+    # which won't handle negative coordinates correctly.
+    # Here by using min_coordinate we can make sure the negative coordinates are
+    # correctly handled.
+    max_coordinate = (
+        torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2
+    ).max()
+    min_coordinate = (
+        torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2
+    ).min()
+    offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1)
+    boxes_for_nms = boxes.clone()  # avoid modifying the original values in boxes
+    boxes_for_nms[:, :2] += offsets[:, None]
+    keep = nms_rotated(boxes_for_nms, scores, iou_threshold)
+    return keep
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/roi_align.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/roi_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8c4ce1d747ec77329fab34436f5efa0e958ef32
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/roi_align.py
@@ -0,0 +1,105 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from detectron2 import _C
+
+
+class _ROIAlign(Function):
+    @staticmethod
+    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio, aligned):
+        ctx.save_for_backward(roi)
+        ctx.output_size = _pair(output_size)
+        ctx.spatial_scale = spatial_scale
+        ctx.sampling_ratio = sampling_ratio
+        ctx.input_shape = input.size()
+        ctx.aligned = aligned
+        output = _C.roi_align_forward(
+            input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned
+        )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        (rois,) = ctx.saved_tensors
+        output_size = ctx.output_size
+        spatial_scale = ctx.spatial_scale
+        sampling_ratio = ctx.sampling_ratio
+        bs, ch, h, w = ctx.input_shape
+        grad_input = _C.roi_align_backward(
+            grad_output,
+            rois,
+            spatial_scale,
+            output_size[0],
+            output_size[1],
+            bs,
+            ch,
+            h,
+            w,
+            sampling_ratio,
+            ctx.aligned,
+        )
+        return grad_input, None, None, None, None, None
+
+
+roi_align = _ROIAlign.apply
+
+
+class ROIAlign(nn.Module):
+    def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True):
+        """
+        Args:
+            output_size (tuple): h, w
+            spatial_scale (float): scale the input boxes by this number
+            sampling_ratio (int): number of inputs samples to take for each output
+                sample. 0 to take samples densely.
+            aligned (bool): if False, use the legacy implementation in
+                Detectron. If True, align the results more perfectly.
+
+        Note:
+            The meaning of aligned=True:
+
+            Given a continuous coordinate c, its two neighboring pixel indices (in our
+            pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
+            c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
+            from the underlying signal at continuous coordinates 0.5 and 1.5). But the original
+            roi_align (aligned=False) does not subtract the 0.5 when computing neighboring
+            pixel indices and therefore it uses pixels with a slightly incorrect alignment
+            (relative to our pixel model) when performing bilinear interpolation.
+
+            With `aligned=True`,
+            we first appropriately scale the ROI and then shift it by -0.5
+            prior to calling roi_align. This produces the correct neighbors; see
+            detectron2/tests/test_roi_align.py for verification.
+
+            The difference does not make a difference to the model's performance if
+            ROIAlign is used together with conv layers.
+        """
+        super(ROIAlign, self).__init__()
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+        self.sampling_ratio = sampling_ratio
+        self.aligned = aligned
+
+    def forward(self, input, rois):
+        """
+        Args:
+            input: NCHW images
+            rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
+        """
+        assert rois.dim() == 2 and rois.size(1) == 5
+        return roi_align(
+            input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned
+        )
+
+    def __repr__(self):
+        tmpstr = self.__class__.__name__ + "("
+        tmpstr += "output_size=" + str(self.output_size)
+        tmpstr += ", spatial_scale=" + str(self.spatial_scale)
+        tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
+        tmpstr += ", aligned=" + str(self.aligned)
+        tmpstr += ")"
+        return tmpstr
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/roi_align_rotated.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/roi_align_rotated.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ed87e69d5e738f8dbaa7c73c5c8de65343de0fd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/roi_align_rotated.py
@@ -0,0 +1,88 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from detectron2 import _C
+
+
+class _ROIAlignRotated(Function):
+    @staticmethod
+    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
+        ctx.save_for_backward(roi)
+        ctx.output_size = _pair(output_size)
+        ctx.spatial_scale = spatial_scale
+        ctx.sampling_ratio = sampling_ratio
+        ctx.input_shape = input.size()
+        output = _C.roi_align_rotated_forward(
+            input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio
+        )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        (rois,) = ctx.saved_tensors
+        output_size = ctx.output_size
+        spatial_scale = ctx.spatial_scale
+        sampling_ratio = ctx.sampling_ratio
+        bs, ch, h, w = ctx.input_shape
+        grad_input = _C.roi_align_rotated_backward(
+            grad_output,
+            rois,
+            spatial_scale,
+            output_size[0],
+            output_size[1],
+            bs,
+            ch,
+            h,
+            w,
+            sampling_ratio,
+        )
+        return grad_input, None, None, None, None, None
+
+
+roi_align_rotated = _ROIAlignRotated.apply
+
+
+class ROIAlignRotated(nn.Module):
+    def __init__(self, output_size, spatial_scale, sampling_ratio):
+        """
+        Args:
+            output_size (tuple): h, w
+            spatial_scale (float): scale the input boxes by this number
+            sampling_ratio (int): number of inputs samples to take for each output
+                sample. 0 to take samples densely.
+
+        Note:
+            ROIAlignRotated supports continuous coordinate by default:
+            Given a continuous coordinate c, its two neighboring pixel indices (in our
+            pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
+            c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
+            from the underlying signal at continuous coordinates 0.5 and 1.5).
+        """
+        super(ROIAlignRotated, self).__init__()
+        self.output_size = output_size
+        self.spatial_scale = spatial_scale
+        self.sampling_ratio = sampling_ratio
+
+    def forward(self, input, rois):
+        """
+        Args:
+            input: NCHW images
+            rois: Bx6 boxes. First column is the index into N.
+                The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees).
+        """
+        assert rois.dim() == 2 and rois.size(1) == 6
+        return roi_align_rotated(
+            input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
+        )
+
+    def __repr__(self):
+        tmpstr = self.__class__.__name__ + "("
+        tmpstr += "output_size=" + str(self.output_size)
+        tmpstr += ", spatial_scale=" + str(self.spatial_scale)
+        tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
+        tmpstr += ")"
+        return tmpstr
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/rotated_boxes.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/rotated_boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea9b08583da79aae871b500bcffc19f8a352da6e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/rotated_boxes.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+from detectron2 import _C
+
+
+def pairwise_iou_rotated(boxes1, boxes2):
+    """
+    Return intersection-over-union (Jaccard index) of boxes.
+
+    Both sets of boxes are expected to be in
+    (x_center, y_center, width, height, angle) format.
+
+    Arguments:
+        boxes1 (Tensor[N, 5])
+        boxes2 (Tensor[M, 5])
+
+    Returns:
+        iou (Tensor[N, M]): the NxM matrix containing the pairwise
+            IoU values for every element in boxes1 and boxes2
+    """
+    return _C.box_iou_rotated(boxes1, boxes2)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/shape_spec.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/shape_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed7f0d08268a2342cfb8246cc032686f2343ef8f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/shape_spec.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from collections import namedtuple
+
+
+class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])):
+    """
+    A simple structure that contains basic shape specification about a tensor.
+    It is often used as the auxiliary inputs/outputs of models,
+    to obtain the shape inference ability among pytorch modules.
+
+    Attributes:
+        channels:
+        height:
+        width:
+        stride:
+    """
+
+    def __new__(cls, *, channels=None, height=None, width=None, stride=None):
+        return super().__new__(cls, channels, height, width, stride)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/wrappers.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e3935e90c61f02e000568af79ed458dd491fed7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/layers/wrappers.py
@@ -0,0 +1,215 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Wrappers around on some nn functions, mainly to support empty tensors.
+
+Ideally, add support directly in PyTorch to empty tensors in those functions.
+
+These can be removed once https://github.com/pytorch/pytorch/issues/12013
+is implemented
+"""
+
+import math
+import torch
+from torch.nn.modules.utils import _ntuple
+
+TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2])
+
+
+def cat(tensors, dim=0):
+    """
+    Efficient version of torch.cat that avoids a copy if there is only a single element in a list
+    """
+    assert isinstance(tensors, (list, tuple))
+    if len(tensors) == 1:
+        return tensors[0]
+    return torch.cat(tensors, dim)
+
+
+class _NewEmptyTensorOp(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, new_shape):
+        ctx.shape = x.shape
+        return x.new_empty(new_shape)
+
+    @staticmethod
+    def backward(ctx, grad):
+        shape = ctx.shape
+        return _NewEmptyTensorOp.apply(grad, shape), None
+
+
+class Conv2d(torch.nn.Conv2d):
+    """
+    A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:
+
+        Args:
+            norm (nn.Module, optional): a normalization layer
+            activation (callable(Tensor) -> Tensor): a callable activation function
+
+        It assumes that norm layer is used before activation.
+        """
+        norm = kwargs.pop("norm", None)
+        activation = kwargs.pop("activation", None)
+        super().__init__(*args, **kwargs)
+
+        self.norm = norm
+        self.activation = activation
+
+    def forward(self, x):
+        if x.numel() == 0 and self.training:
+            # https://github.com/pytorch/pytorch/issues/12013
+            assert not isinstance(
+                self.norm, torch.nn.SyncBatchNorm
+            ), "SyncBatchNorm does not support empty inputs!"
+
+        if x.numel() == 0 and TORCH_VERSION <= (1, 4):
+            assert not isinstance(
+                self.norm, torch.nn.GroupNorm
+            ), "GroupNorm does not support empty inputs in PyTorch <=1.4!"
+            # When input is empty, we want to return a empty tensor with "correct" shape,
+            # So that the following operations will not panic
+            # if they check for the shape of the tensor.
+            # This computes the height and width of the output tensor
+            output_shape = [
+                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                for i, p, di, k, s in zip(
+                    x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                )
+            ]
+            output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
+            empty = _NewEmptyTensorOp.apply(x, output_shape)
+            if self.training:
+                # This is to make DDP happy.
+                # DDP expects all workers to have gradient w.r.t the same set of parameters.
+                _dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+                return empty + _dummy
+            else:
+                return empty
+
+        x = super().forward(x)
+        if self.norm is not None:
+            x = self.norm(x)
+        if self.activation is not None:
+            x = self.activation(x)
+        return x
+
+
+if TORCH_VERSION > (1, 4):
+    ConvTranspose2d = torch.nn.ConvTranspose2d
+else:
+
+    class ConvTranspose2d(torch.nn.ConvTranspose2d):
+        """
+        A wrapper around :class:`torch.nn.ConvTranspose2d` to support zero-size tensor.
+        """
+
+        def forward(self, x):
+            if x.numel() > 0:
+                return super(ConvTranspose2d, self).forward(x)
+            # get output shape
+
+            # When input is empty, we want to return a empty tensor with "correct" shape,
+            # So that the following operations will not panic
+            # if they check for the shape of the tensor.
+            # This computes the height and width of the output tensor
+            output_shape = [
+                (i - 1) * d - 2 * p + (di * (k - 1) + 1) + op
+                for i, p, di, k, d, op in zip(
+                    x.shape[-2:],
+                    self.padding,
+                    self.dilation,
+                    self.kernel_size,
+                    self.stride,
+                    self.output_padding,
+                )
+            ]
+            output_shape = [x.shape[0], self.out_channels] + output_shape
+            # This is to make DDP happy.
+            # DDP expects all workers to have gradient w.r.t the same set of parameters.
+            _dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+            return _NewEmptyTensorOp.apply(x, output_shape) + _dummy
+
+
+if TORCH_VERSION > (1, 4):
+    BatchNorm2d = torch.nn.BatchNorm2d
+else:
+
+    class BatchNorm2d(torch.nn.BatchNorm2d):
+        """
+        A wrapper around :class:`torch.nn.BatchNorm2d` to support zero-size tensor.
+        """
+
+        def forward(self, x):
+            if x.numel() > 0:
+                return super(BatchNorm2d, self).forward(x)
+            # get output shape
+            output_shape = x.shape
+            return _NewEmptyTensorOp.apply(x, output_shape)
+
+
+if TORCH_VERSION > (1, 5):
+    Linear = torch.nn.Linear
+else:
+
+    class Linear(torch.nn.Linear):
+        """
+        A wrapper around :class:`torch.nn.Linear` to support empty inputs and more features.
+        Because of https://github.com/pytorch/pytorch/issues/34202
+        """
+
+        def forward(self, x):
+            if x.numel() == 0:
+                output_shape = [x.shape[0], self.weight.shape[0]]
+
+                empty = _NewEmptyTensorOp.apply(x, output_shape)
+                if self.training:
+                    # This is to make DDP happy.
+                    # DDP expects all workers to have gradient w.r.t the same set of parameters.
+                    _dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+                    return empty + _dummy
+                else:
+                    return empty
+
+            x = super().forward(x)
+            return x
+
+
+def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+    """
+    A wrapper around :func:`torch.nn.functional.interpolate` to support zero-size tensor.
+    """
+    if TORCH_VERSION > (1, 4) or input.numel() > 0:
+        return torch.nn.functional.interpolate(
+            input, size, scale_factor, mode, align_corners=align_corners
+        )
+
+    def _check_size_scale_factor(dim):
+        if size is None and scale_factor is None:
+            raise ValueError("either size or scale_factor should be defined")
+        if size is not None and scale_factor is not None:
+            raise ValueError("only one of size or scale_factor should be defined")
+        if (
+            scale_factor is not None
+            and isinstance(scale_factor, tuple)
+            and len(scale_factor) != dim
+        ):
+            raise ValueError(
+                "scale_factor shape must match input shape. "
+                "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor))
+            )
+
+    def _output_size(dim):
+        _check_size_scale_factor(dim)
+        if size is not None:
+            return size
+        scale_factors = _ntuple(dim)(scale_factor)
+        # math.floor might return float in py2.7
+        return [int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim)]
+
+    output_shape = tuple(_output_size(2))
+    output_shape = input.shape[:-2] + output_shape
+    return _NewEmptyTensorOp.apply(input, output_shape)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/model_zoo/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/model_zoo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..886616f8e11ef31ea85d7a7ba9a75308befceedf
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/model_zoo/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Model Zoo API for Detectron2: a collection of functions to create common model architectures and
+optionally load pre-trained weights as released in
+`MODEL_ZOO.md <https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md>`_.
+"""
+from .model_zoo import get, get_config_file, get_checkpoint_url
+
+__all__ = ["get_checkpoint_url", "get", "get_config_file"]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/model_zoo/model_zoo.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/model_zoo/model_zoo.py
new file mode 100644
index 0000000000000000000000000000000000000000..68d0ce5dc442864474bb1086bf04d6e40708c190
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/model_zoo/model_zoo.py
@@ -0,0 +1,150 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import os
+import pkg_resources
+import torch
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.modeling import build_model
+
+
+class _ModelZooUrls(object):
+    """
+    Mapping from names to officially released Detectron2 pre-trained models.
+    """
+
+    S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
+
+    # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl
+    CONFIG_PATH_TO_URL_SUFFIX = {
+        # COCO Detection with Faster R-CNN
+        "COCO-Detection/faster_rcnn_R_50_C4_1x.yaml": "137257644/model_final_721ade.pkl",
+        "COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml": "137847829/model_final_51d356.pkl",
+        "COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml": "137257794/model_final_b275ba.pkl",
+        "COCO-Detection/faster_rcnn_R_50_C4_3x.yaml": "137849393/model_final_f97cb7.pkl",
+        "COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml": "137849425/model_final_68d202.pkl",
+        "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml": "137849458/model_final_280758.pkl",
+        "COCO-Detection/faster_rcnn_R_101_C4_3x.yaml": "138204752/model_final_298dad.pkl",
+        "COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml": "138204841/model_final_3e0943.pkl",
+        "COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml": "137851257/model_final_f6e8b1.pkl",
+        "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml": "139173657/model_final_68b088.pkl",
+        # COCO Detection with RetinaNet
+        "COCO-Detection/retinanet_R_50_FPN_1x.yaml": "137593951/model_final_b796dc.pkl",
+        "COCO-Detection/retinanet_R_50_FPN_3x.yaml": "137849486/model_final_4cafe0.pkl",
+        "COCO-Detection/retinanet_R_101_FPN_3x.yaml": "138363263/model_final_59f53c.pkl",
+        # COCO Detection with RPN and Fast R-CNN
+        "COCO-Detection/rpn_R_50_C4_1x.yaml": "137258005/model_final_450694.pkl",
+        "COCO-Detection/rpn_R_50_FPN_1x.yaml": "137258492/model_final_02ce48.pkl",
+        "COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml": "137635226/model_final_e5f7ce.pkl",
+        # COCO Instance Segmentation Baselines with Mask R-CNN
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml": "137259246/model_final_9243eb.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml": "137260150/model_final_4f86c3.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "137260431/model_final_a54504.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml": "137849525/model_final_4ce675.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml": "137849551/model_final_84107b.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml": "137849600/model_final_f10217.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml": "138363239/model_final_a2914c.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml": "138363294/model_final_0464b7.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml": "138205316/model_final_a3ec72.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml": "139653917/model_final_2d9806.pkl",  # noqa
+        # COCO Person Keypoint Detection Baselines with Keypoint R-CNN
+        "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml": "137261548/model_final_04e291.pkl",
+        "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml": "137849621/model_final_a6e10b.pkl",
+        "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml": "138363331/model_final_997cc7.pkl",
+        "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml": "139686956/model_final_5ad38f.pkl",
+        # COCO Panoptic Segmentation Baselines with Panoptic FPN
+        "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml": "139514544/model_final_dbfeb4.pkl",
+        "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml": "139514569/model_final_c10459.pkl",
+        "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml": "139514519/model_final_cafdb1.pkl",
+        # LVIS Instance Segmentation Baselines with Mask R-CNN
+        "LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "144219072/model_final_571f7c.pkl",
+        "LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml": "144219035/model_final_824ab5.pkl",
+        "LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml": "144219108/model_final_5e3439.pkl",  # noqa
+        # Cityscapes & Pascal VOC Baselines
+        "Cityscapes/mask_rcnn_R_50_FPN.yaml": "142423278/model_final_af9cf5.pkl",
+        "PascalVOC-Detection/faster_rcnn_R_50_C4.yaml": "142202221/model_final_b1acc2.pkl",
+        # Other Settings
+        "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml": "138602867/model_final_65c703.pkl",
+        "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml": "144998336/model_final_821d0b.pkl",
+        "Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml": "138602847/model_final_e9d89b.pkl",
+        "Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml": "144998488/model_final_480dd8.pkl",
+        "Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml": "169527823/model_final_3b3c51.pkl",
+        "Misc/mask_rcnn_R_50_FPN_3x_gn.yaml": "138602888/model_final_dc5d9e.pkl",
+        "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml": "138602908/model_final_01ca85.pkl",
+        "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml": "139797668/model_final_be35db.pkl",
+        "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml": "18131413/model_0039999_e76410.pkl",  # noqa
+        # D1 Comparisons
+        "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml": "137781054/model_final_7ab50c.pkl",  # noqa
+        "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml": "137781281/model_final_62ca52.pkl",  # noqa
+        "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml": "137781195/model_final_cce136.pkl",
+    }
+
+
+def get_checkpoint_url(config_path):
+    """
+    Returns the URL to the model trained using the given config
+
+    Args:
+        config_path (str): config file name relative to detectron2's "configs/"
+            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
+
+    Returns:
+        str: a URL to the model
+    """
+    name = config_path.replace(".yaml", "")
+    if config_path in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX:
+        suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[config_path]
+        return _ModelZooUrls.S3_PREFIX + name + "/" + suffix
+    raise RuntimeError("{} not available in Model Zoo!".format(name))
+
+
+def get_config_file(config_path):
+    """
+    Returns path to a builtin config file.
+
+    Args:
+        config_path (str): config file name relative to detectron2's "configs/"
+            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
+
+    Returns:
+        str: the real path to the config file.
+    """
+    cfg_file = pkg_resources.resource_filename(
+        "detectron2.model_zoo", os.path.join("configs", config_path)
+    )
+    if not os.path.exists(cfg_file):
+        raise RuntimeError("{} not available in Model Zoo!".format(config_path))
+    return cfg_file
+
+
+def get(config_path, trained: bool = False):
+    """
+    Get a model specified by relative path under Detectron2's official ``configs/`` directory.
+
+    Args:
+        config_path (str): config file name relative to detectron2's "configs/"
+            directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
+        trained (bool): If True, will initialize the model with the trained model zoo weights.
+            If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used
+            instead; this will typically (though not always) initialize a subset of weights using
+            an ImageNet pre-trained model, while randomly initializing the other weights.
+
+    Example:
+
+    .. code-block:: python
+
+        from detectron2 import model_zoo
+        model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
+    """
+    cfg_file = get_config_file(config_path)
+
+    cfg = get_cfg()
+    cfg.merge_from_file(cfg_file)
+    if trained:
+        cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path)
+    if not torch.cuda.is_available():
+        cfg.MODEL.DEVICE = "cpu"
+
+    model = build_model(cfg)
+    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
+    return model
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e23fe4a7037c8ece8f4c553b4cfda1631b79c9c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/__init__.py
@@ -0,0 +1,56 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+
+from detectron2.layers import ShapeSpec
+
+from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY
+from .backbone import (
+    BACKBONE_REGISTRY,
+    FPN,
+    Backbone,
+    ResNet,
+    ResNetBlockBase,
+    build_backbone,
+    build_resnet_backbone,
+    make_stage,
+)
+from .meta_arch import (
+    META_ARCH_REGISTRY,
+    SEM_SEG_HEADS_REGISTRY,
+    GeneralizedRCNN,
+    PanopticFPN,
+    ProposalNetwork,
+    RetinaNet,
+    SemanticSegmentor,
+    build_model,
+    build_sem_seg_head,
+)
+from .postprocessing import detector_postprocess
+from .proposal_generator import (
+    PROPOSAL_GENERATOR_REGISTRY,
+    build_proposal_generator,
+    RPN_HEAD_REGISTRY,
+    build_rpn_head,
+)
+from .roi_heads import (
+    ROI_BOX_HEAD_REGISTRY,
+    ROI_HEADS_REGISTRY,
+    ROI_KEYPOINT_HEAD_REGISTRY,
+    ROI_MASK_HEAD_REGISTRY,
+    ROIHeads,
+    StandardROIHeads,
+    BaseMaskRCNNHead,
+    BaseKeypointRCNNHead,
+    build_box_head,
+    build_keypoint_head,
+    build_mask_head,
+    build_roi_heads,
+)
+from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA
+
+_EXCLUDE = {"torch", "ShapeSpec"}
+__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]
+
+assert (
+    torch.Tensor([1]) == torch.Tensor([2])
+).dtype == torch.bool, "Your Pytorch is too old. Please update to contain https://github.com/pytorch/pytorch/pull/21113"
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/anchor_generator.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/anchor_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..93927bc1c16106710bc1ca1da4d186f7710e1606
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/anchor_generator.py
@@ -0,0 +1,382 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+from typing import List
+import torch
+from torch import nn
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec
+from detectron2.structures import Boxes, RotatedBoxes
+from detectron2.utils.registry import Registry
+
+ANCHOR_GENERATOR_REGISTRY = Registry("ANCHOR_GENERATOR")
+ANCHOR_GENERATOR_REGISTRY.__doc__ = """
+Registry for modules that creates object detection anchors for feature maps.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+class BufferList(nn.Module):
+    """
+    Similar to nn.ParameterList, but for buffers
+    """
+
+    def __init__(self, buffers=None):
+        super(BufferList, self).__init__()
+        if buffers is not None:
+            self.extend(buffers)
+
+    def extend(self, buffers):
+        offset = len(self)
+        for i, buffer in enumerate(buffers):
+            self.register_buffer(str(offset + i), buffer)
+        return self
+
+    def __len__(self):
+        return len(self._buffers)
+
+    def __iter__(self):
+        return iter(self._buffers.values())
+
+
+def _create_grid_offsets(size: List[int], stride: int, offset: float, device: torch.device):
+    grid_height, grid_width = size
+    shifts_x = torch.arange(
+        offset * stride, grid_width * stride, step=stride, dtype=torch.float32, device=device
+    )
+    shifts_y = torch.arange(
+        offset * stride, grid_height * stride, step=stride, dtype=torch.float32, device=device
+    )
+
+    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
+    shift_x = shift_x.reshape(-1)
+    shift_y = shift_y.reshape(-1)
+    return shift_x, shift_y
+
+
+def _broadcast_params(params, num_features, name):
+    """
+    If one size (or aspect ratio) is specified and there are multiple feature
+    maps, we "broadcast" anchors of that single size (or aspect ratio)
+    over all feature maps.
+
+    If params is list[float], or list[list[float]] with len(params) == 1, repeat
+    it num_features time.
+
+    Returns:
+        list[list[float]]: param for each feature
+    """
+    assert isinstance(
+        params, (list, tuple)
+    ), f"{name} in anchor generator has to be a list! Got {params}."
+    assert len(params), f"{name} in anchor generator cannot be empty!"
+    if not isinstance(params[0], (list, tuple)):  # list[float]
+        return [params] * num_features
+    if len(params) == 1:
+        return list(params) * num_features
+    assert len(params) == num_features, (
+        f"Got {name} of length {len(params)} in anchor generator, "
+        f"but the number of input features is {num_features}!"
+    )
+    return params
+
+
+@ANCHOR_GENERATOR_REGISTRY.register()
+class DefaultAnchorGenerator(nn.Module):
+    """
+    Compute anchors in the standard ways described in
+    "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks".
+    """
+
+    box_dim: int = 4
+    """
+    the dimension of each anchor box.
+    """
+
+    @configurable
+    def __init__(self, *, sizes, aspect_ratios, strides, offset=0.5):
+        """
+        This interface is experimental.
+
+        Args:
+            sizes (list[list[float]] or list[float]):
+                If sizes is list[list[float]], sizes[i] is the list of anchor sizes
+                (i.e. sqrt of anchor area) to use for the i-th feature map.
+                If sizes is list[float], the sizes are used for all feature maps.
+                Anchor sizes are given in absolute lengths in units of
+                the input image; they do not dynamically scale if the input image size changes.
+            aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
+                (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
+            strides (list[int]): stride of each input feature.
+            offset (float): Relative offset between the center of the first anchor and the top-left
+                corner of the image. Value has to be in [0, 1).
+                Recommend to use 0.5, which means half stride.
+        """
+        super().__init__()
+
+        self.strides = strides
+        self.num_features = len(self.strides)
+        sizes = _broadcast_params(sizes, self.num_features, "sizes")
+        aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
+        self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios)
+
+        self.offset = offset
+        assert 0.0 <= self.offset < 1.0, self.offset
+
+    @classmethod
+    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
+        return {
+            "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
+            "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
+            "strides": [x.stride for x in input_shape],
+            "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
+        }
+
+    def _calculate_anchors(self, sizes, aspect_ratios):
+        cell_anchors = [
+            self.generate_cell_anchors(s, a).float() for s, a in zip(sizes, aspect_ratios)
+        ]
+        return BufferList(cell_anchors)
+
+    @property
+    def num_cell_anchors(self):
+        """
+        Alias of `num_anchors`.
+        """
+        return self.num_anchors
+
+    @property
+    def num_anchors(self):
+        """
+        Returns:
+            list[int]: Each int is the number of anchors at every pixel
+                location, on that feature map.
+                For example, if at every pixel we use anchors of 3 aspect
+                ratios and 5 sizes, the number of anchors is 15.
+                (See also ANCHOR_GENERATOR.SIZES and ANCHOR_GENERATOR.ASPECT_RATIOS in config)
+
+                In standard RPN models, `num_anchors` on every feature map is the same.
+        """
+        return [len(cell_anchors) for cell_anchors in self.cell_anchors]
+
+    def _grid_anchors(self, grid_sizes: List[List[int]]):
+        """
+        Returns:
+            list[Tensor]: #featuremap tensors, each is (#locations x #cell_anchors) x 4
+        """
+        anchors = []
+        for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
+            shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors.device)
+            shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
+
+            anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4))
+
+        return anchors
+
+    def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
+        """
+        Generate a tensor storing canonical anchor boxes, which are all anchor
+        boxes of different sizes and aspect_ratios centered at (0, 0).
+        We can later build the set of anchors for a full feature map by
+        shifting and tiling these tensors (see `meth:_grid_anchors`).
+
+        Args:
+            sizes (tuple[float]):
+            aspect_ratios (tuple[float]]):
+
+        Returns:
+            Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes
+                in XYXY format.
+        """
+
+        # This is different from the anchor generator defined in the original Faster R-CNN
+        # code or Detectron. They yield the same AP, however the old version defines cell
+        # anchors in a less natural way with a shift relative to the feature grid and
+        # quantization that results in slightly different sizes for different aspect ratios.
+        # See also https://github.com/facebookresearch/Detectron/issues/227
+
+        anchors = []
+        for size in sizes:
+            area = size ** 2.0
+            for aspect_ratio in aspect_ratios:
+                # s * s = w * h
+                # a = h / w
+                # ... some algebra ...
+                # w = sqrt(s * s / a)
+                # h = a * w
+                w = math.sqrt(area / aspect_ratio)
+                h = aspect_ratio * w
+                x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0
+                anchors.append([x0, y0, x1, y1])
+        return torch.tensor(anchors)
+
+    def forward(self, features):
+        """
+        Args:
+            features (list[Tensor]): list of backbone feature maps on which to generate anchors.
+
+        Returns:
+            list[Boxes]: a list of Boxes containing all the anchors for each feature map
+                (i.e. the cell anchors repeated over all locations in the feature map).
+                The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
+                where Hi, Wi are resolution of the feature map divided by anchor stride.
+        """
+        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
+        anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
+        return [Boxes(x) for x in anchors_over_all_feature_maps]
+
+
+@ANCHOR_GENERATOR_REGISTRY.register()
+class RotatedAnchorGenerator(nn.Module):
+    """
+    Compute rotated anchors used by Rotated RPN (RRPN), described in
+    "Arbitrary-Oriented Scene Text Detection via Rotation Proposals".
+    """
+
+    box_dim: int = 5
+    """
+    the dimension of each anchor box.
+    """
+
+    @configurable
+    def __init__(self, *, sizes, aspect_ratios, strides, angles, offset=0.5):
+        """
+        This interface is experimental.
+
+        Args:
+            sizes (list[list[float]] or list[float]):
+                If sizes is list[list[float]], sizes[i] is the list of anchor sizes
+                (i.e. sqrt of anchor area) to use for the i-th feature map.
+                If sizes is list[float], the sizes are used for all feature maps.
+                Anchor sizes are given in absolute lengths in units of
+                the input image; they do not dynamically scale if the input image size changes.
+            aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
+                (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
+            strides (list[int]): stride of each input feature.
+            angles (list[list[float]] or list[float]): list of angles (in degrees CCW)
+                to use for anchors. Same "broadcast" rule for `sizes` applies.
+            offset (float): Relative offset between the center of the first anchor and the top-left
+                corner of the image. Value has to be in [0, 1).
+                Recommend to use 0.5, which means half stride.
+        """
+        super().__init__()
+
+        self.strides = strides
+        self.num_features = len(self.strides)
+        sizes = _broadcast_params(sizes, self.num_features, "sizes")
+        aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
+        angles = _broadcast_params(angles, self.num_features, "angles")
+        self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios, angles)
+
+        self.offset = offset
+        assert 0.0 <= self.offset < 1.0, self.offset
+
+    @classmethod
+    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
+        return {
+            "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
+            "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
+            "strides": [x.stride for x in input_shape],
+            "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
+            "angles": cfg.MODEL.ANCHOR_GENERATOR.ANGLES,
+        }
+
+    def _calculate_anchors(self, sizes, aspect_ratios, angles):
+        cell_anchors = [
+            self.generate_cell_anchors(size, aspect_ratio, angle).float()
+            for size, aspect_ratio, angle in zip(sizes, aspect_ratios, angles)
+        ]
+        return BufferList(cell_anchors)
+
+    @property
+    def num_cell_anchors(self):
+        """
+        Alias of `num_anchors`.
+        """
+        return self.num_anchors
+
+    @property
+    def num_anchors(self):
+        """
+        Returns:
+            list[int]: Each int is the number of anchors at every pixel
+                location, on that feature map.
+                For example, if at every pixel we use anchors of 3 aspect
+                ratios, 2 sizes and 5 angles, the number of anchors is 30.
+                (See also ANCHOR_GENERATOR.SIZES, ANCHOR_GENERATOR.ASPECT_RATIOS
+                and ANCHOR_GENERATOR.ANGLES in config)
+
+                In standard RRPN models, `num_anchors` on every feature map is the same.
+        """
+        return [len(cell_anchors) for cell_anchors in self.cell_anchors]
+
+    def _grid_anchors(self, grid_sizes):
+        anchors = []
+        for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
+            shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors.device)
+            zeros = torch.zeros_like(shift_x)
+            shifts = torch.stack((shift_x, shift_y, zeros, zeros, zeros), dim=1)
+
+            anchors.append((shifts.view(-1, 1, 5) + base_anchors.view(1, -1, 5)).reshape(-1, 5))
+
+        return anchors
+
+    def generate_cell_anchors(
+        self,
+        sizes=(32, 64, 128, 256, 512),
+        aspect_ratios=(0.5, 1, 2),
+        angles=(-90, -60, -30, 0, 30, 60, 90),
+    ):
+        """
+        Generate a tensor storing canonical anchor boxes, which are all anchor
+        boxes of different sizes, aspect_ratios, angles centered at (0, 0).
+        We can later build the set of anchors for a full feature map by
+        shifting and tiling these tensors (see `meth:_grid_anchors`).
+
+        Args:
+            sizes (tuple[float]):
+            aspect_ratios (tuple[float]]):
+            angles (tuple[float]]):
+
+        Returns:
+            Tensor of shape (len(sizes) * len(aspect_ratios) * len(angles), 5)
+                storing anchor boxes in (x_ctr, y_ctr, w, h, angle) format.
+        """
+        anchors = []
+        for size in sizes:
+            area = size ** 2.0
+            for aspect_ratio in aspect_ratios:
+                # s * s = w * h
+                # a = h / w
+                # ... some algebra ...
+                # w = sqrt(s * s / a)
+                # h = a * w
+                w = math.sqrt(area / aspect_ratio)
+                h = aspect_ratio * w
+                anchors.extend([0, 0, w, h, a] for a in angles)
+
+        return torch.tensor(anchors)
+
+    def forward(self, features):
+        """
+        Args:
+            features (list[Tensor]): list of backbone feature maps on which to generate anchors.
+
+        Returns:
+            list[RotatedBoxes]: a list of Boxes containing all the anchors for each feature map
+                (i.e. the cell anchors repeated over all locations in the feature map).
+                The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
+                where Hi, Wi are resolution of the feature map divided by anchor stride.
+        """
+        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
+        anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
+        return [RotatedBoxes(x) for x in anchors_over_all_feature_maps]
+
+
+def build_anchor_generator(cfg, input_shape):
+    """
+    Built an anchor generator from `cfg.MODEL.ANCHOR_GENERATOR.NAME`.
+    """
+    anchor_generator = cfg.MODEL.ANCHOR_GENERATOR.NAME
+    return ANCHOR_GENERATOR_REGISTRY.get(anchor_generator)(cfg, input_shape)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d477fb1e596f77b4c24f2b2c66b528bf2f83b00e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .build import build_backbone, BACKBONE_REGISTRY  # noqa F401 isort:skip
+
+from .backbone import Backbone
+from .fpn import FPN
+from .resnet import ResNet, ResNetBlockBase, build_resnet_backbone, make_stage
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
+# TODO can expose more resnet blocks after careful consideration
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/backbone.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..66dee4a6565e6c45ed17d0880fcc37eac8f75c3a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/backbone.py
@@ -0,0 +1,53 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from abc import ABCMeta, abstractmethod
+import torch.nn as nn
+
+from detectron2.layers import ShapeSpec
+
+__all__ = ["Backbone"]
+
+
+class Backbone(nn.Module, metaclass=ABCMeta):
+    """
+    Abstract base class for network backbones.
+    """
+
+    def __init__(self):
+        """
+        The `__init__` method of any subclass can specify its own set of arguments.
+        """
+        super().__init__()
+
+    @abstractmethod
+    def forward(self):
+        """
+        Subclasses must override this method, but adhere to the same return type.
+
+        Returns:
+            dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
+        """
+        pass
+
+    @property
+    def size_divisibility(self):
+        """
+        Some backbones require the input height and width to be divisible by a
+        specific integer. This is typically true for encoder / decoder type networks
+        with lateral connection (e.g., FPN) for which feature maps need to match
+        dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
+        input size divisibility is required.
+        """
+        return 0
+
+    def output_shape(self):
+        """
+        Returns:
+            dict[str->ShapeSpec]
+        """
+        # this is a backward-compatible default
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/build.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d2ecae783257418708b572e298a23e167dabb26
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/build.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from detectron2.layers import ShapeSpec
+from detectron2.utils.registry import Registry
+
+from .backbone import Backbone
+
+BACKBONE_REGISTRY = Registry("BACKBONE")
+BACKBONE_REGISTRY.__doc__ = """
+Registry for backbones, which extract feature maps from images
+
+The registered object must be a callable that accepts two arguments:
+
+1. A :class:`detectron2.config.CfgNode`
+2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification.
+
+It must returns an instance of :class:`Backbone`.
+"""
+
+
+def build_backbone(cfg, input_shape=None):
+    """
+    Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
+
+    Returns:
+        an instance of :class:`Backbone`
+    """
+    if input_shape is None:
+        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
+
+    backbone_name = cfg.MODEL.BACKBONE.NAME
+    backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape)
+    assert isinstance(backbone, Backbone)
+    return backbone
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/fpn.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..338b5f5286ce233f17aa41f50a5a0a8fb819b8d3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/fpn.py
@@ -0,0 +1,245 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+import fvcore.nn.weight_init as weight_init
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+
+from .backbone import Backbone
+from .build import BACKBONE_REGISTRY
+from .resnet import build_resnet_backbone
+
+__all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"]
+
+
+class FPN(Backbone):
+    """
+    This module implements :paper:`FPN`.
+    It creates pyramid features built on top of some input feature maps.
+    """
+
+    def __init__(
+        self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum"
+    ):
+        """
+        Args:
+            bottom_up (Backbone): module representing the bottom up subnetwork.
+                Must be a subclass of :class:`Backbone`. The multi-scale feature
+                maps generated by the bottom up network, and listed in `in_features`,
+                are used to generate FPN levels.
+            in_features (list[str]): names of the input feature maps coming
+                from the backbone to which FPN is attached. For example, if the
+                backbone produces ["res2", "res3", "res4"], any *contiguous* sublist
+                of these may be used; order must be from high to low resolution.
+            out_channels (int): number of channels in the output feature maps.
+            norm (str): the normalization to use.
+            top_block (nn.Module or None): if provided, an extra operation will
+                be performed on the output of the last (smallest resolution)
+                FPN output, and the result will extend the result list. The top_block
+                further downsamples the feature map. It must have an attribute
+                "num_levels", meaning the number of extra FPN levels added by
+                this block, and "in_feature", which is a string representing
+                its input feature (e.g., p5).
+            fuse_type (str): types for fusing the top down features and the lateral
+                ones. It can be "sum" (default), which sums up element-wise; or "avg",
+                which takes the element-wise mean of the two.
+        """
+        super(FPN, self).__init__()
+        assert isinstance(bottom_up, Backbone)
+
+        # Feature map strides and channels from the bottom up network (e.g. ResNet)
+        input_shapes = bottom_up.output_shape()
+        in_strides = [input_shapes[f].stride for f in in_features]
+        in_channels = [input_shapes[f].channels for f in in_features]
+
+        _assert_strides_are_log2_contiguous(in_strides)
+        lateral_convs = []
+        output_convs = []
+
+        use_bias = norm == ""
+        for idx, in_channels in enumerate(in_channels):
+            lateral_norm = get_norm(norm, out_channels)
+            output_norm = get_norm(norm, out_channels)
+
+            lateral_conv = Conv2d(
+                in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm
+            )
+            output_conv = Conv2d(
+                out_channels,
+                out_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=use_bias,
+                norm=output_norm,
+            )
+            weight_init.c2_xavier_fill(lateral_conv)
+            weight_init.c2_xavier_fill(output_conv)
+            stage = int(math.log2(in_strides[idx]))
+            self.add_module("fpn_lateral{}".format(stage), lateral_conv)
+            self.add_module("fpn_output{}".format(stage), output_conv)
+
+            lateral_convs.append(lateral_conv)
+            output_convs.append(output_conv)
+        # Place convs into top-down order (from low to high resolution)
+        # to make the top-down computation in forward clearer.
+        self.lateral_convs = lateral_convs[::-1]
+        self.output_convs = output_convs[::-1]
+        self.top_block = top_block
+        self.in_features = in_features
+        self.bottom_up = bottom_up
+        # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
+        self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in in_strides}
+        # top block output feature maps.
+        if self.top_block is not None:
+            for s in range(stage, stage + self.top_block.num_levels):
+                self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
+
+        self._out_features = list(self._out_feature_strides.keys())
+        self._out_feature_channels = {k: out_channels for k in self._out_features}
+        self._size_divisibility = in_strides[-1]
+        assert fuse_type in {"avg", "sum"}
+        self._fuse_type = fuse_type
+
+    @property
+    def size_divisibility(self):
+        return self._size_divisibility
+
+    def forward(self, x):
+        """
+        Args:
+            input (dict[str->Tensor]): mapping feature map name (e.g., "res5") to
+                feature map tensor for each feature level in high to low resolution order.
+
+        Returns:
+            dict[str->Tensor]:
+                mapping from feature map name to FPN feature map tensor
+                in high to low resolution order. Returned feature names follow the FPN
+                paper convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
+                ["p2", "p3", ..., "p6"].
+        """
+        # Reverse feature maps into top-down order (from low to high resolution)
+        bottom_up_features = self.bottom_up(x)
+        x = [bottom_up_features[f] for f in self.in_features[::-1]]
+        results = []
+        prev_features = self.lateral_convs[0](x[0])
+        results.append(self.output_convs[0](prev_features))
+        for features, lateral_conv, output_conv in zip(
+            x[1:], self.lateral_convs[1:], self.output_convs[1:]
+        ):
+            top_down_features = F.interpolate(prev_features, scale_factor=2, mode="nearest")
+            lateral_features = lateral_conv(features)
+            prev_features = lateral_features + top_down_features
+            if self._fuse_type == "avg":
+                prev_features /= 2
+            results.insert(0, output_conv(prev_features))
+
+        if self.top_block is not None:
+            top_block_in_feature = bottom_up_features.get(self.top_block.in_feature, None)
+            if top_block_in_feature is None:
+                top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
+            results.extend(self.top_block(top_block_in_feature))
+        assert len(self._out_features) == len(results)
+        return dict(zip(self._out_features, results))
+
+    def output_shape(self):
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
+
+
+def _assert_strides_are_log2_contiguous(strides):
+    """
+    Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2".
+    """
+    for i, stride in enumerate(strides[1:], 1):
+        assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(
+            stride, strides[i - 1]
+        )
+
+
+class LastLevelMaxPool(nn.Module):
+    """
+    This module is used in the original FPN to generate a downsampled
+    P6 feature from P5.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.num_levels = 1
+        self.in_feature = "p5"
+
+    def forward(self, x):
+        return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
+
+
+class LastLevelP6P7(nn.Module):
+    """
+    This module is used in RetinaNet to generate extra layers, P6 and P7 from
+    C5 feature.
+    """
+
+    def __init__(self, in_channels, out_channels, in_feature="res5"):
+        super().__init__()
+        self.num_levels = 2
+        self.in_feature = in_feature
+        self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
+        self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
+        for module in [self.p6, self.p7]:
+            weight_init.c2_xavier_fill(module)
+
+    def forward(self, c5):
+        p6 = self.p6(c5)
+        p7 = self.p7(F.relu(p6))
+        return [p6, p7]
+
+
+@BACKBONE_REGISTRY.register()
+def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
+    """
+    Args:
+        cfg: a detectron2 CfgNode
+
+    Returns:
+        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+    """
+    bottom_up = build_resnet_backbone(cfg, input_shape)
+    in_features = cfg.MODEL.FPN.IN_FEATURES
+    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
+    backbone = FPN(
+        bottom_up=bottom_up,
+        in_features=in_features,
+        out_channels=out_channels,
+        norm=cfg.MODEL.FPN.NORM,
+        top_block=LastLevelMaxPool(),
+        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
+    )
+    return backbone
+
+
+@BACKBONE_REGISTRY.register()
+def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
+    """
+    Args:
+        cfg: a detectron2 CfgNode
+
+    Returns:
+        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+    """
+    bottom_up = build_resnet_backbone(cfg, input_shape)
+    in_features = cfg.MODEL.FPN.IN_FEATURES
+    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
+    in_channels_p6p7 = bottom_up.output_shape()["res5"].channels
+    backbone = FPN(
+        bottom_up=bottom_up,
+        in_features=in_features,
+        out_channels=out_channels,
+        norm=cfg.MODEL.FPN.NORM,
+        top_block=LastLevelP6P7(in_channels_p6p7, out_channels),
+        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
+    )
+    return backbone
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/resnet.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1faae012f346166a311902826fb9e4b61e24e54
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/backbone/resnet.py
@@ -0,0 +1,591 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.layers import (
+    CNNBlockBase,
+    Conv2d,
+    DeformConv,
+    ModulatedDeformConv,
+    ShapeSpec,
+    get_norm,
+)
+
+from .backbone import Backbone
+from .build import BACKBONE_REGISTRY
+
+__all__ = [
+    "ResNetBlockBase",
+    "BasicBlock",
+    "BottleneckBlock",
+    "DeformBottleneckBlock",
+    "BasicStem",
+    "ResNet",
+    "make_stage",
+    "build_resnet_backbone",
+]
+
+
+ResNetBlockBase = CNNBlockBase
+"""
+Alias for backward compatibiltiy.
+"""
+
+
+class BasicBlock(CNNBlockBase):
+    """
+    The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
+    with two 3x3 conv layers and a projection shortcut if needed.
+    """
+
+    def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
+        """
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            stride (int): Stride for the first conv.
+            norm (str or callable): normalization for all conv layers.
+                See :func:`layers.get_norm` for supported format.
+        """
+        super().__init__(in_channels, out_channels, stride)
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        self.conv1 = Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        self.conv2 = Conv2d(
+            out_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+        out = self.conv2(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+class BottleneckBlock(CNNBlockBase):
+    """
+    The standard bottleneck residual block used by ResNet-50, 101 and 152
+    defined in :paper:`ResNet`.  It contains 3 conv layers with kernels
+    1x1, 3x3, 1x1, and a projection shortcut if needed.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        *,
+        bottleneck_channels,
+        stride=1,
+        num_groups=1,
+        norm="BN",
+        stride_in_1x1=False,
+        dilation=1,
+    ):
+        """
+        Args:
+            bottleneck_channels (int): number of output channels for the 3x3
+                "bottleneck" conv layers.
+            num_groups (int): number of groups for the 3x3 conv layer.
+            norm (str or callable): normalization for all conv layers.
+                See :func:`layers.get_norm` for supported format.
+            stride_in_1x1 (bool): when stride>1, whether to put stride in the
+                first 1x1 convolution or the bottleneck 3x3 convolution.
+            dilation (int): the dilation rate of the 3x3 conv layer.
+        """
+        super().__init__(in_channels, out_channels, stride)
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        # The original MSRA ResNet models have stride in the first 1x1 conv
+        # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
+        # stride in the 3x3 conv
+        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
+
+        self.conv1 = Conv2d(
+            in_channels,
+            bottleneck_channels,
+            kernel_size=1,
+            stride=stride_1x1,
+            bias=False,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv2 = Conv2d(
+            bottleneck_channels,
+            bottleneck_channels,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            bias=False,
+            groups=num_groups,
+            dilation=dilation,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv3 = Conv2d(
+            bottleneck_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+        # Zero-initialize the last normalization in each residual branch,
+        # so that at the beginning, the residual branch starts with zeros,
+        # and each residual block behaves like an identity.
+        # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
+        # "For BN layers, the learnable scaling coefficient γ is initialized
+        # to be 1, except for each residual block's last BN
+        # where γ is initialized to be 0."
+
+        # nn.init.constant_(self.conv3.norm.weight, 0)
+        # TODO this somehow hurts performance when training GN models from scratch.
+        # Add it as an option when we need to use this code to train a backbone.
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+
+        out = self.conv2(out)
+        out = F.relu_(out)
+
+        out = self.conv3(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+class DeformBottleneckBlock(ResNetBlockBase):
+    """
+    Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv <deformconv>`
+    in the 3x3 convolution.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        *,
+        bottleneck_channels,
+        stride=1,
+        num_groups=1,
+        norm="BN",
+        stride_in_1x1=False,
+        dilation=1,
+        deform_modulated=False,
+        deform_num_groups=1,
+    ):
+        super().__init__(in_channels, out_channels, stride)
+        self.deform_modulated = deform_modulated
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
+
+        self.conv1 = Conv2d(
+            in_channels,
+            bottleneck_channels,
+            kernel_size=1,
+            stride=stride_1x1,
+            bias=False,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        if deform_modulated:
+            deform_conv_op = ModulatedDeformConv
+            # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
+            offset_channels = 27
+        else:
+            deform_conv_op = DeformConv
+            offset_channels = 18
+
+        self.conv2_offset = Conv2d(
+            bottleneck_channels,
+            offset_channels * deform_num_groups,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            dilation=dilation,
+        )
+        self.conv2 = deform_conv_op(
+            bottleneck_channels,
+            bottleneck_channels,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            bias=False,
+            groups=num_groups,
+            dilation=dilation,
+            deformable_groups=deform_num_groups,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv3 = Conv2d(
+            bottleneck_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+        nn.init.constant_(self.conv2_offset.weight, 0)
+        nn.init.constant_(self.conv2_offset.bias, 0)
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+
+        if self.deform_modulated:
+            offset_mask = self.conv2_offset(out)
+            offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
+            offset = torch.cat((offset_x, offset_y), dim=1)
+            mask = mask.sigmoid()
+            out = self.conv2(out, offset, mask)
+        else:
+            offset = self.conv2_offset(out)
+            out = self.conv2(out, offset)
+        out = F.relu_(out)
+
+        out = self.conv3(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+def make_stage(block_class, num_blocks, first_stride, *, in_channels, out_channels, **kwargs):
+    """
+    Create a list of blocks just like those in a ResNet stage.
+
+    Args:
+        block_class (type): a subclass of ResNetBlockBase
+        num_blocks (int):
+        first_stride (int): the stride of the first block. The other blocks will have stride=1.
+        in_channels (int): input channels of the entire stage.
+        out_channels (int): output channels of **every block** in the stage.
+        kwargs: other arguments passed to the constructor of every block.
+
+    Returns:
+        list[nn.Module]: a list of block module.
+    """
+    assert "stride" not in kwargs, "Stride of blocks in make_stage cannot be changed."
+    blocks = []
+    for i in range(num_blocks):
+        blocks.append(
+            block_class(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                stride=first_stride if i == 0 else 1,
+                **kwargs,
+            )
+        )
+        in_channels = out_channels
+    return blocks
+
+
+class BasicStem(CNNBlockBase):
+    """
+    The standard ResNet stem (layers before the first residual block).
+    """
+
+    def __init__(self, in_channels=3, out_channels=64, norm="BN"):
+        """
+        Args:
+            norm (str or callable): norm after the first conv layer.
+                See :func:`layers.get_norm` for supported format.
+        """
+        super().__init__(in_channels, out_channels, 4)
+        self.in_channels = in_channels
+        self.conv1 = Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+        weight_init.c2_msra_fill(self.conv1)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu_(x)
+        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
+        return x
+
+
+class ResNet(Backbone):
+    """
+    Implement :paper:`ResNet`.
+    """
+
+    def __init__(self, stem, stages, num_classes=None, out_features=None):
+        """
+        Args:
+            stem (nn.Module): a stem module
+            stages (list[list[CNNBlockBase]]): several (typically 4) stages,
+                each contains multiple :class:`CNNBlockBase`.
+            num_classes (None or int): if None, will not perform classification.
+                Otherwise, will create a linear layer.
+            out_features (list[str]): name of the layers whose outputs should
+                be returned in forward. Can be anything in "stem", "linear", or "res2" ...
+                If None, will return the output of the last layer.
+        """
+        super(ResNet, self).__init__()
+        self.stem = stem
+        self.num_classes = num_classes
+
+        current_stride = self.stem.stride
+        self._out_feature_strides = {"stem": current_stride}
+        self._out_feature_channels = {"stem": self.stem.out_channels}
+
+        self.stages_and_names = []
+        for i, blocks in enumerate(stages):
+            assert len(blocks) > 0, len(blocks)
+            for block in blocks:
+                assert isinstance(block, CNNBlockBase), block
+
+            name = "res" + str(i + 2)
+            stage = nn.Sequential(*blocks)
+
+            self.add_module(name, stage)
+            self.stages_and_names.append((stage, name))
+
+            self._out_feature_strides[name] = current_stride = int(
+                current_stride * np.prod([k.stride for k in blocks])
+            )
+            self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
+
+        if num_classes is not None:
+            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+            self.linear = nn.Linear(curr_channels, num_classes)
+
+            # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
+            # "The 1000-way fully-connected layer is initialized by
+            # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
+            nn.init.normal_(self.linear.weight, std=0.01)
+            name = "linear"
+
+        if out_features is None:
+            out_features = [name]
+        self._out_features = out_features
+        assert len(self._out_features)
+        children = [x[0] for x in self.named_children()]
+        for out_feature in self._out_features:
+            assert out_feature in children, "Available children: {}".format(", ".join(children))
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        if "stem" in self._out_features:
+            outputs["stem"] = x
+        for stage, name in self.stages_and_names:
+            x = stage(x)
+            if name in self._out_features:
+                outputs[name] = x
+        if self.num_classes is not None:
+            x = self.avgpool(x)
+            x = torch.flatten(x, 1)
+            x = self.linear(x)
+            if "linear" in self._out_features:
+                outputs["linear"] = x
+        return outputs
+
+    def output_shape(self):
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
+
+    def freeze(self, freeze_at=0):
+        """
+        Freeze the first several stages of the ResNet. Commonly used in
+        fine-tuning.
+
+        Layers that produce the same feature map spatial size are defined as one
+        "stage" by :paper:`FPN`.
+
+        Args:
+            freeze_at (int): number of stages to freeze.
+                `1` means freezing the stem. `2` means freezing the stem and
+                one residual stage, etc.
+
+        Returns:
+            nn.Module: this ResNet itself
+        """
+        if freeze_at >= 1:
+            self.stem.freeze()
+        for idx, (stage, _) in enumerate(self.stages_and_names, start=2):
+            if freeze_at >= idx:
+                for block in stage.children():
+                    block.freeze()
+        return self
+
+
+@BACKBONE_REGISTRY.register()
+def build_resnet_backbone(cfg, input_shape):
+    """
+    Create a ResNet instance from config.
+
+    Returns:
+        ResNet: a :class:`ResNet` instance.
+    """
+    # need registration of new blocks/stems?
+    norm = cfg.MODEL.RESNETS.NORM
+    stem = BasicStem(
+        in_channels=input_shape.channels,
+        out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
+        norm=norm,
+    )
+
+    # fmt: off
+    freeze_at           = cfg.MODEL.BACKBONE.FREEZE_AT
+    out_features        = cfg.MODEL.RESNETS.OUT_FEATURES
+    depth               = cfg.MODEL.RESNETS.DEPTH
+    num_groups          = cfg.MODEL.RESNETS.NUM_GROUPS
+    width_per_group     = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
+    bottleneck_channels = num_groups * width_per_group
+    in_channels         = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
+    out_channels        = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
+    stride_in_1x1       = cfg.MODEL.RESNETS.STRIDE_IN_1X1
+    res5_dilation       = cfg.MODEL.RESNETS.RES5_DILATION
+    deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
+    deform_modulated    = cfg.MODEL.RESNETS.DEFORM_MODULATED
+    deform_num_groups   = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
+    # fmt: on
+    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
+
+    num_blocks_per_stage = {
+        18: [2, 2, 2, 2],
+        34: [3, 4, 6, 3],
+        50: [3, 4, 6, 3],
+        101: [3, 4, 23, 3],
+        152: [3, 8, 36, 3],
+    }[depth]
+
+    if depth in [18, 34]:
+        assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
+        assert not any(
+            deform_on_per_stage
+        ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
+        assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
+        assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"
+
+    stages = []
+
+    # Avoid creating variables without gradients
+    # It consumes extra memory and may cause allreduce to fail
+    out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features]
+    max_stage_idx = max(out_stage_idx)
+    for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
+        dilation = res5_dilation if stage_idx == 5 else 1
+        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
+        stage_kargs = {
+            "num_blocks": num_blocks_per_stage[idx],
+            "first_stride": first_stride,
+            "in_channels": in_channels,
+            "out_channels": out_channels,
+            "norm": norm,
+        }
+        # Use BasicBlock for R18 and R34.
+        if depth in [18, 34]:
+            stage_kargs["block_class"] = BasicBlock
+        else:
+            stage_kargs["bottleneck_channels"] = bottleneck_channels
+            stage_kargs["stride_in_1x1"] = stride_in_1x1
+            stage_kargs["dilation"] = dilation
+            stage_kargs["num_groups"] = num_groups
+            if deform_on_per_stage[idx]:
+                stage_kargs["block_class"] = DeformBottleneckBlock
+                stage_kargs["deform_modulated"] = deform_modulated
+                stage_kargs["deform_num_groups"] = deform_num_groups
+            else:
+                stage_kargs["block_class"] = BottleneckBlock
+        blocks = make_stage(**stage_kargs)
+        in_channels = out_channels
+        out_channels *= 2
+        bottleneck_channels *= 2
+        stages.append(blocks)
+    return ResNet(stem, stages, out_features=out_features).freeze(freeze_at)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/box_regression.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/box_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..88426fddf36812f33def8fb434bebce53db3a4b4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/box_regression.py
@@ -0,0 +1,247 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+from typing import Tuple
+import torch
+
+# Value for clamping large dw and dh predictions. The heuristic is that we clamp
+# such that dw and dh are no larger than what would transform a 16px box into a
+# 1000px box (based on a small anchor, 16px, and a typical image size, 1000px).
+_DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
+
+
+__all__ = ["Box2BoxTransform", "Box2BoxTransformRotated"]
+
+
+def apply_deltas_broadcast(box2box_transform, deltas, boxes):
+    """
+    Apply transform deltas to boxes. Similar to `box2box_transform.apply_deltas`,
+    but allow broadcasting boxes when the second dimension of deltas is a multiple
+    of box dimension.
+
+    Args:
+        box2box_transform (Box2BoxTransform or Box2BoxTransformRotated): the transform to apply
+        deltas (Tensor): tensor of shape (N,B) or (N,KxB)
+        boxes (Tensor): tensor of shape (N,B)
+
+    Returns:
+        Tensor: same shape as deltas.
+    """
+    assert deltas.dim() == boxes.dim() == 2, f"{deltas.shape}, {boxes.shape}"
+    N, B = boxes.shape
+    assert (
+        deltas.shape[1] % B == 0
+    ), f"Second dim of deltas should be a multiple of {B}. Got {deltas.shape}"
+    K = deltas.shape[1] // B
+    ret = box2box_transform.apply_deltas(
+        deltas.view(N * K, B), boxes.unsqueeze(1).expand(N, K, B).reshape(N * K, B)
+    )
+    return ret.view(N, K * B)
+
+
+@torch.jit.script
+class Box2BoxTransform(object):
+    """
+    The box-to-box transform defined in R-CNN. The transformation is parameterized
+    by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height
+    by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height).
+    """
+
+    def __init__(
+        self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP
+    ):
+        """
+        Args:
+            weights (4-element tuple): Scaling factors that are applied to the
+                (dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set
+                such that the deltas have unit variance; now they are treated as
+                hyperparameters of the system.
+            scale_clamp (float): When predicting deltas, the predicted box scaling
+                factors (dw and dh) are clamped such that they are <= scale_clamp.
+        """
+        self.weights = weights
+        self.scale_clamp = scale_clamp
+
+    def get_deltas(self, src_boxes, target_boxes):
+        """
+        Get box regression transformation deltas (dx, dy, dw, dh) that can be used
+        to transform the `src_boxes` into the `target_boxes`. That is, the relation
+        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
+        any delta is too large and is clamped).
+
+        Args:
+            src_boxes (Tensor): source boxes, e.g., object proposals
+            target_boxes (Tensor): target of the transformation, e.g., ground-truth
+                boxes.
+        """
+        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
+        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
+
+        src_widths = src_boxes[:, 2] - src_boxes[:, 0]
+        src_heights = src_boxes[:, 3] - src_boxes[:, 1]
+        src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
+        src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights
+
+        target_widths = target_boxes[:, 2] - target_boxes[:, 0]
+        target_heights = target_boxes[:, 3] - target_boxes[:, 1]
+        target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths
+        target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights
+
+        wx, wy, ww, wh = self.weights
+        dx = wx * (target_ctr_x - src_ctr_x) / src_widths
+        dy = wy * (target_ctr_y - src_ctr_y) / src_heights
+        dw = ww * torch.log(target_widths / src_widths)
+        dh = wh * torch.log(target_heights / src_heights)
+
+        deltas = torch.stack((dx, dy, dw, dh), dim=1)
+        assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!"
+        return deltas
+
+    def apply_deltas(self, deltas, boxes):
+        """
+        Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.
+
+        Args:
+            deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
+                deltas[i] represents k potentially different class-specific
+                box transformations for the single box boxes[i].
+            boxes (Tensor): boxes to transform, of shape (N, 4)
+        """
+        boxes = boxes.to(deltas.dtype)
+
+        widths = boxes[:, 2] - boxes[:, 0]
+        heights = boxes[:, 3] - boxes[:, 1]
+        ctr_x = boxes[:, 0] + 0.5 * widths
+        ctr_y = boxes[:, 1] + 0.5 * heights
+
+        wx, wy, ww, wh = self.weights
+        dx = deltas[:, 0::4] / wx
+        dy = deltas[:, 1::4] / wy
+        dw = deltas[:, 2::4] / ww
+        dh = deltas[:, 3::4] / wh
+
+        # Prevent sending too large values into torch.exp()
+        dw = torch.clamp(dw, max=self.scale_clamp)
+        dh = torch.clamp(dh, max=self.scale_clamp)
+
+        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
+        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
+        pred_w = torch.exp(dw) * widths[:, None]
+        pred_h = torch.exp(dh) * heights[:, None]
+
+        pred_boxes = torch.zeros_like(deltas)
+        pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w  # x1
+        pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h  # y1
+        pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w  # x2
+        pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h  # y2
+        return pred_boxes
+
+
+@torch.jit.script
+class Box2BoxTransformRotated(object):
+    """
+    The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized
+    by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height
+    by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height),
+    and rotate a box's angle by da (radians).
+    Note: angles of deltas are in radians while angles of boxes are in degrees.
+    """
+
+    def __init__(
+        self,
+        weights: Tuple[float, float, float, float, float],
+        scale_clamp: float = _DEFAULT_SCALE_CLAMP,
+    ):
+        """
+        Args:
+            weights (5-element tuple): Scaling factors that are applied to the
+                (dx, dy, dw, dh, da) deltas. These are treated as
+                hyperparameters of the system.
+            scale_clamp (float): When predicting deltas, the predicted box scaling
+                factors (dw and dh) are clamped such that they are <= scale_clamp.
+        """
+        self.weights = weights
+        self.scale_clamp = scale_clamp
+
+    def get_deltas(self, src_boxes, target_boxes):
+        """
+        Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used
+        to transform the `src_boxes` into the `target_boxes`. That is, the relation
+        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
+        any delta is too large and is clamped).
+
+        Args:
+            src_boxes (Tensor): Nx5 source boxes, e.g., object proposals
+            target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth
+                boxes.
+        """
+        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
+        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
+
+        src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1)
+
+        target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind(
+            target_boxes, dim=1
+        )
+
+        wx, wy, ww, wh, wa = self.weights
+        dx = wx * (target_ctr_x - src_ctr_x) / src_widths
+        dy = wy * (target_ctr_y - src_ctr_y) / src_heights
+        dw = ww * torch.log(target_widths / src_widths)
+        dh = wh * torch.log(target_heights / src_heights)
+        # Angles of deltas are in radians while angles of boxes are in degrees.
+        # the conversion to radians serve as a way to normalize the values
+        da = target_angles - src_angles
+        da = (da + 180.0) % 360.0 - 180.0  # make it in [-180, 180)
+        da *= wa * math.pi / 180.0
+
+        deltas = torch.stack((dx, dy, dw, dh, da), dim=1)
+        assert (
+            (src_widths > 0).all().item()
+        ), "Input boxes to Box2BoxTransformRotated are not valid!"
+        return deltas
+
+    def apply_deltas(self, deltas, boxes):
+        """
+        Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`.
+
+        Args:
+            deltas (Tensor): transformation deltas of shape (N, 5).
+                deltas[i] represents box transformation for the single box boxes[i].
+            boxes (Tensor): boxes to transform, of shape (N, 5)
+        """
+        assert deltas.shape[1] == 5 and boxes.shape[1] == 5
+
+        boxes = boxes.to(deltas.dtype)
+
+        ctr_x = boxes[:, 0]
+        ctr_y = boxes[:, 1]
+        widths = boxes[:, 2]
+        heights = boxes[:, 3]
+        angles = boxes[:, 4]
+
+        wx, wy, ww, wh, wa = self.weights
+
+        dx = deltas[:, 0] / wx
+        dy = deltas[:, 1] / wy
+        dw = deltas[:, 2] / ww
+        dh = deltas[:, 3] / wh
+        da = deltas[:, 4] / wa
+
+        # Prevent sending too large values into torch.exp()
+        dw = torch.clamp(dw, max=self.scale_clamp)
+        dh = torch.clamp(dh, max=self.scale_clamp)
+
+        pred_boxes = torch.zeros_like(deltas)
+        pred_boxes[:, 0] = dx * widths + ctr_x  # x_ctr
+        pred_boxes[:, 1] = dy * heights + ctr_y  # y_ctr
+        pred_boxes[:, 2] = torch.exp(dw) * widths  # width
+        pred_boxes[:, 3] = torch.exp(dh) * heights  # height
+
+        # Following original RRPN implementation,
+        # angles of deltas are in radians while angles of boxes are in degrees.
+        pred_angle = da * 180.0 / math.pi + angles
+        pred_angle = (pred_angle + 180.0) % 360.0 - 180.0  # make it in [-180, 180)
+
+        pred_boxes[:, 4] = pred_angle
+
+        return pred_boxes
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/matcher.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/matcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..2911f8c1937749dec4dbe64aa3e8491a631e03f2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/matcher.py
@@ -0,0 +1,123 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from typing import List
+import torch
+
+
+class Matcher(object):
+    """
+    This class assigns to each predicted "element" (e.g., a box) a ground-truth
+    element. Each predicted element will have exactly zero or one matches; each
+    ground-truth element may be matched to zero or more predicted elements.
+
+    The matching is determined by the MxN match_quality_matrix, that characterizes
+    how well each (ground-truth, prediction)-pair match each other. For example,
+    if the elements are boxes, this matrix may contain box intersection-over-union
+    overlap values.
+
+    The matcher returns (a) a vector of length N containing the index of the
+    ground-truth element m in [0, M) that matches to prediction n in [0, N).
+    (b) a vector of length N containing the labels for each prediction.
+    """
+
+    def __init__(
+        self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False
+    ):
+        """
+        Args:
+            thresholds (list): a list of thresholds used to stratify predictions
+                into levels.
+            labels (list): a list of values to label predictions belonging at
+                each level. A label can be one of {-1, 0, 1} signifying
+                {ignore, negative class, positive class}, respectively.
+            allow_low_quality_matches (bool): if True, produce additional matches
+                for predictions with maximum match quality lower than high_threshold.
+                See set_low_quality_matches_ for more details.
+
+            For example,
+                thresholds = [0.3, 0.5]
+                labels = [0, -1, 1]
+                All predictions with iou < 0.3 will be marked with 0 and
+                thus will be considered as false positives while training.
+                All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
+                thus will be ignored.
+                All predictions with 0.5 <= iou will be marked with 1 and
+                thus will be considered as true positives.
+        """
+        # Add -inf and +inf to first and last position in thresholds
+        thresholds = thresholds[:]
+        assert thresholds[0] > 0
+        thresholds.insert(0, -float("inf"))
+        thresholds.append(float("inf"))
+        assert all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:]))
+        assert all(l in [-1, 0, 1] for l in labels)
+        assert len(labels) == len(thresholds) - 1
+        self.thresholds = thresholds
+        self.labels = labels
+        self.allow_low_quality_matches = allow_low_quality_matches
+
+    def __call__(self, match_quality_matrix):
+        """
+        Args:
+            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
+                pairwise quality between M ground-truth elements and N predicted
+                elements. All elements must be >= 0 (due to the us of `torch.nonzero`
+                for selecting indices in :meth:`set_low_quality_matches_`).
+
+        Returns:
+            matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
+                ground-truth index in [0, M)
+            match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
+                whether a prediction is a true or false positive or ignored
+        """
+        assert match_quality_matrix.dim() == 2
+        if match_quality_matrix.numel() == 0:
+            default_matches = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), 0, dtype=torch.int64
+            )
+            # When no gt boxes exist, we define IOU = 0 and therefore set labels
+            # to `self.labels[0]`, which usually defaults to background class 0
+            # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
+            default_match_labels = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
+            )
+            return default_matches, default_match_labels
+
+        assert torch.all(match_quality_matrix >= 0)
+
+        # match_quality_matrix is M (gt) x N (predicted)
+        # Max over gt elements (dim 0) to find best gt candidate for each prediction
+        matched_vals, matches = match_quality_matrix.max(dim=0)
+
+        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
+
+        for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
+            low_high = (matched_vals >= low) & (matched_vals < high)
+            match_labels[low_high] = l
+
+        if self.allow_low_quality_matches:
+            self.set_low_quality_matches_(match_labels, match_quality_matrix)
+
+        return matches, match_labels
+
+    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
+        """
+        Produce additional matches for predictions that have only low-quality matches.
+        Specifically, for each ground-truth G find the set of predictions that have
+        maximum overlap with it (including ties); for each prediction in that set, if
+        it is unmatched, then match it to the ground-truth G.
+
+        This function implements the RPN assignment case (i) in Sec. 3.1.2 of
+        :paper:`Faster R-CNN`.
+        """
+        # For each gt, find the prediction with which it has highest quality
+        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
+        # Find the highest quality match available, even if it is low, including ties.
+        # Note that the matches qualities must be positive due to the use of
+        # `torch.nonzero`.
+        _, pred_inds_with_highest_quality = torch.nonzero(
+            match_quality_matrix == highest_quality_foreach_gt[:, None], as_tuple=True
+        )
+        # If an anchor was labeled positive only due to a low-quality match
+        # with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B.
+        # This follows the implementation in Detectron, and is found to have no significant impact.
+        match_labels[pred_inds_with_highest_quality] = 1
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..96ef9b582c2ed38525102ebb589a750cf6b9fa54
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/__init__.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from .build import META_ARCH_REGISTRY, build_model  # isort:skip
+
+from .panoptic_fpn import PanopticFPN
+
+# import all the meta_arch, so they will be registered
+from .rcnn import GeneralizedRCNN, ProposalNetwork
+from .retinanet import RetinaNet
+from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/build.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..630389dfca822f295447abd5e8424186d02e0465
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/build.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+
+from detectron2.utils.registry import Registry
+
+META_ARCH_REGISTRY = Registry("META_ARCH")  # noqa F401 isort:skip
+META_ARCH_REGISTRY.__doc__ = """
+Registry for meta-architectures, i.e. the whole model.
+
+The registered object will be called with `obj(cfg)`
+and expected to return a `nn.Module` object.
+"""
+
+
+def build_model(cfg):
+    """
+    Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
+    Note that it does not load any weights from ``cfg``.
+    """
+    meta_arch = cfg.MODEL.META_ARCHITECTURE
+    model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
+    model.to(torch.device(cfg.MODEL.DEVICE))
+    return model
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/panoptic_fpn.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/panoptic_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5f92f701f2da3aff6602ad2388307874102fc5c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/panoptic_fpn.py
@@ -0,0 +1,218 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import torch
+from torch import nn
+
+from detectron2.structures import ImageList
+
+from ..backbone import build_backbone
+from ..postprocessing import detector_postprocess, sem_seg_postprocess
+from ..proposal_generator import build_proposal_generator
+from ..roi_heads import build_roi_heads
+from .build import META_ARCH_REGISTRY
+from .semantic_seg import build_sem_seg_head
+
+__all__ = ["PanopticFPN"]
+
+
+@META_ARCH_REGISTRY.register()
+class PanopticFPN(nn.Module):
+    """
+    Implement the paper :paper:`PanopticFPN`.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.instance_loss_weight = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT
+
+        # options when combining instance & semantic outputs
+        self.combine_on = cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED
+        self.combine_overlap_threshold = cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH
+        self.combine_stuff_area_limit = cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT
+        self.combine_instances_confidence_threshold = (
+            cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH
+        )
+
+        self.backbone = build_backbone(cfg)
+        self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
+        self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())
+        self.sem_seg_head = build_sem_seg_head(cfg, self.backbone.output_shape())
+
+        self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
+        self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+
+                For now, each item in the list is a dict that contains:
+
+                * "image": Tensor, image in (C, H, W) format.
+                * "instances": Instances
+                * "sem_seg": semantic segmentation ground truth.
+                * Other information that's included in the original dicts, such as:
+                  "height", "width" (int): the output resolution of the model, used in inference.
+                  See :meth:`postprocess` for details.
+
+        Returns:
+            list[dict]:
+                each dict is the results for one image. The dict contains the following keys:
+
+                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
+                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
+                * "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
+                  See the return value of
+                  :func:`combine_semantic_and_instance_outputs` for its format.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        features = self.backbone(images.tensor)
+
+        if "proposals" in batched_inputs[0]:
+            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+            proposal_losses = {}
+
+        if "sem_seg" in batched_inputs[0]:
+            gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
+            gt_sem_seg = ImageList.from_tensors(
+                gt_sem_seg, self.backbone.size_divisibility, self.sem_seg_head.ignore_value
+            ).tensor
+        else:
+            gt_sem_seg = None
+        sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)
+
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+        if self.proposal_generator:
+            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+        detector_results, detector_losses = self.roi_heads(
+            images, features, proposals, gt_instances
+        )
+
+        if self.training:
+            losses = {}
+            losses.update(sem_seg_losses)
+            losses.update({k: v * self.instance_loss_weight for k, v in detector_losses.items()})
+            losses.update(proposal_losses)
+            return losses
+
+        processed_results = []
+        for sem_seg_result, detector_result, input_per_image, image_size in zip(
+            sem_seg_results, detector_results, batched_inputs, images.image_sizes
+        ):
+            height = input_per_image.get("height", image_size[0])
+            width = input_per_image.get("width", image_size[1])
+            sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
+            detector_r = detector_postprocess(detector_result, height, width)
+
+            processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})
+
+            if self.combine_on:
+                panoptic_r = combine_semantic_and_instance_outputs(
+                    detector_r,
+                    sem_seg_r.argmax(dim=0),
+                    self.combine_overlap_threshold,
+                    self.combine_stuff_area_limit,
+                    self.combine_instances_confidence_threshold,
+                )
+                processed_results[-1]["panoptic_seg"] = panoptic_r
+        return processed_results
+
+
+def combine_semantic_and_instance_outputs(
+    instance_results,
+    semantic_results,
+    overlap_threshold,
+    stuff_area_limit,
+    instances_confidence_threshold,
+):
+    """
+    Implement a simple combining logic following
+    "combine_semantic_and_instance_predictions.py" in panopticapi
+    to produce panoptic segmentation outputs.
+
+    Args:
+        instance_results: output of :func:`detector_postprocess`.
+        semantic_results: an (H, W) tensor, each is the contiguous semantic
+            category id
+
+    Returns:
+        panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
+        segments_info (list[dict]): Describe each segment in `panoptic_seg`.
+            Each dict contains keys "id", "category_id", "isthing".
+    """
+    panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32)
+
+    # sort instance outputs by scores
+    sorted_inds = torch.argsort(-instance_results.scores)
+
+    current_segment_id = 0
+    segments_info = []
+
+    instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device)
+
+    # Add instances one-by-one, check for overlaps with existing ones
+    for inst_id in sorted_inds:
+        score = instance_results.scores[inst_id].item()
+        if score < instances_confidence_threshold:
+            break
+        mask = instance_masks[inst_id]  # H,W
+        mask_area = mask.sum().item()
+
+        if mask_area == 0:
+            continue
+
+        intersect = (mask > 0) & (panoptic_seg > 0)
+        intersect_area = intersect.sum().item()
+
+        if intersect_area * 1.0 / mask_area > overlap_threshold:
+            continue
+
+        if intersect_area > 0:
+            mask = mask & (panoptic_seg == 0)
+
+        current_segment_id += 1
+        panoptic_seg[mask] = current_segment_id
+        segments_info.append(
+            {
+                "id": current_segment_id,
+                "isthing": True,
+                "score": score,
+                "category_id": instance_results.pred_classes[inst_id].item(),
+                "instance_id": inst_id.item(),
+            }
+        )
+
+    # Add semantic results to remaining empty areas
+    semantic_labels = torch.unique(semantic_results).cpu().tolist()
+    for semantic_label in semantic_labels:
+        if semantic_label == 0:  # 0 is a special "thing" class
+            continue
+        mask = (semantic_results == semantic_label) & (panoptic_seg == 0)
+        mask_area = mask.sum().item()
+        if mask_area < stuff_area_limit:
+            continue
+
+        current_segment_id += 1
+        panoptic_seg[mask] = current_segment_id
+        segments_info.append(
+            {
+                "id": current_segment_id,
+                "isthing": False,
+                "category_id": semantic_label,
+                "area": mask_area,
+            }
+        )
+
+    return panoptic_seg, segments_info
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/rcnn.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b15ea8a38e5ddfbb4049c89917f055295e396b4f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/rcnn.py
@@ -0,0 +1,263 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+import torch
+from torch import nn
+
+from detectron2.structures import ImageList
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.logger import log_first_n
+
+from ..backbone import build_backbone
+from ..postprocessing import detector_postprocess
+from ..proposal_generator import build_proposal_generator
+from ..roi_heads import build_roi_heads
+from .build import META_ARCH_REGISTRY
+
+__all__ = ["GeneralizedRCNN", "ProposalNetwork"]
+
+
+@META_ARCH_REGISTRY.register()
+class GeneralizedRCNN(nn.Module):
+    """
+    Generalized R-CNN. Any models that contains the following three components:
+    1. Per-image feature extraction (aka backbone)
+    2. Region proposal generation
+    3. Per-region feature extraction and prediction
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.backbone = build_backbone(cfg)
+        self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
+        self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())
+        self.vis_period = cfg.VIS_PERIOD
+        self.input_format = cfg.INPUT.FORMAT
+
+        assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD)
+        self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
+        self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def visualize_training(self, batched_inputs, proposals):
+        """
+        A function used to visualize images and proposals. It shows ground truth
+        bounding boxes on the original image and up to 20 predicted object
+        proposals on the original image. Users can implement different
+        visualization functions for different models.
+
+        Args:
+            batched_inputs (list): a list that contains input to the model.
+            proposals (list): a list that contains predicted proposals. Both
+                batched_inputs and proposals should have the same length.
+        """
+        from detectron2.utils.visualizer import Visualizer
+
+        storage = get_event_storage()
+        max_vis_prop = 20
+
+        for input, prop in zip(batched_inputs, proposals):
+            img = input["image"].cpu().numpy()
+            assert img.shape[0] == 3, "Images should have 3 channels."
+            if self.input_format == "BGR":
+                img = img[::-1, :, :]
+            img = img.transpose(1, 2, 0)
+            v_gt = Visualizer(img, None)
+            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
+            anno_img = v_gt.get_image()
+            box_size = min(len(prop.proposal_boxes), max_vis_prop)
+            v_pred = Visualizer(img, None)
+            v_pred = v_pred.overlay_instances(
+                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
+            )
+            prop_img = v_pred.get_image()
+            vis_img = np.concatenate((anno_img, prop_img), axis=1)
+            vis_img = vis_img.transpose(2, 0, 1)
+            vis_name = "Left: GT bounding boxes;  Right: Predicted proposals"
+            storage.put_image(vis_name, vis_img)
+            break  # only visualize one image in a batch
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
+                Each item in the list contains the inputs for one image.
+                For now, each item in the list is a dict that contains:
+
+                * image: Tensor, image in (C, H, W) format.
+                * instances (optional): groundtruth :class:`Instances`
+                * proposals (optional): :class:`Instances`, precomputed proposals.
+
+                Other information that's included in the original dicts, such as:
+
+                * "height", "width" (int): the output resolution of the model, used in inference.
+                  See :meth:`postprocess` for details.
+
+        Returns:
+            list[dict]:
+                Each dict is the output for one input image.
+                The dict contains one key "instances" whose value is a :class:`Instances`.
+                The :class:`Instances` object has the following keys:
+                "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
+        """
+        if not self.training:
+            return self.inference(batched_inputs)
+
+        images = self.preprocess_image(batched_inputs)
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        elif "targets" in batched_inputs[0]:
+            log_first_n(
+                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
+            )
+            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        features = self.backbone(images.tensor)
+
+        if self.proposal_generator:
+            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+        else:
+            assert "proposals" in batched_inputs[0]
+            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+            proposal_losses = {}
+
+        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
+        if self.vis_period > 0:
+            storage = get_event_storage()
+            if storage.iter % self.vis_period == 0:
+                self.visualize_training(batched_inputs, proposals)
+
+        losses = {}
+        losses.update(detector_losses)
+        losses.update(proposal_losses)
+        return losses
+
+    def inference(self, batched_inputs, detected_instances=None, do_postprocess=True):
+        """
+        Run inference on the given inputs.
+
+        Args:
+            batched_inputs (list[dict]): same as in :meth:`forward`
+            detected_instances (None or list[Instances]): if not None, it
+                contains an `Instances` object per image. The `Instances`
+                object contains "pred_boxes" and "pred_classes" which are
+                known boxes in the image.
+                The inference will then skip the detection of bounding boxes,
+                and only predict other per-ROI outputs.
+            do_postprocess (bool): whether to apply post-processing on the outputs.
+
+        Returns:
+            same as in :meth:`forward`.
+        """
+        assert not self.training
+
+        images = self.preprocess_image(batched_inputs)
+        features = self.backbone(images.tensor)
+
+        if detected_instances is None:
+            if self.proposal_generator:
+                proposals, _ = self.proposal_generator(images, features, None)
+            else:
+                assert "proposals" in batched_inputs[0]
+                proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+
+            results, _ = self.roi_heads(images, features, proposals, None)
+        else:
+            detected_instances = [x.to(self.device) for x in detected_instances]
+            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
+
+        if do_postprocess:
+            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
+        else:
+            return results
+
+    def preprocess_image(self, batched_inputs):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        return images
+
+    @staticmethod
+    def _postprocess(instances, batched_inputs, image_sizes):
+        """
+        Rescale the output instances to the target size.
+        """
+        # note: private function; subject to changes
+        processed_results = []
+        for results_per_image, input_per_image, image_size in zip(
+            instances, batched_inputs, image_sizes
+        ):
+            height = input_per_image.get("height", image_size[0])
+            width = input_per_image.get("width", image_size[1])
+            r = detector_postprocess(results_per_image, height, width)
+            processed_results.append({"instances": r})
+        return processed_results
+
+
+@META_ARCH_REGISTRY.register()
+class ProposalNetwork(nn.Module):
+    """
+    A meta architecture that only predicts object proposals.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.backbone = build_backbone(cfg)
+        self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
+
+        self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
+        self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            Same as in :class:`GeneralizedRCNN.forward`
+
+        Returns:
+            list[dict]:
+                Each dict is the output for one input image.
+                The dict contains one key "proposals" whose value is a
+                :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        features = self.backbone(images.tensor)
+
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        elif "targets" in batched_inputs[0]:
+            log_first_n(
+                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
+            )
+            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+        # In training, the proposals are not useful at all but we generate them anyway.
+        # This makes RPN-only models about 5% slower.
+        if self.training:
+            return proposal_losses
+
+        processed_results = []
+        for results_per_image, input_per_image, image_size in zip(
+            proposals, batched_inputs, images.image_sizes
+        ):
+            height = input_per_image.get("height", image_size[0])
+            width = input_per_image.get("width", image_size[1])
+            r = detector_postprocess(results_per_image, height, width)
+            processed_results.append({"proposals": r})
+        return processed_results
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/retinanet.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/retinanet.py
new file mode 100644
index 0000000000000000000000000000000000000000..35c42cc25e93bf2841c5e1fcff389f317ed0883a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/retinanet.py
@@ -0,0 +1,489 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import math
+import numpy as np
+from typing import List
+import torch
+from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss
+from torch import nn
+
+from detectron2.layers import ShapeSpec, batched_nms, cat
+from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.logger import log_first_n
+
+from ..anchor_generator import build_anchor_generator
+from ..backbone import build_backbone
+from ..box_regression import Box2BoxTransform
+from ..matcher import Matcher
+from ..postprocessing import detector_postprocess
+from .build import META_ARCH_REGISTRY
+
+__all__ = ["RetinaNet"]
+
+
+def permute_to_N_HWA_K(tensor, K):
+    """
+    Transpose/reshape a tensor from (N, (A x K), H, W) to (N, (HxWxA), K)
+    """
+    assert tensor.dim() == 4, tensor.shape
+    N, _, H, W = tensor.shape
+    tensor = tensor.view(N, -1, K, H, W)
+    tensor = tensor.permute(0, 3, 4, 1, 2)
+    tensor = tensor.reshape(N, -1, K)  # Size=(N,HWA,K)
+    return tensor
+
+
+def permute_all_cls_and_box_to_N_HWA_K_and_concat(box_cls, box_delta, num_classes=80):
+    """
+    Rearrange the tensor layout from the network output, i.e.:
+    list[Tensor]: #lvl tensors of shape (N, A x K, Hi, Wi)
+    to per-image predictions, i.e.:
+    Tensor: of shape (N x sum(Hi x Wi x A), K)
+    """
+    # for each feature level, permute the outputs to make them be in the
+    # same format as the labels. Note that the labels are computed for
+    # all feature levels concatenated, so we keep the same representation
+    # for the objectness and the box_delta
+    box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls]
+    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
+    # concatenate on the first dimension (representing the feature levels), to
+    # take into account the way the labels were generated (with all feature maps
+    # being concatenated as well)
+    box_cls = cat(box_cls_flattened, dim=1).view(-1, num_classes)
+    box_delta = cat(box_delta_flattened, dim=1).view(-1, 4)
+    return box_cls, box_delta
+
+
+@META_ARCH_REGISTRY.register()
+class RetinaNet(nn.Module):
+    """
+    Implement RetinaNet in :paper:`RetinaNet`.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+
+        # fmt: off
+        self.num_classes              = cfg.MODEL.RETINANET.NUM_CLASSES
+        self.in_features              = cfg.MODEL.RETINANET.IN_FEATURES
+        # Loss parameters:
+        self.focal_loss_alpha         = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
+        self.focal_loss_gamma         = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
+        self.smooth_l1_loss_beta      = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
+        # Inference parameters:
+        self.score_threshold          = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
+        self.topk_candidates          = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
+        self.nms_threshold            = cfg.MODEL.RETINANET.NMS_THRESH_TEST
+        self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
+        # Vis parameters
+        self.vis_period               = cfg.VIS_PERIOD
+        self.input_format             = cfg.INPUT.FORMAT
+        # fmt: on
+
+        self.backbone = build_backbone(cfg)
+
+        backbone_shape = self.backbone.output_shape()
+        feature_shapes = [backbone_shape[f] for f in self.in_features]
+        self.head = RetinaNetHead(cfg, feature_shapes)
+        self.anchor_generator = build_anchor_generator(cfg, feature_shapes)
+
+        # Matching and loss
+        self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
+        self.matcher = Matcher(
+            cfg.MODEL.RETINANET.IOU_THRESHOLDS,
+            cfg.MODEL.RETINANET.IOU_LABELS,
+            allow_low_quality_matches=True,
+        )
+
+        self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
+        self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
+
+        """
+        In Detectron1, loss is normalized by number of foreground samples in the batch.
+        When batch size is 1 per GPU, #foreground has a large variance and
+        using it lead to lower performance. Here we maintain an EMA of #foreground to
+        stabilize the normalizer.
+        """
+        self.loss_normalizer = 100  # initialize with any reasonable #fg that's not too small
+        self.loss_normalizer_momentum = 0.9
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def visualize_training(self, batched_inputs, results):
+        """
+        A function used to visualize ground truth images and final network predictions.
+        It shows ground truth bounding boxes on the original image and up to 20
+        predicted object bounding boxes on the original image.
+
+        Args:
+            batched_inputs (list): a list that contains input to the model.
+            results (List[Instances]): a list of #images elements.
+        """
+        from detectron2.utils.visualizer import Visualizer
+
+        assert len(batched_inputs) == len(
+            results
+        ), "Cannot visualize inputs and results of different sizes"
+        storage = get_event_storage()
+        max_boxes = 20
+
+        image_index = 0  # only visualize a single image
+        img = batched_inputs[image_index]["image"].cpu().numpy()
+        assert img.shape[0] == 3, "Images should have 3 channels."
+        if self.input_format == "BGR":
+            img = img[::-1, :, :]
+        img = img.transpose(1, 2, 0)
+        v_gt = Visualizer(img, None)
+        v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes)
+        anno_img = v_gt.get_image()
+        processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
+        predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy()
+
+        v_pred = Visualizer(img, None)
+        v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes])
+        prop_img = v_pred.get_image()
+        vis_img = np.vstack((anno_img, prop_img))
+        vis_img = vis_img.transpose(2, 0, 1)
+        vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
+        storage.put_image(vis_name, vis_img)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
+                Each item in the list contains the inputs for one image.
+                For now, each item in the list is a dict that contains:
+
+                * image: Tensor, image in (C, H, W) format.
+                * instances: Instances
+
+                Other information that's included in the original dicts, such as:
+
+                * "height", "width" (int): the output resolution of the model, used in inference.
+                  See :meth:`postprocess` for details.
+        Returns:
+            dict[str: Tensor]:
+                mapping from a named loss to a tensor storing the loss. Used during training only.
+        """
+        images = self.preprocess_image(batched_inputs)
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        elif "targets" in batched_inputs[0]:
+            log_first_n(
+                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
+            )
+            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        features = self.backbone(images.tensor)
+        features = [features[f] for f in self.in_features]
+        box_cls, box_delta = self.head(features)
+        anchors = self.anchor_generator(features)
+
+        if self.training:
+            gt_classes, gt_anchors_reg_deltas = self.get_ground_truth(anchors, gt_instances)
+            losses = self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta)
+
+            if self.vis_period > 0:
+                storage = get_event_storage()
+                if storage.iter % self.vis_period == 0:
+                    results = self.inference(box_cls, box_delta, anchors, images.image_sizes)
+                    self.visualize_training(batched_inputs, results)
+
+            return losses
+        else:
+            results = self.inference(box_cls, box_delta, anchors, images.image_sizes)
+            processed_results = []
+            for results_per_image, input_per_image, image_size in zip(
+                results, batched_inputs, images.image_sizes
+            ):
+                height = input_per_image.get("height", image_size[0])
+                width = input_per_image.get("width", image_size[1])
+                r = detector_postprocess(results_per_image, height, width)
+                processed_results.append({"instances": r})
+            return processed_results
+
+    def losses(self, gt_classes, gt_anchors_deltas, pred_class_logits, pred_anchor_deltas):
+        """
+        Args:
+            For `gt_classes` and `gt_anchors_deltas` parameters, see
+                :meth:`RetinaNet.get_ground_truth`.
+            Their shapes are (N, R) and (N, R, 4), respectively, where R is
+            the total number of anchors across levels, i.e. sum(Hi x Wi x A)
+            For `pred_class_logits` and `pred_anchor_deltas`, see
+                :meth:`RetinaNetHead.forward`.
+
+        Returns:
+            dict[str, Tensor]:
+                mapping from a named loss to a scalar tensor
+                storing the loss. Used during training only. The dict keys are:
+                "loss_cls" and "loss_box_reg"
+        """
+        pred_class_logits, pred_anchor_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
+            pred_class_logits, pred_anchor_deltas, self.num_classes
+        )  # Shapes: (N x R, K) and (N x R, 4), respectively.
+
+        gt_classes = gt_classes.flatten()
+        gt_anchors_deltas = gt_anchors_deltas.view(-1, 4)
+
+        valid_idxs = gt_classes >= 0
+        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
+        num_foreground = foreground_idxs.sum().item()
+        get_event_storage().put_scalar("num_foreground", num_foreground)
+        self.loss_normalizer = (
+            self.loss_normalizer_momentum * self.loss_normalizer
+            + (1 - self.loss_normalizer_momentum) * num_foreground
+        )
+
+        gt_classes_target = torch.zeros_like(pred_class_logits)
+        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1
+
+        # logits loss
+        loss_cls = sigmoid_focal_loss_jit(
+            pred_class_logits[valid_idxs],
+            gt_classes_target[valid_idxs],
+            alpha=self.focal_loss_alpha,
+            gamma=self.focal_loss_gamma,
+            reduction="sum",
+        ) / max(1, self.loss_normalizer)
+
+        # regression loss
+        loss_box_reg = smooth_l1_loss(
+            pred_anchor_deltas[foreground_idxs],
+            gt_anchors_deltas[foreground_idxs],
+            beta=self.smooth_l1_loss_beta,
+            reduction="sum",
+        ) / max(1, self.loss_normalizer)
+
+        return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
+
+    @torch.no_grad()
+    def get_ground_truth(self, anchors, targets):
+        """
+        Args:
+            anchors (list[Boxes]): A list of #feature level Boxes.
+                The Boxes contains anchors of this image on the specific feature level.
+            targets (list[Instances]): a list of N `Instances`s. The i-th
+                `Instances` contains the ground-truth per-instance annotations
+                for the i-th input image.  Specify `targets` during training only.
+
+        Returns:
+            gt_classes (Tensor):
+                An integer tensor of shape (N, R) storing ground-truth labels for each anchor.
+                R is the total number of anchors, i.e. the sum of Hi x Wi x A for all levels.
+                Anchors with an IoU with some target higher than the foreground threshold
+                are assigned their corresponding label in the [0, K-1] range.
+                Anchors whose IoU are below the background threshold are assigned
+                the label "K". Anchors whose IoU are between the foreground and background
+                thresholds are assigned a label "-1", i.e. ignore.
+            gt_anchors_deltas (Tensor):
+                Shape (N, R, 4).
+                The last dimension represents ground-truth box2box transform
+                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
+                The values in the tensor are meaningful only when the corresponding
+                anchor is labeled as foreground.
+        """
+        gt_classes = []
+        gt_anchors_deltas = []
+        anchors = Boxes.cat(anchors)  # Rx4
+
+        for targets_per_image in targets:
+            match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, anchors)
+            gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)
+
+            has_gt = len(targets_per_image) > 0
+            if has_gt:
+                # ground truth box regression
+                matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]
+                gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
+                    anchors.tensor, matched_gt_boxes.tensor
+                )
+
+                gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
+                # Anchors with label 0 are treated as background.
+                gt_classes_i[anchor_labels == 0] = self.num_classes
+                # Anchors with label -1 are ignored.
+                gt_classes_i[anchor_labels == -1] = -1
+            else:
+                gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
+                gt_anchors_reg_deltas_i = torch.zeros_like(anchors.tensor)
+
+            gt_classes.append(gt_classes_i)
+            gt_anchors_deltas.append(gt_anchors_reg_deltas_i)
+
+        return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
+
+    def inference(self, box_cls, box_delta, anchors, image_sizes):
+        """
+        Arguments:
+            box_cls, box_delta: Same as the output of :meth:`RetinaNetHead.forward`
+            anchors (list[Boxes]): A list of #feature level Boxes.
+                The Boxes contain anchors of this image on the specific feature level.
+            image_sizes (List[torch.Size]): the input image sizes
+
+        Returns:
+            results (List[Instances]): a list of #images elements.
+        """
+        results = []
+
+        box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls]
+        box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta]
+        # list[Tensor], one per level, each has shape (N, Hi x Wi x A, K or 4)
+
+        for img_idx, image_size in enumerate(image_sizes):
+            box_cls_per_image = [box_cls_per_level[img_idx] for box_cls_per_level in box_cls]
+            box_reg_per_image = [box_reg_per_level[img_idx] for box_reg_per_level in box_delta]
+            results_per_image = self.inference_single_image(
+                box_cls_per_image, box_reg_per_image, anchors, tuple(image_size)
+            )
+            results.append(results_per_image)
+        return results
+
+    def inference_single_image(self, box_cls, box_delta, anchors, image_size):
+        """
+        Single-image inference. Return bounding-box detection results by thresholding
+        on scores and applying non-maximum suppression (NMS).
+
+        Arguments:
+            box_cls (list[Tensor]): list of #feature levels. Each entry contains
+                tensor of size (H x W x A, K)
+            box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
+            anchors (list[Boxes]): list of #feature levels. Each entry contains
+                a Boxes object, which contains all the anchors for that
+                image in that feature level.
+            image_size (tuple(H, W)): a tuple of the image height and width.
+
+        Returns:
+            Same as `inference`, but for only one image.
+        """
+        boxes_all = []
+        scores_all = []
+        class_idxs_all = []
+
+        # Iterate over every feature level
+        for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors):
+            # (HxWxAxK,)
+            box_cls_i = box_cls_i.flatten().sigmoid_()
+
+            # Keep top k top scoring indices only.
+            num_topk = min(self.topk_candidates, box_reg_i.size(0))
+            # torch.sort is actually faster than .topk (at least on GPUs)
+            predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
+            predicted_prob = predicted_prob[:num_topk]
+            topk_idxs = topk_idxs[:num_topk]
+
+            # filter out the proposals with low confidence score
+            keep_idxs = predicted_prob > self.score_threshold
+            predicted_prob = predicted_prob[keep_idxs]
+            topk_idxs = topk_idxs[keep_idxs]
+
+            anchor_idxs = topk_idxs // self.num_classes
+            classes_idxs = topk_idxs % self.num_classes
+
+            box_reg_i = box_reg_i[anchor_idxs]
+            anchors_i = anchors_i[anchor_idxs]
+            # predict boxes
+            predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor)
+
+            boxes_all.append(predicted_boxes)
+            scores_all.append(predicted_prob)
+            class_idxs_all.append(classes_idxs)
+
+        boxes_all, scores_all, class_idxs_all = [
+            cat(x) for x in [boxes_all, scores_all, class_idxs_all]
+        ]
+        keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold)
+        keep = keep[: self.max_detections_per_image]
+
+        result = Instances(image_size)
+        result.pred_boxes = Boxes(boxes_all[keep])
+        result.scores = scores_all[keep]
+        result.pred_classes = class_idxs_all[keep]
+        return result
+
+    def preprocess_image(self, batched_inputs):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        return images
+
+
+class RetinaNetHead(nn.Module):
+    """
+    The head used in RetinaNet for object classification and box regression.
+    It has two subnets for the two tasks, with a common structure but separate parameters.
+    """
+
+    def __init__(self, cfg, input_shape: List[ShapeSpec]):
+        super().__init__()
+        # fmt: off
+        in_channels      = input_shape[0].channels
+        num_classes      = cfg.MODEL.RETINANET.NUM_CLASSES
+        num_convs        = cfg.MODEL.RETINANET.NUM_CONVS
+        prior_prob       = cfg.MODEL.RETINANET.PRIOR_PROB
+        num_anchors      = build_anchor_generator(cfg, input_shape).num_cell_anchors
+        # fmt: on
+        assert (
+            len(set(num_anchors)) == 1
+        ), "Using different number of anchors between levels is not currently supported!"
+        num_anchors = num_anchors[0]
+
+        cls_subnet = []
+        bbox_subnet = []
+        for _ in range(num_convs):
+            cls_subnet.append(
+                nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+            )
+            cls_subnet.append(nn.ReLU())
+            bbox_subnet.append(
+                nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+            )
+            bbox_subnet.append(nn.ReLU())
+
+        self.cls_subnet = nn.Sequential(*cls_subnet)
+        self.bbox_subnet = nn.Sequential(*bbox_subnet)
+        self.cls_score = nn.Conv2d(
+            in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
+        )
+        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)
+
+        # Initialization
+        for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
+            for layer in modules.modules():
+                if isinstance(layer, nn.Conv2d):
+                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
+                    torch.nn.init.constant_(layer.bias, 0)
+
+        # Use prior in model initialization to improve stability
+        bias_value = -(math.log((1 - prior_prob) / prior_prob))
+        torch.nn.init.constant_(self.cls_score.bias, bias_value)
+
+    def forward(self, features):
+        """
+        Arguments:
+            features (list[Tensor]): FPN feature map tensors in high to low resolution.
+                Each tensor in the list correspond to different feature levels.
+
+        Returns:
+            logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
+                The tensor predicts the classification probability
+                at each spatial position for each of the A anchors and K object
+                classes.
+            bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
+                The tensor predicts 4-vector (dx,dy,dw,dh) box
+                regression values for every anchor. These values are the
+                relative offset between the anchor and the ground truth box.
+        """
+        logits = []
+        bbox_reg = []
+        for feature in features:
+            logits.append(self.cls_score(self.cls_subnet(feature)))
+            bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
+        return logits, bbox_reg
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/semantic_seg.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/semantic_seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c41a7235cb9c578e2c6de5835854bdff7493616
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/meta_arch/semantic_seg.py
@@ -0,0 +1,186 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+from typing import Dict
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ShapeSpec
+from detectron2.structures import ImageList
+from detectron2.utils.registry import Registry
+
+from ..backbone import build_backbone
+from ..postprocessing import sem_seg_postprocess
+from .build import META_ARCH_REGISTRY
+
+__all__ = ["SemanticSegmentor", "SEM_SEG_HEADS_REGISTRY", "SemSegFPNHead", "build_sem_seg_head"]
+
+
+SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS")
+SEM_SEG_HEADS_REGISTRY.__doc__ = """
+Registry for semantic segmentation heads, which make semantic segmentation predictions
+from feature maps.
+"""
+
+
+@META_ARCH_REGISTRY.register()
+class SemanticSegmentor(nn.Module):
+    """
+    Main class for semantic segmentation architectures.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.backbone = build_backbone(cfg)
+        self.sem_seg_head = build_sem_seg_head(cfg, self.backbone.output_shape())
+        self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
+        self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+
+                For now, each item in the list is a dict that contains:
+
+                   * "image": Tensor, image in (C, H, W) format.
+                   * "sem_seg": semantic segmentation ground truth
+                   * Other information that's included in the original dicts, such as:
+                     "height", "width" (int): the output resolution of the model, used in inference.
+                     See :meth:`postprocess` for details.
+
+        Returns:
+            list[dict]:
+              Each dict is the output for one input image.
+              The dict contains one key "sem_seg" whose value is a
+              Tensor that represents the
+              per-pixel segmentation prediced by the head.
+              The prediction has shape KxHxW that represents the logits of
+              each class for each pixel.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+
+        features = self.backbone(images.tensor)
+
+        if "sem_seg" in batched_inputs[0]:
+            targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
+            targets = ImageList.from_tensors(
+                targets, self.backbone.size_divisibility, self.sem_seg_head.ignore_value
+            ).tensor
+        else:
+            targets = None
+        results, losses = self.sem_seg_head(features, targets)
+
+        if self.training:
+            return losses
+
+        processed_results = []
+        for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes):
+            height = input_per_image.get("height")
+            width = input_per_image.get("width")
+            r = sem_seg_postprocess(result, image_size, height, width)
+            processed_results.append({"sem_seg": r})
+        return processed_results
+
+
+def build_sem_seg_head(cfg, input_shape):
+    """
+    Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`.
+    """
+    name = cfg.MODEL.SEM_SEG_HEAD.NAME
+    return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
+
+
+@SEM_SEG_HEADS_REGISTRY.register()
+class SemSegFPNHead(nn.Module):
+    """
+    A semantic segmentation head described in :paper:`PanopticFPN`.
+    It takes FPN features as input and merges information from all
+    levels of the FPN into single output.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        super().__init__()
+
+        # fmt: off
+        self.in_features      = cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+        feature_strides       = {k: v.stride for k, v in input_shape.items()}
+        feature_channels      = {k: v.channels for k, v in input_shape.items()}
+        self.ignore_value     = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
+        num_classes           = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
+        conv_dims             = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+        self.common_stride    = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE
+        norm                  = cfg.MODEL.SEM_SEG_HEAD.NORM
+        self.loss_weight      = cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT
+        # fmt: on
+
+        self.scale_heads = []
+        for in_feature in self.in_features:
+            head_ops = []
+            head_length = max(
+                1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
+            )
+            for k in range(head_length):
+                norm_module = nn.GroupNorm(32, conv_dims) if norm == "GN" else None
+                conv = Conv2d(
+                    feature_channels[in_feature] if k == 0 else conv_dims,
+                    conv_dims,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=not norm,
+                    norm=norm_module,
+                    activation=F.relu,
+                )
+                weight_init.c2_msra_fill(conv)
+                head_ops.append(conv)
+                if feature_strides[in_feature] != self.common_stride:
+                    head_ops.append(
+                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+                    )
+            self.scale_heads.append(nn.Sequential(*head_ops))
+            self.add_module(in_feature, self.scale_heads[-1])
+        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
+        weight_init.c2_msra_fill(self.predictor)
+
+    def forward(self, features, targets=None):
+        """
+        Returns:
+            In training, returns (None, dict of losses)
+            In inference, returns (CxHxW logits, {})
+        """
+        x = self.layers(features)
+        if self.training:
+            return None, self.losses(x, targets)
+        else:
+            x = F.interpolate(
+                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
+            )
+            return x, {}
+
+    def layers(self, features):
+        for i, f in enumerate(self.in_features):
+            if i == 0:
+                x = self.scale_heads[i](features[f])
+            else:
+                x = x + self.scale_heads[i](features[f])
+        x = self.predictor(x)
+        return x
+
+    def losses(self, predictions, targets):
+        predictions = F.interpolate(
+            predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False
+        )
+        loss = F.cross_entropy(
+            predictions, targets, reduction="mean", ignore_index=self.ignore_value
+        )
+        losses = {"loss_sem_seg": loss * self.loss_weight}
+        return losses
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/poolers.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/poolers.py
new file mode 100644
index 0000000000000000000000000000000000000000..678f5afc5680e6bdc9931f0449e2ab334a3a5369
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/poolers.py
@@ -0,0 +1,231 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import math
+import sys
+import torch
+from torch import nn
+from torchvision.ops import RoIPool
+
+from detectron2.layers import ROIAlign, ROIAlignRotated, cat
+
+__all__ = ["ROIPooler"]
+
+
+def assign_boxes_to_levels(box_lists, min_level, max_level, canonical_box_size, canonical_level):
+    """
+    Map each box in `box_lists` to a feature map level index and return the assignment
+    vector.
+
+    Args:
+        box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
+            where N is the number of images in the batch.
+        min_level (int): Smallest feature map level index. The input is considered index 0,
+            the output of stage 1 is index 1, and so.
+        max_level (int): Largest feature map level index.
+        canonical_box_size (int): A canonical box size in pixels (sqrt(box area)).
+        canonical_level (int): The feature map level index on which a canonically-sized box
+            should be placed.
+
+    Returns:
+        A tensor of length M, where M is the total number of boxes aggregated over all
+            N batch images. The memory layout corresponds to the concatenation of boxes
+            from all images. Each element is the feature map index, as an offset from
+            `self.min_level`, for the corresponding box (so value i means the box is at
+            `self.min_level + i`).
+    """
+    eps = sys.float_info.epsilon
+    box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists]))
+    # Eqn.(1) in FPN paper
+    level_assignments = torch.floor(
+        canonical_level + torch.log2(box_sizes / canonical_box_size + eps)
+    )
+    # clamp level to (min, max), in case the box size is too large or too small
+    # for the available feature maps
+    level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level)
+    return level_assignments.to(torch.int64) - min_level
+
+
+def convert_boxes_to_pooler_format(box_lists):
+    """
+    Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops
+    (see description under Returns).
+
+    Args:
+        box_lists (list[Boxes] | list[RotatedBoxes]):
+            A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
+
+    Returns:
+        When input is list[Boxes]:
+            A tensor of shape (M, 5), where M is the total number of boxes aggregated over all
+            N batch images.
+            The 5 columns are (batch index, x0, y0, x1, y1), where batch index
+            is the index in [0, N) identifying which batch image the box with corners at
+            (x0, y0, x1, y1) comes from.
+        When input is list[RotatedBoxes]:
+            A tensor of shape (M, 6), where M is the total number of boxes aggregated over all
+            N batch images.
+            The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees),
+            where batch index is the index in [0, N) identifying which batch image the
+            rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from.
+    """
+
+    def fmt_box_list(box_tensor, batch_index):
+        repeated_index = torch.full(
+            (len(box_tensor), 1), batch_index, dtype=box_tensor.dtype, device=box_tensor.device
+        )
+        return cat((repeated_index, box_tensor), dim=1)
+
+    pooler_fmt_boxes = cat(
+        [fmt_box_list(box_list.tensor, i) for i, box_list in enumerate(box_lists)], dim=0
+    )
+
+    return pooler_fmt_boxes
+
+
+class ROIPooler(nn.Module):
+    """
+    Region of interest feature map pooler that supports pooling from one or more
+    feature maps.
+    """
+
+    def __init__(
+        self,
+        output_size,
+        scales,
+        sampling_ratio,
+        pooler_type,
+        canonical_box_size=224,
+        canonical_level=4,
+    ):
+        """
+        Args:
+            output_size (int, tuple[int] or list[int]): output size of the pooled region,
+                e.g., 14 x 14. If tuple or list is given, the length must be 2.
+            scales (list[float]): The scale for each low-level pooling op relative to
+                the input image. For a feature map with stride s relative to the input
+                image, scale is defined as a 1 / s. The stride must be power of 2.
+                When there are multiple scales, they must form a pyramid, i.e. they must be
+                a monotically decreasing geometric sequence with a factor of 1/2.
+            sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op.
+            pooler_type (string): Name of the type of pooling operation that should be applied.
+                For instance, "ROIPool" or "ROIAlignV2".
+            canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default
+                is heuristically defined as 224 pixels in the FPN paper (based on ImageNet
+                pre-training).
+            canonical_level (int): The feature map level index from which a canonically-sized box
+                should be placed. The default is defined as level 4 (stride=16) in the FPN paper,
+                i.e., a box of size 224x224 will be placed on the feature with stride=16.
+                The box placement for all boxes will be determined from their sizes w.r.t
+                canonical_box_size. For example, a box whose area is 4x that of a canonical box
+                should be used to pool features from feature level ``canonical_level+1``.
+
+                Note that the actual input feature maps given to this module may not have
+                sufficiently many levels for the input boxes. If the boxes are too large or too
+                small for the input feature maps, the closest level will be used.
+        """
+        super().__init__()
+
+        if isinstance(output_size, int):
+            output_size = (output_size, output_size)
+        assert len(output_size) == 2
+        assert isinstance(output_size[0], int) and isinstance(output_size[1], int)
+        self.output_size = output_size
+
+        if pooler_type == "ROIAlign":
+            self.level_poolers = nn.ModuleList(
+                ROIAlign(
+                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False
+                )
+                for scale in scales
+            )
+        elif pooler_type == "ROIAlignV2":
+            self.level_poolers = nn.ModuleList(
+                ROIAlign(
+                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True
+                )
+                for scale in scales
+            )
+        elif pooler_type == "ROIPool":
+            self.level_poolers = nn.ModuleList(
+                RoIPool(output_size, spatial_scale=scale) for scale in scales
+            )
+        elif pooler_type == "ROIAlignRotated":
+            self.level_poolers = nn.ModuleList(
+                ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio)
+                for scale in scales
+            )
+        else:
+            raise ValueError("Unknown pooler type: {}".format(pooler_type))
+
+        # Map scale (defined as 1 / stride) to its feature map level under the
+        # assumption that stride is a power of 2.
+        min_level = -(math.log2(scales[0]))
+        max_level = -(math.log2(scales[-1]))
+        assert math.isclose(min_level, int(min_level)) and math.isclose(
+            max_level, int(max_level)
+        ), "Featuremap stride is not power of 2!"
+        self.min_level = int(min_level)
+        self.max_level = int(max_level)
+        assert (
+            len(scales) == self.max_level - self.min_level + 1
+        ), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!"
+        assert 0 < self.min_level and self.min_level <= self.max_level
+        self.canonical_level = canonical_level
+        assert canonical_box_size > 0
+        self.canonical_box_size = canonical_box_size
+
+    def forward(self, x, box_lists):
+        """
+        Args:
+            x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those
+                used to construct this module.
+            box_lists (list[Boxes] | list[RotatedBoxes]):
+                A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
+                The box coordinates are defined on the original image and
+                will be scaled by the `scales` argument of :class:`ROIPooler`.
+
+        Returns:
+            Tensor:
+                A tensor of shape (M, C, output_size, output_size) where M is the total number of
+                boxes aggregated over all N batch images and C is the number of channels in `x`.
+        """
+        num_level_assignments = len(self.level_poolers)
+
+        assert isinstance(x, list) and isinstance(
+            box_lists, list
+        ), "Arguments to pooler must be lists"
+        assert (
+            len(x) == num_level_assignments
+        ), "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
+            num_level_assignments, len(x)
+        )
+
+        assert len(box_lists) == x[0].size(
+            0
+        ), "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
+            x[0].size(0), len(box_lists)
+        )
+
+        pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists)
+
+        if num_level_assignments == 1:
+            return self.level_poolers[0](x[0], pooler_fmt_boxes)
+
+        level_assignments = assign_boxes_to_levels(
+            box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level
+        )
+
+        num_boxes = len(pooler_fmt_boxes)
+        num_channels = x[0].shape[1]
+        output_size = self.output_size[0]
+
+        dtype, device = x[0].dtype, x[0].device
+        output = torch.zeros(
+            (num_boxes, num_channels, output_size, output_size), dtype=dtype, device=device
+        )
+
+        for level, (x_level, pooler) in enumerate(zip(x, self.level_poolers)):
+            inds = torch.nonzero(level_assignments == level, as_tuple=True)[0]
+            pooler_fmt_boxes_level = pooler_fmt_boxes[inds]
+            output[inds] = pooler(x_level, pooler_fmt_boxes_level)
+
+        return output
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/postprocessing.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/postprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..e85541ff2e25568cdb9c73702f6c9e68a23f6e4c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/postprocessing.py
@@ -0,0 +1,79 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from torch.nn import functional as F
+
+from detectron2.layers import paste_masks_in_image
+from detectron2.structures import Instances
+from detectron2.utils.memory import retry_if_cuda_oom
+
+
+def detector_postprocess(results, output_height, output_width, mask_threshold=0.5):
+    """
+    Resize the output instances.
+    The input images are often resized when entering an object detector.
+    As a result, we often need the outputs of the detector in a different
+    resolution from its inputs.
+
+    This function will resize the raw outputs of an R-CNN detector
+    to produce outputs according to the desired output resolution.
+
+    Args:
+        results (Instances): the raw outputs from the detector.
+            `results.image_size` contains the input image resolution the detector sees.
+            This object might be modified in-place.
+        output_height, output_width: the desired output resolution.
+
+    Returns:
+        Instances: the resized output from the model, based on the output resolution
+    """
+    scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0])
+    results = Instances((output_height, output_width), **results.get_fields())
+
+    if results.has("pred_boxes"):
+        output_boxes = results.pred_boxes
+    elif results.has("proposal_boxes"):
+        output_boxes = results.proposal_boxes
+
+    output_boxes.scale(scale_x, scale_y)
+    output_boxes.clip(results.image_size)
+
+    results = results[output_boxes.nonempty()]
+
+    if results.has("pred_masks"):
+        results.pred_masks = retry_if_cuda_oom(paste_masks_in_image)(
+            results.pred_masks[:, 0, :, :],  # N, 1, M, M
+            results.pred_boxes,
+            results.image_size,
+            threshold=mask_threshold,
+        )
+
+    if results.has("pred_keypoints"):
+        results.pred_keypoints[:, :, 0] *= scale_x
+        results.pred_keypoints[:, :, 1] *= scale_y
+
+    return results
+
+
+def sem_seg_postprocess(result, img_size, output_height, output_width):
+    """
+    Return semantic segmentation predictions in the original resolution.
+
+    The input images are often resized when entering semantic segmentor. Moreover, in same
+    cases, they also padded inside segmentor to be divisible by maximum network stride.
+    As a result, we often need the predictions of the segmentor in a different
+    resolution from its inputs.
+
+    Args:
+        result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
+            where C is the number of classes, and H, W are the height and width of the prediction.
+        img_size (tuple): image size that segmentor is taking as input.
+        output_height, output_width: the desired output resolution.
+
+    Returns:
+        semantic segmentation prediction (Tensor): A tensor of the shape
+            (C, output_height, output_width) that contains per-pixel soft predictions.
+    """
+    result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
+    result = F.interpolate(
+        result, size=(output_height, output_width), mode="bilinear", align_corners=False
+    )[0]
+    return result
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..64fb6d46359c05ed3d7aa1ec91fdd6e15b14c932
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator
+from .rpn import RPN_HEAD_REGISTRY, build_rpn_head, RPN
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/build.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f252bcb982032cd09270c44741772a34ef32277
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/build.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from detectron2.utils.registry import Registry
+
+PROPOSAL_GENERATOR_REGISTRY = Registry("PROPOSAL_GENERATOR")
+PROPOSAL_GENERATOR_REGISTRY.__doc__ = """
+Registry for proposal generator, which produces object proposals from feature maps.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+The call should return a `nn.Module` object.
+"""
+
+from . import rpn, rrpn  # noqa F401 isort:skip
+
+
+def build_proposal_generator(cfg, input_shape):
+    """
+    Build a proposal generator from `cfg.MODEL.PROPOSAL_GENERATOR.NAME`.
+    The name can be "PrecomputedProposals" to use no proposal generator.
+    """
+    name = cfg.MODEL.PROPOSAL_GENERATOR.NAME
+    if name == "PrecomputedProposals":
+        return None
+
+    return PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/proposal_utils.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/proposal_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4af90525ba07eb8d313460ee2c3f468fe367cff
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/proposal_utils.py
@@ -0,0 +1,57 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+import torch
+
+from detectron2.structures import Instances
+
+
+def add_ground_truth_to_proposals(gt_boxes, proposals):
+    """
+    Call `add_ground_truth_to_proposals_single_image` for all images.
+
+    Args:
+        gt_boxes(list[Boxes]): list of N elements. Element i is a Boxes
+            representing the gound-truth for image i.
+        proposals (list[Instances]): list of N elements. Element i is a Instances
+            representing the proposals for image i.
+
+    Returns:
+        list[Instances]: list of N Instances. Each is the proposals for the image,
+            with field "proposal_boxes" and "objectness_logits".
+    """
+    assert gt_boxes is not None
+
+    assert len(proposals) == len(gt_boxes)
+    if len(proposals) == 0:
+        return proposals
+
+    return [
+        add_ground_truth_to_proposals_single_image(gt_boxes_i, proposals_i)
+        for gt_boxes_i, proposals_i in zip(gt_boxes, proposals)
+    ]
+
+
+def add_ground_truth_to_proposals_single_image(gt_boxes, proposals):
+    """
+    Augment `proposals` with ground-truth boxes from `gt_boxes`.
+
+    Args:
+        Same as `add_ground_truth_to_proposals`, but with gt_boxes and proposals
+        per image.
+
+    Returns:
+        Same as `add_ground_truth_to_proposals`, but for only one image.
+    """
+    device = proposals.objectness_logits.device
+    # Concatenating gt_boxes with proposals requires them to have the same fields
+    # Assign all ground-truth boxes an objectness logit corresponding to P(object) \approx 1.
+    gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10)))
+
+    gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device)
+    gt_proposal = Instances(proposals.image_size)
+
+    gt_proposal.proposal_boxes = gt_boxes
+    gt_proposal.objectness_logits = gt_logits
+    new_proposals = Instances.cat([proposals, gt_proposal])
+
+    return new_proposals
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/rpn.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/rpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..8eb93b8e6ecf9f14d5b8de5a7e1d2b1560bcacfd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/rpn.py
@@ -0,0 +1,285 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from typing import Dict, List
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.utils.memory import retry_if_cuda_oom
+from detectron2.utils.registry import Registry
+
+from ..anchor_generator import build_anchor_generator
+from ..box_regression import Box2BoxTransform
+from ..matcher import Matcher
+from ..sampling import subsample_labels
+from .build import PROPOSAL_GENERATOR_REGISTRY
+from .rpn_outputs import RPNOutputs, find_top_rpn_proposals
+
+RPN_HEAD_REGISTRY = Registry("RPN_HEAD")
+RPN_HEAD_REGISTRY.__doc__ = """
+Registry for RPN heads, which take feature maps and perform
+objectness classification and bounding box regression for anchors.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+The call should return a `nn.Module` object.
+"""
+
+
+def build_rpn_head(cfg, input_shape):
+    """
+    Build an RPN head defined by `cfg.MODEL.RPN.HEAD_NAME`.
+    """
+    name = cfg.MODEL.RPN.HEAD_NAME
+    return RPN_HEAD_REGISTRY.get(name)(cfg, input_shape)
+
+
+@RPN_HEAD_REGISTRY.register()
+class StandardRPNHead(nn.Module):
+    """
+    Standard RPN classification and regression heads described in :paper:`Faster R-CNN`.
+    Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts
+    objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas
+    specifying how to deform each anchor into an object proposal.
+    """
+
+    @configurable
+    def __init__(self, *, in_channels: int, num_anchors: int, box_dim: int = 4):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            in_channels (int): number of input feature channels. When using multiple
+                input features, they must have the same number of channels.
+            num_anchors (int): number of anchors to predict for *each spatial position*
+                on the feature map. The total number of anchors for each
+                feature map will be `num_anchors * H * W`.
+            box_dim (int): dimension of a box, which is also the number of box regression
+                predictions to make for each anchor. An axis aligned box has
+                box_dim=4, while a rotated box has box_dim=5.
+        """
+        super().__init__()
+        # 3x3 conv for the hidden representation
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+        # 1x1 conv for predicting objectness logits
+        self.objectness_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
+        # 1x1 conv for predicting box2box transform deltas
+        self.anchor_deltas = nn.Conv2d(in_channels, num_anchors * box_dim, kernel_size=1, stride=1)
+
+        for l in [self.conv, self.objectness_logits, self.anchor_deltas]:
+            nn.init.normal_(l.weight, std=0.01)
+            nn.init.constant_(l.bias, 0)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        # Standard RPN is shared across levels:
+        in_channels = [s.channels for s in input_shape]
+        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
+        in_channels = in_channels[0]
+
+        # RPNHead should take the same input as anchor generator
+        # NOTE: it assumes that creating an anchor generator does not have unwanted side effect.
+        anchor_generator = build_anchor_generator(cfg, input_shape)
+        num_anchors = anchor_generator.num_anchors
+        box_dim = anchor_generator.box_dim
+        assert (
+            len(set(num_anchors)) == 1
+        ), "Each level must have the same number of anchors per spatial position"
+        return {"in_channels": in_channels, "num_anchors": num_anchors[0], "box_dim": box_dim}
+
+    def forward(self, features):
+        """
+        Args:
+            features (list[Tensor]): list of feature maps
+
+        Returns:
+            list[Tensor]: A list of L elements.
+                Element i is a tensor of shape (N, A, Hi, Wi) representing
+                the predicted objectness logits for all anchors. A is the number of cell anchors.
+            list[Tensor]: A list of L elements. Element i is a tensor of shape
+                (N, A*box_dim, Hi, Wi) representing the predicted "deltas" used to transform anchors
+                to proposals.
+        """
+        pred_objectness_logits = []
+        pred_anchor_deltas = []
+        for x in features:
+            t = F.relu(self.conv(x))
+            pred_objectness_logits.append(self.objectness_logits(t))
+            pred_anchor_deltas.append(self.anchor_deltas(t))
+        return pred_objectness_logits, pred_anchor_deltas
+
+
+@PROPOSAL_GENERATOR_REGISTRY.register()
+class RPN(nn.Module):
+    """
+    Region Proposal Network, introduced by :paper:`Faster R-CNN`.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        super().__init__()
+
+        # fmt: off
+        self.min_box_side_len     = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE
+        self.in_features          = cfg.MODEL.RPN.IN_FEATURES
+        self.nms_thresh           = cfg.MODEL.RPN.NMS_THRESH
+        self.batch_size_per_image = cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE
+        self.positive_fraction    = cfg.MODEL.RPN.POSITIVE_FRACTION
+        self.smooth_l1_beta       = cfg.MODEL.RPN.SMOOTH_L1_BETA
+        self.loss_weight          = cfg.MODEL.RPN.LOSS_WEIGHT
+        # fmt: on
+
+        # Map from self.training state to train/test settings
+        self.pre_nms_topk = {
+            True: cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN,
+            False: cfg.MODEL.RPN.PRE_NMS_TOPK_TEST,
+        }
+        self.post_nms_topk = {
+            True: cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN,
+            False: cfg.MODEL.RPN.POST_NMS_TOPK_TEST,
+        }
+        self.boundary_threshold = cfg.MODEL.RPN.BOUNDARY_THRESH
+
+        self.anchor_generator = build_anchor_generator(
+            cfg, [input_shape[f] for f in self.in_features]
+        )
+        self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
+        self.anchor_matcher = Matcher(
+            cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, allow_low_quality_matches=True
+        )
+        self.rpn_head = build_rpn_head(cfg, [input_shape[f] for f in self.in_features])
+
+    def _subsample_labels(self, label):
+        """
+        Randomly sample a subset of positive and negative examples, and overwrite
+        the label vector to the ignore value (-1) for all elements that are not
+        included in the sample.
+
+        Args:
+            labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
+        """
+        pos_idx, neg_idx = subsample_labels(
+            label, self.batch_size_per_image, self.positive_fraction, 0
+        )
+        # Fill with the ignore label (-1), then set positive and negative labels
+        label.fill_(-1)
+        label.scatter_(0, pos_idx, 1)
+        label.scatter_(0, neg_idx, 0)
+        return label
+
+    @torch.no_grad()
+    def label_and_sample_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
+        """
+        Args:
+            anchors (list[Boxes]): anchors for each feature map.
+            gt_instances: the ground-truth instances for each image.
+
+        Returns:
+            list[Tensor]:
+                List of #demo tensors. i-th element is a vector of labels whose length is
+                the total number of anchors across feature maps. Label values are in {-1, 0, 1},
+                with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
+            list[Tensor]:
+                i-th element is a Nx4 tensor, where N is the total number of anchors across
+                feature maps.  The values are the matched gt boxes for each anchor.
+                Values are undefined for those anchors not labeled as 1.
+        """
+        anchors = Boxes.cat(anchors)
+
+        gt_boxes = [x.gt_boxes for x in gt_instances]
+        image_sizes = [x.image_size for x in gt_instances]
+        del gt_instances
+
+        gt_labels = []
+        matched_gt_boxes = []
+        for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes):
+            """
+            image_size_i: (h, w) for the i-th image
+            gt_boxes_i: ground-truth boxes for i-th image
+            """
+
+            match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors)
+            matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
+            # Matching is memory-expensive and may result in CPU tensors. But the result is small
+            gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
+            del match_quality_matrix
+
+            if self.boundary_threshold >= 0:
+                # Discard anchors that go out of the boundaries of the image
+                # NOTE: This is legacy functionality that is turned off by default in Detectron2
+                anchors_inside_image = anchors.inside_box(image_size_i, self.boundary_threshold)
+                gt_labels_i[~anchors_inside_image] = -1
+
+            # A vector of labels (-1, 0, 1) for each anchor
+            gt_labels_i = self._subsample_labels(gt_labels_i)
+
+            if len(gt_boxes_i) == 0:
+                # These values won't be used anyway since the anchor is labeled as background
+                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
+            else:
+                # TODO wasted indexing computation for ignored boxes
+                matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor
+
+            gt_labels.append(gt_labels_i)  # N,AHW
+            matched_gt_boxes.append(matched_gt_boxes_i)
+        return gt_labels, matched_gt_boxes
+
+    def forward(self, images, features, gt_instances=None):
+        """
+        Args:
+            images (ImageList): input images of length `N`
+            features (dict[str: Tensor]): input data as a mapping from feature
+                map name to tensor. Axis 0 represents the number of images `N` in
+                the input data; axes 1-3 are channels, height, and width, which may
+                vary between feature maps (e.g., if a feature pyramid is used).
+            gt_instances (list[Instances], optional): a length `N` list of `Instances`s.
+                Each `Instances` stores ground-truth instances for the corresponding image.
+
+        Returns:
+            proposals: list[Instances]: contains fields "proposal_boxes", "objectness_logits"
+            loss: dict[Tensor] or None
+        """
+        features = [features[f] for f in self.in_features]
+        pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features)
+        anchors = self.anchor_generator(features)
+
+        if self.training:
+            gt_labels, gt_boxes = self.label_and_sample_anchors(anchors, gt_instances)
+        else:
+            gt_labels, gt_boxes = None, None
+
+        outputs = RPNOutputs(
+            self.box2box_transform,
+            self.batch_size_per_image,
+            images,
+            pred_objectness_logits,
+            pred_anchor_deltas,
+            anchors,
+            gt_labels,
+            gt_boxes,
+            self.smooth_l1_beta,
+        )
+
+        if self.training:
+            losses = {k: v * self.loss_weight for k, v in outputs.losses().items()}
+        else:
+            losses = {}
+
+        with torch.no_grad():
+            # Find the top proposals by applying NMS and removing boxes that
+            # are too small. The proposals are treated as fixed for approximate
+            # joint training with roi heads. This approach ignores the derivative
+            # w.r.t. the proposal boxes’ coordinates that are also network
+            # responses, so is approximate.
+            proposals = find_top_rpn_proposals(
+                outputs.predict_proposals(),
+                outputs.predict_objectness_logits(),
+                images,
+                self.nms_thresh,
+                self.pre_nms_topk[self.training],
+                self.post_nms_topk[self.training],
+                self.min_box_side_len,
+                self.training,
+            )
+
+        return proposals, losses
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/rpn_outputs.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/rpn_outputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..44f846f18b30d846d1d87faf7f2aa3b10c2333b8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/rpn_outputs.py
@@ -0,0 +1,323 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+import logging
+import torch
+import torch.nn.functional as F
+from fvcore.nn import smooth_l1_loss
+
+from detectron2.layers import batched_nms, cat
+from detectron2.structures import Boxes, Instances
+from detectron2.utils.events import get_event_storage
+
+logger = logging.getLogger(__name__)
+
+# TODO: comments for future refactoring of this module
+#
+# From @rbg:
+# This code involves a significant amount of tensor reshaping and permuting. Look for
+# ways to simplify this.
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    L: number of feature maps per image on which RPN is run
+    A: number of cell anchors (must be the same for all feature maps)
+    Hi, Wi: height and width of the i-th feature map
+    4: size of the box parameterization
+
+Naming convention:
+
+    objectness: refers to the binary classification of an anchor as object vs. not
+    object.
+
+    deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
+    transform (see :class:`box_regression.Box2BoxTransform`).
+
+    pred_objectness_logits: predicted objectness scores in [-inf, +inf]; use
+        sigmoid(pred_objectness_logits) to estimate P(object).
+
+    gt_labels: ground-truth binary classification labels for objectness
+
+    pred_anchor_deltas: predicted box2box transform deltas
+
+    gt_anchor_deltas: ground-truth box2box transform deltas
+"""
+
+
+def find_top_rpn_proposals(
+    proposals,
+    pred_objectness_logits,
+    images,
+    nms_thresh,
+    pre_nms_topk,
+    post_nms_topk,
+    min_box_side_len,
+    training,
+):
+    """
+    For each feature map, select the `pre_nms_topk` highest scoring proposals,
+    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
+    highest scoring proposals among all the feature maps if `training` is True,
+    otherwise, returns the highest `post_nms_topk` scoring proposals for each
+    feature map.
+
+    Args:
+        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
+            All proposal predictions on the feature maps.
+        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
+        images (ImageList): Input images as an :class:`ImageList`.
+        nms_thresh (float): IoU threshold to use for NMS
+        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
+            When RPN is run on multiple feature maps (as in FPN) this number is per
+            feature map.
+        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
+            When RPN is run on multiple feature maps (as in FPN) this number is total,
+            over all feature maps.
+        min_box_side_len (float): minimum proposal box side length in pixels (absolute units
+            wrt input images).
+        training (bool): True if proposals are to be used in training, otherwise False.
+            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
+            comment.
+
+    Returns:
+        proposals (list[Instances]): list of N Instances. The i-th Instances
+            stores post_nms_topk object proposals for image i, sorted by their
+            objectness score in descending order.
+    """
+    image_sizes = images.image_sizes  # in (h, w) order
+    num_images = len(image_sizes)
+    device = proposals[0].device
+
+    # 1. Select top-k anchor for every level and every image
+    topk_scores = []  # #lvl Tensor, each of shape N x topk
+    topk_proposals = []
+    level_ids = []  # #lvl Tensor, each of shape (topk,)
+    batch_idx = torch.arange(num_images, device=device)
+    for level_id, proposals_i, logits_i in zip(
+        itertools.count(), proposals, pred_objectness_logits
+    ):
+        Hi_Wi_A = logits_i.shape[1]
+        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)
+
+        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
+        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
+        logits_i, idx = logits_i.sort(descending=True, dim=1)
+        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
+        topk_idx = idx[batch_idx, :num_proposals_i]
+
+        # each is N x topk
+        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4
+
+        topk_proposals.append(topk_proposals_i)
+        topk_scores.append(topk_scores_i)
+        level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))
+
+    # 2. Concat all levels together
+    topk_scores = cat(topk_scores, dim=1)
+    topk_proposals = cat(topk_proposals, dim=1)
+    level_ids = cat(level_ids, dim=0)
+
+    # 3. For each image, run a per-level NMS, and choose topk results.
+    results = []
+    for n, image_size in enumerate(image_sizes):
+        boxes = Boxes(topk_proposals[n])
+        scores_per_img = topk_scores[n]
+        lvl = level_ids
+
+        valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
+        if not valid_mask.all():
+            if training:
+                raise FloatingPointError(
+                    "Predicted boxes or scores contain Inf/NaN. Training has diverged."
+                )
+            boxes = boxes[valid_mask]
+            scores_per_img = scores_per_img[valid_mask]
+            lvl = lvl[valid_mask]
+        boxes.clip(image_size)
+
+        # filter empty boxes
+        keep = boxes.nonempty(threshold=min_box_side_len)
+        if keep.sum().item() != len(boxes):
+            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]
+
+        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
+        # In Detectron1, there was different behavior during training vs. testing.
+        # (https://github.com/facebookresearch/Detectron/issues/459)
+        # During training, topk is over the proposals from *all* images in the training batch.
+        # During testing, it is over the proposals for each image separately.
+        # As a result, the training behavior becomes batch-dependent,
+        # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size.
+        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
+        keep = keep[:post_nms_topk]  # keep is already sorted
+
+        res = Instances(image_size)
+        res.proposal_boxes = boxes[keep]
+        res.objectness_logits = scores_per_img[keep]
+        results.append(res)
+    return results
+
+
+def rpn_losses(
+    gt_labels, gt_anchor_deltas, pred_objectness_logits, pred_anchor_deltas, smooth_l1_beta
+):
+    """
+    Args:
+        gt_labels (Tensor): shape (N,), each element in {-1, 0, 1} representing
+            ground-truth objectness labels with: -1 = ignore; 0 = not object; 1 = object.
+        gt_anchor_deltas (Tensor): shape (N, box_dim), row i represents ground-truth
+            box2box transform targets (dx, dy, dw, dh) or (dx, dy, dw, dh, da) that map anchor i to
+            its matched ground-truth box.
+        pred_objectness_logits (Tensor): shape (N,), each element is a predicted objectness
+            logit.
+        pred_anchor_deltas (Tensor): shape (N, box_dim), each row is a predicted box2box
+            transform (dx, dy, dw, dh) or (dx, dy, dw, dh, da)
+        smooth_l1_beta (float): The transition point between L1 and L2 loss in
+            the smooth L1 loss function. When set to 0, the loss becomes L1. When
+            set to +inf, the loss becomes constant 0.
+
+    Returns:
+        objectness_loss, localization_loss, both unnormalized (summed over samples).
+    """
+    pos_masks = gt_labels == 1
+    localization_loss = smooth_l1_loss(
+        pred_anchor_deltas[pos_masks], gt_anchor_deltas[pos_masks], smooth_l1_beta, reduction="sum"
+    )
+
+    valid_masks = gt_labels >= 0
+    objectness_loss = F.binary_cross_entropy_with_logits(
+        pred_objectness_logits[valid_masks],
+        gt_labels[valid_masks].to(torch.float32),
+        reduction="sum",
+    )
+    return objectness_loss, localization_loss
+
+
+class RPNOutputs(object):
+    def __init__(
+        self,
+        box2box_transform,
+        batch_size_per_image,
+        images,
+        pred_objectness_logits,
+        pred_anchor_deltas,
+        anchors,
+        gt_labels=None,
+        gt_boxes=None,
+        smooth_l1_beta=0.0,
+    ):
+        """
+        Args:
+            box2box_transform (Box2BoxTransform): :class:`Box2BoxTransform` instance for
+                anchor-proposal transformations.
+            images (ImageList): :class:`ImageList` instance representing N input images
+            batch_size_per_image (int): number of proposals to sample when training
+            pred_objectness_logits (list[Tensor]): A list of L elements.
+                Element i is a tensor of shape (N, A, Hi, Wi) representing
+                the predicted objectness logits for anchors.
+            pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
+                (N, A*4 or 5, Hi, Wi) representing the predicted "deltas" used to transform anchors
+                to proposals.
+            anchors (list[Boxes or RotatedBoxes]): A list of Boxes/RotatedBoxes storing the all
+                the anchors for each feature map. See :meth:`AnchorGenerator.forward`.
+            gt_labels (list[Tensor]): Available on in training.
+                See :meth:`RPN.label_and_sample_anchors`.
+            gt_boxes (list[Boxes or RotatedBoxes]): Available on in training.
+                See :meth:`RPN.label_and_sample_anchors`.
+            smooth_l1_beta (float): The transition point between L1 and L2 loss in
+                the smooth L1 loss function. When set to 0, the loss becomes L1. When
+                set to +inf, the loss becomes constant 0.
+        """
+        self.box2box_transform = box2box_transform
+        self.batch_size_per_image = batch_size_per_image
+
+        B = anchors[0].tensor.size(1)  # box dimension (4 or 5)
+        self.pred_objectness_logits = [
+            # Reshape: (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N, Hi*Wi*A)
+            score.permute(0, 2, 3, 1).flatten(1)
+            for score in pred_objectness_logits
+        ]
+
+        self.pred_anchor_deltas = [
+            # Reshape: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B)
+            #          -> (N, Hi*Wi*A, B)
+            x.view(x.shape[0], -1, B, x.shape[-2], x.shape[-1])
+            .permute(0, 3, 4, 1, 2)
+            .flatten(1, -2)
+            for x in pred_anchor_deltas
+        ]
+
+        self.anchors = anchors
+
+        self.gt_boxes = gt_boxes
+        self.gt_labels = gt_labels
+
+        self.num_images = len(images)
+        self.smooth_l1_beta = smooth_l1_beta
+
+    def losses(self):
+        """
+        Return the losses from a set of RPN predictions and their associated ground-truth.
+
+        Returns:
+            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
+                Loss names are: `loss_rpn_cls` for objectness classification and
+                `loss_rpn_loc` for proposal localization.
+        """
+        gt_labels = torch.stack(self.gt_labels)
+        anchors = self.anchors[0].cat(self.anchors).tensor  # Ax(4 or 5)
+        gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in self.gt_boxes]
+        gt_anchor_deltas = torch.stack(gt_anchor_deltas)
+
+        # Log the number of positive/negative anchors per-image that's used in training
+        num_pos_anchors = (gt_labels == 1).sum().item()
+        num_neg_anchors = (gt_labels == 0).sum().item()
+        storage = get_event_storage()
+        storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / self.num_images)
+        storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / self.num_images)
+
+        objectness_loss, localization_loss = rpn_losses(
+            gt_labels,
+            gt_anchor_deltas,
+            # concat on the Hi*Wi*A dimension
+            cat(self.pred_objectness_logits, dim=1),
+            cat(self.pred_anchor_deltas, dim=1),
+            self.smooth_l1_beta,
+        )
+        normalizer = self.batch_size_per_image * self.num_images
+        return {
+            "loss_rpn_cls": objectness_loss / normalizer,
+            "loss_rpn_loc": localization_loss / normalizer,
+        }
+
+    def predict_proposals(self):
+        """
+        Transform anchors into proposals by applying the predicted anchor deltas.
+
+        Returns:
+            proposals (list[Tensor]): A list of L tensors. Tensor i has shape
+                (N, Hi*Wi*A, B), where B is box dimension (4 or 5).
+        """
+        proposals = []
+        # For each feature map
+        for anchors_i, pred_anchor_deltas_i in zip(self.anchors, self.pred_anchor_deltas):
+            B = anchors_i.tensor.size(1)
+            N = self.num_images
+            pred_anchor_deltas_i = pred_anchor_deltas_i.reshape(-1, B)
+            # Expand anchors to shape (N*Hi*Wi*A, B)
+            anchors_i = anchors_i.tensor.unsqueeze(0).expand(N, -1, -1).reshape(-1, B)
+            proposals_i = self.box2box_transform.apply_deltas(pred_anchor_deltas_i, anchors_i)
+            # Append feature map proposals with shape (N, Hi*Wi*A, B)
+            proposals.append(proposals_i.view(N, -1, B))
+        return proposals
+
+    def predict_objectness_logits(self):
+        """
+        Return objectness logits in the same format as the proposals returned by
+        :meth:`predict_proposals`.
+
+        Returns:
+            pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape
+                (N, Hi*Wi*A).
+        """
+        return self.pred_objectness_logits
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/rrpn.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/rrpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c2ac366face34a12af63c9f13e6dbb14f59bf04
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/proposal_generator/rrpn.py
@@ -0,0 +1,233 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+import logging
+from typing import Dict, List
+import torch
+
+from detectron2.layers import ShapeSpec, batched_nms_rotated, cat
+from detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated
+from detectron2.utils.memory import retry_if_cuda_oom
+
+from ..box_regression import Box2BoxTransformRotated
+from .build import PROPOSAL_GENERATOR_REGISTRY
+from .rpn import RPN
+from .rpn_outputs import RPNOutputs
+
+logger = logging.getLogger(__name__)
+
+
+def find_top_rrpn_proposals(
+    proposals,
+    pred_objectness_logits,
+    images,
+    nms_thresh,
+    pre_nms_topk,
+    post_nms_topk,
+    min_box_side_len,
+    training,
+):
+    """
+    For each feature map, select the `pre_nms_topk` highest scoring proposals,
+    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
+    highest scoring proposals among all the feature maps if `training` is True,
+    otherwise, returns the highest `post_nms_topk` scoring proposals for each
+    feature map.
+
+    Args:
+        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5).
+            All proposal predictions on the feature maps.
+        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
+        images (ImageList): Input images as an :class:`ImageList`.
+        nms_thresh (float): IoU threshold to use for NMS
+        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
+            When RRPN is run on multiple feature maps (as in FPN) this number is per
+            feature map.
+        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
+            When RRPN is run on multiple feature maps (as in FPN) this number is total,
+            over all feature maps.
+        min_box_side_len (float): minimum proposal box side length in pixels (absolute units
+            wrt input images).
+        training (bool): True if proposals are to be used in training, otherwise False.
+            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
+            comment.
+
+    Returns:
+        proposals (list[Instances]): list of N Instances. The i-th Instances
+            stores post_nms_topk object proposals for image i.
+    """
+    image_sizes = images.image_sizes  # in (h, w) order
+    num_images = len(image_sizes)
+    device = proposals[0].device
+
+    # 1. Select top-k anchor for every level and every image
+    topk_scores = []  # #lvl Tensor, each of shape N x topk
+    topk_proposals = []
+    level_ids = []  # #lvl Tensor, each of shape (topk,)
+    batch_idx = torch.arange(num_images, device=device)
+    for level_id, proposals_i, logits_i in zip(
+        itertools.count(), proposals, pred_objectness_logits
+    ):
+        Hi_Wi_A = logits_i.shape[1]
+        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)
+
+        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
+        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
+        logits_i, idx = logits_i.sort(descending=True, dim=1)
+        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
+        topk_idx = idx[batch_idx, :num_proposals_i]
+
+        # each is N x topk
+        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 5
+
+        topk_proposals.append(topk_proposals_i)
+        topk_scores.append(topk_scores_i)
+        level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))
+
+    # 2. Concat all levels together
+    topk_scores = cat(topk_scores, dim=1)
+    topk_proposals = cat(topk_proposals, dim=1)
+    level_ids = cat(level_ids, dim=0)
+
+    # 3. For each image, run a per-level NMS, and choose topk results.
+    results = []
+    for n, image_size in enumerate(image_sizes):
+        boxes = RotatedBoxes(topk_proposals[n])
+        scores_per_img = topk_scores[n]
+        valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
+        if not valid_mask.all():
+            boxes = boxes[valid_mask]
+            scores_per_img = scores_per_img[valid_mask]
+        boxes.clip(image_size)
+
+        # filter empty boxes
+        keep = boxes.nonempty(threshold=min_box_side_len)
+        lvl = level_ids
+        if keep.sum().item() != len(boxes):
+            boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep], level_ids[keep])
+
+        keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh)
+        # In Detectron1, there was different behavior during training vs. testing.
+        # (https://github.com/facebookresearch/Detectron/issues/459)
+        # During training, topk is over the proposals from *all* images in the training batch.
+        # During testing, it is over the proposals for each image separately.
+        # As a result, the training behavior becomes batch-dependent,
+        # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size.
+        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
+        keep = keep[:post_nms_topk]
+
+        res = Instances(image_size)
+        res.proposal_boxes = boxes[keep]
+        res.objectness_logits = scores_per_img[keep]
+        results.append(res)
+    return results
+
+
+@PROPOSAL_GENERATOR_REGISTRY.register()
+class RRPN(RPN):
+    """
+    Rotated Region Proposal Network described in :paper:`RRPN`.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        super().__init__(cfg, input_shape)
+        self.box2box_transform = Box2BoxTransformRotated(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
+        if self.boundary_threshold >= 0:
+            raise NotImplementedError(
+                "boundary_threshold is a legacy option not implemented for RRPN."
+            )
+
+    @torch.no_grad()
+    def label_and_sample_anchors(self, anchors: List[RotatedBoxes], gt_instances: List[Instances]):
+        """
+        Args:
+            anchors (list[RotatedBoxes]): anchors for each feature map.
+            gt_instances: the ground-truth instances for each image.
+
+        Returns:
+            list[Tensor]:
+                List of #demo tensors. i-th element is a vector of labels whose length is
+                the total number of anchors across feature maps. Label values are in {-1, 0, 1},
+                with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
+            list[Tensor]:
+                i-th element is a Nx5 tensor, where N is the total number of anchors across
+                feature maps.  The values are the matched gt boxes for each anchor.
+                Values are undefined for those anchors not labeled as 1.
+        """
+        anchors = RotatedBoxes.cat(anchors)
+
+        gt_boxes = [x.gt_boxes for x in gt_instances]
+        del gt_instances
+
+        gt_labels = []
+        matched_gt_boxes = []
+        for gt_boxes_i in gt_boxes:
+            """
+            gt_boxes_i: ground-truth boxes for i-th image
+            """
+            match_quality_matrix = retry_if_cuda_oom(pairwise_iou_rotated)(gt_boxes_i, anchors)
+            matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
+            # Matching is memory-expensive and may result in CPU tensors. But the result is small
+            gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
+
+            # A vector of labels (-1, 0, 1) for each anchor
+            gt_labels_i = self._subsample_labels(gt_labels_i)
+
+            if len(gt_boxes_i) == 0:
+                # These values won't be used anyway since the anchor is labeled as background
+                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
+            else:
+                # TODO wasted indexing computation for ignored boxes
+                matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor
+
+            gt_labels.append(gt_labels_i)  # N,AHW
+            matched_gt_boxes.append(matched_gt_boxes_i)
+        return gt_labels, matched_gt_boxes
+
+    def forward(self, images, features, gt_instances=None):
+        # same signature as RPN.forward
+        features = [features[f] for f in self.in_features]
+        pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features)
+        anchors = self.anchor_generator(features)
+
+        if self.training:
+            gt_labels, gt_boxes = self.label_and_sample_anchors(anchors, gt_instances)
+        else:
+            gt_labels, gt_boxes = None, None
+
+        outputs = RPNOutputs(
+            self.box2box_transform,
+            self.batch_size_per_image,
+            images,
+            pred_objectness_logits,
+            pred_anchor_deltas,
+            anchors,
+            gt_labels,
+            gt_boxes,
+            self.smooth_l1_beta,
+        )
+
+        if self.training:
+            losses = {k: v * self.loss_weight for k, v in outputs.losses().items()}
+        else:
+            losses = {}
+
+        with torch.no_grad():
+            # Find the top proposals by applying NMS and removing boxes that
+            # are too small. The proposals are treated as fixed for approximate
+            # joint training with roi heads. This approach ignores the derivative
+            # w.r.t. the proposal boxes’ coordinates that are also network
+            # responses, so is approximate.
+
+            # Note: this line is the only difference v.s. RPN.forward
+            proposals = find_top_rrpn_proposals(
+                outputs.predict_proposals(),
+                outputs.predict_objectness_logits(),
+                images,
+                self.nms_thresh,
+                self.pre_nms_topk[self.training],
+                self.post_nms_topk[self.training],
+                self.min_box_side_len,
+                self.training,
+            )
+
+        return proposals, losses
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a49099aa5cfa58b55c66fe8fa85092eb26d15535
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .box_head import ROI_BOX_HEAD_REGISTRY, build_box_head
+from .keypoint_head import ROI_KEYPOINT_HEAD_REGISTRY, build_keypoint_head, BaseKeypointRCNNHead
+from .mask_head import ROI_MASK_HEAD_REGISTRY, build_mask_head, BaseMaskRCNNHead
+from .roi_heads import (
+    ROI_HEADS_REGISTRY,
+    ROIHeads,
+    Res5ROIHeads,
+    StandardROIHeads,
+    build_roi_heads,
+    select_foreground_proposals,
+)
+from .rotated_fast_rcnn import RROIHeads
+from .fast_rcnn import FastRCNNOutputLayers
+
+from . import cascade_rcnn  # isort:skip
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/box_head.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/box_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..de62d47acfd0ac634daf7db228b43f035cc721f3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/box_head.py
@@ -0,0 +1,115 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+from typing import List
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, Linear, ShapeSpec, get_norm
+from detectron2.utils.registry import Registry
+
+ROI_BOX_HEAD_REGISTRY = Registry("ROI_BOX_HEAD")
+ROI_BOX_HEAD_REGISTRY.__doc__ = """
+Registry for box heads, which make box predictions from per-region features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+@ROI_BOX_HEAD_REGISTRY.register()
+class FastRCNNConvFCHead(nn.Module):
+    """
+    A head with several 3x3 conv layers (each followed by norm & relu) and then
+    several fc layers (each followed by relu).
+    """
+
+    @configurable
+    def __init__(
+        self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm=""
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape (ShapeSpec): shape of the input feature.
+            conv_dims (list[int]): the output dimensions of the conv layers
+            fc_dims (list[int]): the output dimensions of the fc layers
+            conv_norm (str or callable): normalization for the conv layers.
+                See :func:`detectron2.layers.get_norm` for supported types.
+        """
+        super().__init__()
+        assert len(conv_dims) + len(fc_dims) > 0
+
+        self._output_size = (input_shape.channels, input_shape.height, input_shape.width)
+
+        self.conv_norm_relus = []
+        for k, conv_dim in enumerate(conv_dims):
+            conv = Conv2d(
+                self._output_size[0],
+                conv_dim,
+                kernel_size=3,
+                padding=1,
+                bias=not conv_norm,
+                norm=get_norm(conv_norm, conv_dim),
+                activation=F.relu,
+            )
+            self.add_module("conv{}".format(k + 1), conv)
+            self.conv_norm_relus.append(conv)
+            self._output_size = (conv_dim, self._output_size[1], self._output_size[2])
+
+        self.fcs = []
+        for k, fc_dim in enumerate(fc_dims):
+            fc = Linear(np.prod(self._output_size), fc_dim)
+            self.add_module("fc{}".format(k + 1), fc)
+            self.fcs.append(fc)
+            self._output_size = fc_dim
+
+        for layer in self.conv_norm_relus:
+            weight_init.c2_msra_fill(layer)
+        for layer in self.fcs:
+            weight_init.c2_xavier_fill(layer)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV
+        conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM
+        num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC
+        fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM
+        return {
+            "input_shape": input_shape,
+            "conv_dims": [conv_dim] * num_conv,
+            "fc_dims": [fc_dim] * num_fc,
+            "conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM,
+        }
+
+    def forward(self, x):
+        for layer in self.conv_norm_relus:
+            x = layer(x)
+        if len(self.fcs):
+            if x.dim() > 2:
+                x = torch.flatten(x, start_dim=1)
+            for layer in self.fcs:
+                x = F.relu(layer(x))
+        return x
+
+    @property
+    def output_shape(self):
+        """
+        Returns:
+            ShapeSpec: the output feature shape
+        """
+        o = self._output_size
+        if isinstance(o, int):
+            return ShapeSpec(channels=o)
+        else:
+            return ShapeSpec(channels=o[0], height=o[1], width=o[2])
+
+
+def build_box_head(cfg, input_shape):
+    """
+    Build a box head defined by `cfg.MODEL.ROI_BOX_HEAD.NAME`.
+    """
+    name = cfg.MODEL.ROI_BOX_HEAD.NAME
+    return ROI_BOX_HEAD_REGISTRY.get(name)(cfg, input_shape)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/cascade_rcnn.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/cascade_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3efdcf70c3b71b935676e103be288484c66f4e2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/cascade_rcnn.py
@@ -0,0 +1,298 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from typing import List
+import torch
+from torch import nn
+from torch.autograd.function import Function
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+
+from ..box_regression import Box2BoxTransform
+from ..matcher import Matcher
+from ..poolers import ROIPooler
+from .box_head import build_box_head
+from .fast_rcnn import FastRCNNOutputLayers, fast_rcnn_inference
+from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
+
+
+class _ScaleGradient(Function):
+    @staticmethod
+    def forward(ctx, input, scale):
+        ctx.scale = scale
+        return input
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output * ctx.scale, None
+
+
+@ROI_HEADS_REGISTRY.register()
+class CascadeROIHeads(StandardROIHeads):
+    """
+    Implement :paper:`Cascade R-CNN`.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        box_in_features: List[str],
+        box_pooler: ROIPooler,
+        box_heads: List[nn.Module],
+        box_predictors: List[nn.Module],
+        proposal_matchers: List[Matcher],
+        **kwargs,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            box_pooler (ROIPooler): pooler that extracts region features from given boxes
+            box_heads (list[nn.Module]): box head for each cascade stage
+            box_predictors (list[nn.Module]): box predictor for each cascade stage
+            proposal_matchers (list[Matcher]): matcher with different IoU thresholds to
+                match boxes with ground truth for each stage. The first matcher matches
+                RPN proposals with ground truth, the other matchers use boxes predicted
+                by the previous stage as proposals and match them with ground truth.
+        """
+        assert "proposal_matcher" not in kwargs, (
+            "CascadeROIHeads takes 'proposal_matchers=' for each stage instead "
+            "of one 'proposal_matcher='."
+        )
+        # The first matcher matches RPN proposals with ground truth, done in the base class
+        kwargs["proposal_matcher"] = proposal_matchers[0]
+        num_stages = self.num_cascade_stages = len(box_heads)
+        box_heads = nn.ModuleList(box_heads)
+        box_predictors = nn.ModuleList(box_predictors)
+        assert len(box_predictors) == num_stages, f"{len(box_predictors)} != {num_stages}!"
+        assert len(proposal_matchers) == num_stages, f"{len(proposal_matchers)} != {num_stages}!"
+        super().__init__(
+            box_in_features=box_in_features,
+            box_pooler=box_pooler,
+            box_head=box_heads,
+            box_predictor=box_predictors,
+            **kwargs,
+        )
+        self.proposal_matchers = proposal_matchers
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg, input_shape)
+        ret.pop("proposal_matcher")
+        return ret
+
+    @classmethod
+    def _init_box_head(cls, cfg, input_shape):
+        # fmt: off
+        in_features              = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution        = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales            = tuple(1.0 / input_shape[k].stride for k in in_features)
+        sampling_ratio           = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type              = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
+        cascade_ious             = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS
+        assert len(cascade_bbox_reg_weights) == len(cascade_ious)
+        assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,  \
+            "CascadeROIHeads only support class-agnostic regression now!"
+        assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0]
+        # fmt: on
+
+        in_channels = [input_shape[f].channels for f in in_features]
+        # Check all channel counts are equal
+        assert len(set(in_channels)) == 1, in_channels
+        in_channels = in_channels[0]
+
+        box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        pooled_shape = ShapeSpec(
+            channels=in_channels, width=pooler_resolution, height=pooler_resolution
+        )
+
+        box_heads, box_predictors, proposal_matchers = [], [], []
+        for match_iou, bbox_reg_weights in zip(cascade_ious, cascade_bbox_reg_weights):
+            box_head = build_box_head(cfg, pooled_shape)
+            box_heads.append(box_head)
+            box_predictors.append(
+                FastRCNNOutputLayers(
+                    cfg,
+                    box_head.output_shape,
+                    box2box_transform=Box2BoxTransform(weights=bbox_reg_weights),
+                )
+            )
+            proposal_matchers.append(Matcher([match_iou], [0, 1], allow_low_quality_matches=False))
+        return {
+            "box_in_features": in_features,
+            "box_pooler": box_pooler,
+            "box_heads": box_heads,
+            "box_predictors": box_predictors,
+            "proposal_matchers": proposal_matchers,
+        }
+
+    def forward(self, images, features, proposals, targets=None):
+        del images
+        if self.training:
+            proposals = self.label_and_sample_proposals(proposals, targets)
+
+        if self.training:
+            # Need targets to box head
+            losses = self._forward_box(features, proposals, targets)
+            losses.update(self._forward_mask(features, proposals))
+            losses.update(self._forward_keypoint(features, proposals))
+            return proposals, losses
+        else:
+            pred_instances = self._forward_box(features, proposals)
+            pred_instances = self.forward_with_given_boxes(features, pred_instances)
+            return pred_instances, {}
+
+    def _forward_box(self, features, proposals, targets=None):
+        """
+        Args:
+            features, targets: the same as in
+                Same as in :meth:`ROIHeads.forward`.
+            proposals (list[Instances]): the per-image object proposals with
+                their matching ground truth.
+                Each has fields "proposal_boxes", and "objectness_logits",
+                "gt_classes", "gt_boxes".
+        """
+        features = [features[f] for f in self.box_in_features]
+        head_outputs = []  # (predictor, predictions, proposals)
+        prev_pred_boxes = None
+        image_sizes = [x.image_size for x in proposals]
+        for k in range(self.num_cascade_stages):
+            if k > 0:
+                # The output boxes of the previous stage are used to create the input
+                # proposals of the next stage.
+                proposals = self._create_proposals_from_boxes(prev_pred_boxes, image_sizes)
+                if self.training:
+                    proposals = self._match_and_label_boxes(proposals, k, targets)
+            predictions = self._run_stage(features, proposals, k)
+            prev_pred_boxes = self.box_predictor[k].predict_boxes(predictions, proposals)
+            head_outputs.append((self.box_predictor[k], predictions, proposals))
+
+        if self.training:
+            losses = {}
+            storage = get_event_storage()
+            for stage, (predictor, predictions, proposals) in enumerate(head_outputs):
+                with storage.name_scope("stage{}".format(stage)):
+                    stage_losses = predictor.losses(predictions, proposals)
+                losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()})
+            return losses
+        else:
+            # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
+            scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs]
+
+            # Average the scores across heads
+            scores = [
+                sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
+                for scores_per_image in zip(*scores_per_stage)
+            ]
+            # Use the boxes of the last head
+            predictor, predictions, proposals = head_outputs[-1]
+            boxes = predictor.predict_boxes(predictions, proposals)
+            pred_instances, _ = fast_rcnn_inference(
+                boxes,
+                scores,
+                image_sizes,
+                predictor.test_score_thresh,
+                predictor.test_nms_thresh,
+                predictor.test_topk_per_image,
+            )
+            return pred_instances
+
+    @torch.no_grad()
+    def _match_and_label_boxes(self, proposals, stage, targets):
+        """
+        Match proposals with groundtruth using the matcher at the given stage.
+        Label the proposals as foreground or background based on the match.
+
+        Args:
+            proposals (list[Instances]): One Instances for each image, with
+                the field "proposal_boxes".
+            stage (int): the current stage
+            targets (list[Instances]): the ground truth instances
+
+        Returns:
+            list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes"
+        """
+        num_fg_samples, num_bg_samples = [], []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            match_quality_matrix = pairwise_iou(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            # proposal_labels are 0 or 1
+            matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix)
+            if len(targets_per_image) > 0:
+                gt_classes = targets_per_image.gt_classes[matched_idxs]
+                # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
+                gt_classes[proposal_labels == 0] = self.num_classes
+                gt_boxes = targets_per_image.gt_boxes[matched_idxs]
+            else:
+                gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
+                gt_boxes = Boxes(
+                    targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4))
+                )
+            proposals_per_image.gt_classes = gt_classes
+            proposals_per_image.gt_boxes = gt_boxes
+
+            num_fg_samples.append((proposal_labels == 1).sum().item())
+            num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])
+
+        # Log the number of fg/bg samples in each stage
+        storage = get_event_storage()
+        storage.put_scalar(
+            "stage{}/roi_head/num_fg_samples".format(stage),
+            sum(num_fg_samples) / len(num_fg_samples),
+        )
+        storage.put_scalar(
+            "stage{}/roi_head/num_bg_samples".format(stage),
+            sum(num_bg_samples) / len(num_bg_samples),
+        )
+        return proposals
+
+    def _run_stage(self, features, proposals, stage):
+        """
+        Args:
+            features (list[Tensor]): #lvl input features to ROIHeads
+            proposals (list[Instances]): #image Instances, with the field "proposal_boxes"
+            stage (int): the current stage
+
+        Returns:
+            Same output as `FastRCNNOutputLayers.forward()`.
+        """
+        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
+        # The original implementation averages the losses among heads,
+        # but scale up the parameter gradients of the heads.
+        # This is equivalent to adding the losses among heads,
+        # but scale down the gradients on features.
+        box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages)
+        box_features = self.box_head[stage](box_features)
+        return self.box_predictor[stage](box_features)
+
+    def _create_proposals_from_boxes(self, boxes, image_sizes):
+        """
+        Args:
+            boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4
+            image_sizes (list[tuple]): list of image shapes in (h, w)
+
+        Returns:
+            list[Instances]: per-image proposals with the given boxes.
+        """
+        # Just like RPN, the proposals should not have gradients
+        boxes = [Boxes(b.detach()) for b in boxes]
+        proposals = []
+        for boxes_per_image, image_size in zip(boxes, image_sizes):
+            boxes_per_image.clip(image_size)
+            if self.training:
+                # do not filter empty boxes at inference time,
+                # because the scores from each stage need to be aligned and added later
+                boxes_per_image = boxes_per_image[boxes_per_image.nonempty()]
+            prop = Instances(image_size)
+            prop.proposal_boxes = boxes_per_image
+            proposals.append(prop)
+        return proposals
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/fast_rcnn.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/fast_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca796ace55509efb8a898f580203076bada387f2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/fast_rcnn.py
@@ -0,0 +1,510 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import torch
+from fvcore.nn import smooth_l1_loss
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import configurable
+from detectron2.layers import Linear, ShapeSpec, batched_nms, cat
+from detectron2.modeling.box_regression import Box2BoxTransform, apply_deltas_broadcast
+from detectron2.structures import Boxes, Instances
+from detectron2.utils.events import get_event_storage
+
+__all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers"]
+
+
+logger = logging.getLogger(__name__)
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    R: number of ROIs, combined over all images, in the minibatch
+    Ri: number of ROIs in image i
+    K: number of foreground classes. E.g.,there are 80 foreground classes in COCO.
+
+Naming convention:
+
+    deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
+    transform (see :class:`box_regression.Box2BoxTransform`).
+
+    pred_class_logits: predicted class scores in [-inf, +inf]; use
+        softmax(pred_class_logits) to estimate P(class).
+
+    gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
+        foreground object classes and K represents the background class.
+
+    pred_proposal_deltas: predicted box2box transform deltas for transforming proposals
+        to detection box predictions.
+
+    gt_proposal_deltas: ground-truth box2box transform deltas
+"""
+
+
+def fast_rcnn_inference(boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image):
+    """
+    Call `fast_rcnn_inference_single_image` for all images.
+
+    Args:
+        boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
+            boxes for each image. Element i has shape (Ri, K * 4) if doing
+            class-specific regression, or (Ri, 4) if doing class-agnostic
+            regression, where Ri is the number of predicted objects for image i.
+            This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`.
+        scores (list[Tensor]): A list of Tensors of predicted class scores for each image.
+            Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
+            for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`.
+        image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch.
+        score_thresh (float): Only return detections with a confidence score exceeding this
+            threshold.
+        nms_thresh (float):  The threshold to use for box non-maximum suppression. Value in [0, 1].
+        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
+            all detections.
+
+    Returns:
+        instances: (list[Instances]): A list of N instances, one for each image in the batch,
+            that stores the topk most confidence detections.
+        kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates
+            the corresponding boxes/scores index in [0, Ri) from the input, for image i.
+    """
+    result_per_image = [
+        fast_rcnn_inference_single_image(
+            boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image
+        )
+        for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
+    ]
+    return [x[0] for x in result_per_image], [x[1] for x in result_per_image]
+
+
+def fast_rcnn_inference_single_image(
+    boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image
+):
+    """
+    Single-image inference. Return bounding-box detection results by thresholding
+    on scores and applying non-maximum suppression (NMS).
+
+    Args:
+        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
+        per image.
+
+    Returns:
+        Same as `fast_rcnn_inference`, but for only one image.
+    """
+    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
+    if not valid_mask.all():
+        boxes = boxes[valid_mask]
+        scores = scores[valid_mask]
+
+    scores = scores[:, :-1]
+    num_bbox_reg_classes = boxes.shape[1] // 4
+    # Convert to Boxes to use the `clip` function ...
+    boxes = Boxes(boxes.reshape(-1, 4))
+    boxes.clip(image_shape)
+    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4
+
+    # Filter results based on detection scores
+    filter_mask = scores > score_thresh  # R x K
+    # R' x 2. First column contains indices of the R predictions;
+    # Second column contains indices of classes.
+    filter_inds = filter_mask.nonzero()
+    if num_bbox_reg_classes == 1:
+        boxes = boxes[filter_inds[:, 0], 0]
+    else:
+        boxes = boxes[filter_mask]
+    scores = scores[filter_mask]
+
+    # Apply per-class NMS
+    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
+    if topk_per_image >= 0:
+        keep = keep[:topk_per_image]
+    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]
+
+    result = Instances(image_shape)
+    result.pred_boxes = Boxes(boxes)
+    result.scores = scores
+    result.pred_classes = filter_inds[:, 1]
+    return result, filter_inds[:, 0]
+
+
+class FastRCNNOutputs(object):
+    """
+    A class that stores information about outputs of a Fast R-CNN head.
+    It provides methods that are used to decode the outputs of a Fast R-CNN head.
+    """
+
+    def __init__(
+        self,
+        box2box_transform,
+        pred_class_logits,
+        pred_proposal_deltas,
+        proposals,
+        smooth_l1_beta=0,
+    ):
+        """
+        Args:
+            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
+                box2box transform instance for proposal-to-detection transformations.
+            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
+                logits for all R predicted object instances.
+                Each row corresponds to a predicted object instance.
+            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
+                class-specific or class-agnostic regression. It stores the predicted deltas that
+                transform proposals into final box detections.
+                B is the box dimension (4 or 5).
+                When B is 4, each row is [dx, dy, dw, dh (, ....)].
+                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
+            proposals (list[Instances]): A list of N Instances, where Instances i stores the
+                proposals for image i, in the field "proposal_boxes".
+                When training, each Instances must have ground-truth labels
+                stored in the field "gt_classes" and "gt_boxes".
+                The total number of all instances must be equal to R.
+            smooth_l1_beta (float): The transition point between L1 and L2 loss in
+                the smooth L1 loss function. When set to 0, the loss becomes L1. When
+                set to +inf, the loss becomes constant 0.
+        """
+        self.box2box_transform = box2box_transform
+        self.num_preds_per_image = [len(p) for p in proposals]
+        self.pred_class_logits = pred_class_logits
+        self.pred_proposal_deltas = pred_proposal_deltas
+        self.smooth_l1_beta = smooth_l1_beta
+        self.image_shapes = [x.image_size for x in proposals]
+
+        if len(proposals):
+            box_type = type(proposals[0].proposal_boxes)
+            # cat(..., dim=0) concatenates over all images in the batch
+            self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
+            assert (
+                not self.proposals.tensor.requires_grad
+            ), "Proposals should not require gradients!"
+
+            # The following fields should exist only when training.
+            if proposals[0].has("gt_boxes"):
+                self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
+                assert proposals[0].has("gt_classes")
+                self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)
+        else:
+            self.proposals = Boxes(torch.zeros(0, 4, device=self.pred_proposal_deltas.device))
+        self._no_instances = len(proposals) == 0  # no instances found
+
+    def _log_accuracy(self):
+        """
+        Log the accuracy metrics to EventStorage.
+        """
+        num_instances = self.gt_classes.numel()
+        pred_classes = self.pred_class_logits.argmax(dim=1)
+        bg_class_ind = self.pred_class_logits.shape[1] - 1
+
+        fg_inds = (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind)
+        num_fg = fg_inds.nonzero().numel()
+        fg_gt_classes = self.gt_classes[fg_inds]
+        fg_pred_classes = pred_classes[fg_inds]
+
+        num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel()
+        num_accurate = (pred_classes == self.gt_classes).nonzero().numel()
+        fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()
+
+        storage = get_event_storage()
+        if num_instances > 0:
+            storage.put_scalar("fast_rcnn/cls_accuracy", num_accurate / num_instances)
+            if num_fg > 0:
+                storage.put_scalar("fast_rcnn/fg_cls_accuracy", fg_num_accurate / num_fg)
+                storage.put_scalar("fast_rcnn/false_negative", num_false_negative / num_fg)
+
+    def softmax_cross_entropy_loss(self):
+        """
+        Compute the softmax cross entropy loss for box classification.
+
+        Returns:
+            scalar Tensor
+        """
+        if self._no_instances:
+            return 0.0 * self.pred_class_logits.sum()
+        else:
+            self._log_accuracy()
+            return F.cross_entropy(self.pred_class_logits, self.gt_classes, reduction="mean")
+
+    def smooth_l1_loss(self):
+        """
+        Compute the smooth L1 loss for box regression.
+
+        Returns:
+            scalar Tensor
+        """
+        if self._no_instances:
+            return 0.0 * self.pred_proposal_deltas.sum()
+        gt_proposal_deltas = self.box2box_transform.get_deltas(
+            self.proposals.tensor, self.gt_boxes.tensor
+        )
+        box_dim = gt_proposal_deltas.size(1)  # 4 or 5
+        cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
+        device = self.pred_proposal_deltas.device
+
+        bg_class_ind = self.pred_class_logits.shape[1] - 1
+
+        # Box delta loss is only computed between the prediction for the gt class k
+        # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
+        # for non-gt classes and background.
+        # Empty fg_inds produces a valid loss of zero as long as the size_average
+        # arg to smooth_l1_loss is False (otherwise it uses torch.mean internally
+        # and would produce a nan loss).
+        fg_inds = torch.nonzero(
+            (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind), as_tuple=True
+        )[0]
+        if cls_agnostic_bbox_reg:
+            # pred_proposal_deltas only corresponds to foreground class for agnostic
+            gt_class_cols = torch.arange(box_dim, device=device)
+        else:
+            fg_gt_classes = self.gt_classes[fg_inds]
+            # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
+            # where b is the dimension of box representation (4 or 5)
+            # Note that compared to Detectron1,
+            # we do not perform bounding box regression for background classes.
+            gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(box_dim, device=device)
+
+        loss_box_reg = smooth_l1_loss(
+            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
+            gt_proposal_deltas[fg_inds],
+            self.smooth_l1_beta,
+            reduction="sum",
+        )
+        # The loss is normalized using the total number of regions (R), not the number
+        # of foreground regions even though the box regression loss is only defined on
+        # foreground regions. Why? Because doing so gives equal training influence to
+        # each foreground example. To see how, consider two different minibatches:
+        #  (1) Contains a single foreground region
+        #  (2) Contains 100 foreground regions
+        # If we normalize by the number of foreground regions, the single example in
+        # minibatch (1) will be given 100 times as much influence as each foreground
+        # example in minibatch (2). Normalizing by the total number of regions, R,
+        # means that the single example in minibatch (1) and each of the 100 examples
+        # in minibatch (2) are given equal influence.
+        loss_box_reg = loss_box_reg / self.gt_classes.numel()
+        return loss_box_reg
+
+    def _predict_boxes(self):
+        """
+        Returns:
+            Tensor: A Tensors of predicted class-specific or class-agnostic boxes
+                for all images in a batch. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
+                the number of predicted objects for image i and B is the box dimension (4 or 5)
+        """
+        return apply_deltas_broadcast(
+            self.box2box_transform, self.pred_proposal_deltas, self.proposals.tensor
+        )
+
+    """
+    A subclass is expected to have the following methods because
+    they are used to query information about the head predictions.
+    """
+
+    def losses(self):
+        """
+        Compute the default losses for box head in Fast(er) R-CNN,
+        with softmax cross entropy loss and smooth L1 loss.
+
+        Returns:
+            A dict of losses (scalar tensors) containing keys "loss_cls" and "loss_box_reg".
+        """
+        return {
+            "loss_cls": self.softmax_cross_entropy_loss(),
+            "loss_box_reg": self.smooth_l1_loss(),
+        }
+
+    def predict_boxes(self):
+        """
+        Deprecated
+        """
+        return self._predict_boxes().split(self.num_preds_per_image, dim=0)
+
+    def predict_probs(self):
+        """
+        Deprecated
+        """
+        probs = F.softmax(self.pred_class_logits, dim=-1)
+        return probs.split(self.num_preds_per_image, dim=0)
+
+    def inference(self, score_thresh, nms_thresh, topk_per_image):
+        """
+        Deprecated
+        """
+        boxes = self.predict_boxes()
+        scores = self.predict_probs()
+        image_shapes = self.image_shapes
+        return fast_rcnn_inference(
+            boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image
+        )
+
+
+class FastRCNNOutputLayers(nn.Module):
+    """
+    Two linear layers for predicting Fast R-CNN outputs:
+      (1) proposal-to-detection box regression deltas
+      (2) classification scores
+    """
+
+    @configurable
+    def __init__(
+        self,
+        input_shape,
+        *,
+        box2box_transform,
+        num_classes,
+        cls_agnostic_bbox_reg=False,
+        smooth_l1_beta=0.0,
+        test_score_thresh=0.0,
+        test_nms_thresh=0.5,
+        test_topk_per_image=100,
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape (ShapeSpec): shape of the input feature to this module
+            box2box_transform (Box2BoxTransform or Box2BoxTransformRotated):
+            num_classes (int): number of foreground classes
+            cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression
+            smooth_l1_beta (float): transition point from L1 to L2 loss.
+            test_score_thresh (float): threshold to filter predictions results.
+            test_nms_thresh (float): NMS threshold for prediction results.
+            test_topk_per_image (int): number of top predictions to produce per image.
+        """
+        super().__init__()
+        if isinstance(input_shape, int):  # some backward compatibility
+            input_shape = ShapeSpec(channels=input_shape)
+        input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
+        # The prediction layer for num_classes foreground classes and one background class
+        # (hence + 1)
+        self.cls_score = Linear(input_size, num_classes + 1)
+        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
+        box_dim = len(box2box_transform.weights)
+        self.bbox_pred = Linear(input_size, num_bbox_reg_classes * box_dim)
+
+        nn.init.normal_(self.cls_score.weight, std=0.01)
+        nn.init.normal_(self.bbox_pred.weight, std=0.001)
+        for l in [self.cls_score, self.bbox_pred]:
+            nn.init.constant_(l.bias, 0)
+
+        self.box2box_transform = box2box_transform
+        self.smooth_l1_beta = smooth_l1_beta
+        self.test_score_thresh = test_score_thresh
+        self.test_nms_thresh = test_nms_thresh
+        self.test_topk_per_image = test_topk_per_image
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {
+            "input_shape": input_shape,
+            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS),
+            # fmt: off
+            "num_classes"           : cfg.MODEL.ROI_HEADS.NUM_CLASSES,
+            "cls_agnostic_bbox_reg" : cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,
+            "smooth_l1_beta"        : cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA,
+            "test_score_thresh"     : cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST,
+            "test_nms_thresh"       : cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
+            "test_topk_per_image"   : cfg.TEST.DETECTIONS_PER_IMAGE
+            # fmt: on
+        }
+
+    def forward(self, x):
+        """
+        Returns:
+            Tensor: Nx(K+1) scores for each box
+            Tensor: Nx4 or Nx(Kx4) bounding box regression deltas.
+        """
+        if x.dim() > 2:
+            x = torch.flatten(x, start_dim=1)
+        scores = self.cls_score(x)
+        proposal_deltas = self.bbox_pred(x)
+        return scores, proposal_deltas
+
+    # TODO: move the implementation to this class.
+    def losses(self, predictions, proposals):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features
+                that were used to compute predictions.
+        """
+        scores, proposal_deltas = predictions
+        return FastRCNNOutputs(
+            self.box2box_transform, scores, proposal_deltas, proposals, self.smooth_l1_beta
+        ).losses()
+
+    def inference(self, predictions, proposals):
+        """
+        Returns:
+            list[Instances]: same as `fast_rcnn_inference`.
+            list[Tensor]: same as `fast_rcnn_inference`.
+        """
+        boxes = self.predict_boxes(predictions, proposals)
+        scores = self.predict_probs(predictions, proposals)
+        image_shapes = [x.image_size for x in proposals]
+        return fast_rcnn_inference(
+            boxes,
+            scores,
+            image_shapes,
+            self.test_score_thresh,
+            self.test_nms_thresh,
+            self.test_topk_per_image,
+        )
+
+    def predict_boxes_for_gt_classes(self, predictions, proposals):
+        """
+        Returns:
+            list[Tensor]: A list of Tensors of predicted boxes for GT classes in case of
+                class-specific box head. Element i of the list has shape (Ri, B), where Ri is
+                the number of predicted objects for image i and B is the box dimension (4 or 5)
+        """
+        if not len(proposals):
+            return []
+        scores, proposal_deltas = predictions
+        proposal_boxes = [p.proposal_boxes for p in proposals]
+        proposal_boxes = proposal_boxes[0].cat(proposal_boxes).tensor
+        N, B = proposal_boxes.shape
+        predict_boxes = apply_deltas_broadcast(
+            self.box2box_transform, proposal_deltas, proposal_boxes
+        )  # Nx(KxB)
+
+        K = predict_boxes.shape[1] // B
+        if K > 1:
+            gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0)
+            # Some proposals are ignored or have a background class. Their gt_classes
+            # cannot be used as index.
+            gt_classes = gt_classes.clamp_(0, K - 1)
+
+            predict_boxes = predict_boxes.view(N, K, B)[
+                torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes
+            ]
+        num_prop_per_image = [len(p) for p in proposals]
+        return predict_boxes.split(num_prop_per_image)
+
+    def predict_boxes(self, predictions, proposals):
+        """
+        Returns:
+            list[Tensor]: A list of Tensors of predicted class-specific or class-agnostic boxes
+                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
+                the number of predicted objects for image i and B is the box dimension (4 or 5)
+        """
+        if not len(proposals):
+            return []
+        _, proposal_deltas = predictions
+        num_prop_per_image = [len(p) for p in proposals]
+        proposal_boxes = [p.proposal_boxes for p in proposals]
+        proposal_boxes = proposal_boxes[0].cat(proposal_boxes).tensor
+        predict_boxes = apply_deltas_broadcast(
+            self.box2box_transform, proposal_deltas, proposal_boxes
+        )  # Nx(KxB)
+        return predict_boxes.split(num_prop_per_image)
+
+    def predict_probs(self, predictions, proposals):
+        """
+        Returns:
+            list[Tensor]: A list of Tensors of predicted class probabilities for each image.
+                Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
+                for image i.
+        """
+        scores, _ = predictions
+        num_inst_per_image = [len(p) for p in proposals]
+        probs = F.softmax(scores, dim=-1)
+        return probs.split(num_inst_per_image, dim=0)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/keypoint_head.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/keypoint_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7990c8fd90c70c98d6b2e3f94935f571b957a79
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/keypoint_head.py
@@ -0,0 +1,253 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from typing import List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ConvTranspose2d, cat, interpolate
+from detectron2.structures import Instances, heatmaps_to_keypoints
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+_TOTAL_SKIPPED = 0
+
+ROI_KEYPOINT_HEAD_REGISTRY = Registry("ROI_KEYPOINT_HEAD")
+ROI_KEYPOINT_HEAD_REGISTRY.__doc__ = """
+Registry for keypoint heads, which make keypoint predictions from per-region features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+def build_keypoint_head(cfg, input_shape):
+    """
+    Build a keypoint head from `cfg.MODEL.ROI_KEYPOINT_HEAD.NAME`.
+    """
+    name = cfg.MODEL.ROI_KEYPOINT_HEAD.NAME
+    return ROI_KEYPOINT_HEAD_REGISTRY.get(name)(cfg, input_shape)
+
+
+def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer):
+    """
+    Arguments:
+        pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number
+            of instances in the batch, K is the number of keypoints, and S is the side length
+            of the keypoint heatmap. The values are spatial logits.
+        instances (list[Instances]): A list of M Instances, where M is the batch size.
+            These instances are predictions from the model
+            that are in 1:1 correspondence with pred_keypoint_logits.
+            Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoint`
+            instance.
+        normalizer (float): Normalize the loss by this amount.
+            If not specified, we normalize by the number of visible keypoints in the minibatch.
+
+    Returns a scalar tensor containing the loss.
+    """
+    heatmaps = []
+    valid = []
+
+    keypoint_side_len = pred_keypoint_logits.shape[2]
+    for instances_per_image in instances:
+        if len(instances_per_image) == 0:
+            continue
+        keypoints = instances_per_image.gt_keypoints
+        heatmaps_per_image, valid_per_image = keypoints.to_heatmap(
+            instances_per_image.proposal_boxes.tensor, keypoint_side_len
+        )
+        heatmaps.append(heatmaps_per_image.view(-1))
+        valid.append(valid_per_image.view(-1))
+
+    if len(heatmaps):
+        keypoint_targets = cat(heatmaps, dim=0)
+        valid = cat(valid, dim=0).to(dtype=torch.uint8)
+        valid = torch.nonzero(valid).squeeze(1)
+
+    # torch.mean (in binary_cross_entropy_with_logits) doesn't
+    # accept empty tensors, so handle it separately
+    if len(heatmaps) == 0 or valid.numel() == 0:
+        global _TOTAL_SKIPPED
+        _TOTAL_SKIPPED += 1
+        storage = get_event_storage()
+        storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False)
+        return pred_keypoint_logits.sum() * 0
+
+    N, K, H, W = pred_keypoint_logits.shape
+    pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W)
+
+    keypoint_loss = F.cross_entropy(
+        pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum"
+    )
+
+    # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch
+    if normalizer is None:
+        normalizer = valid.numel()
+    keypoint_loss /= normalizer
+
+    return keypoint_loss
+
+
+def keypoint_rcnn_inference(pred_keypoint_logits, pred_instances):
+    """
+    Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score)
+        and add it to the `pred_instances` as a `pred_keypoints` field.
+
+    Args:
+        pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number
+           of instances in the batch, K is the number of keypoints, and S is the side length of
+           the keypoint heatmap. The values are spatial logits.
+        pred_instances (list[Instances]): A list of N Instances, where N is the number of images.
+
+    Returns:
+        None. Each element in pred_instances will contain an extra "pred_keypoints" field.
+            The field is a tensor of shape (#instance, K, 3) where the last
+            dimension corresponds to (x, y, score).
+            The scores are larger than 0.
+    """
+    # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor)
+    bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0)
+
+    keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits.detach(), bboxes_flat.detach())
+    num_instances_per_image = [len(i) for i in pred_instances]
+    keypoint_results = keypoint_results[:, :, [0, 1, 3]].split(num_instances_per_image, dim=0)
+
+    for keypoint_results_per_image, instances_per_image in zip(keypoint_results, pred_instances):
+        # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score)
+        instances_per_image.pred_keypoints = keypoint_results_per_image
+
+
+class BaseKeypointRCNNHead(nn.Module):
+    """
+    Implement the basic Keypoint R-CNN losses and inference logic described in :paper:`Mask R-CNN`.
+    """
+
+    @configurable
+    def __init__(self, *, num_keypoints, loss_weight=1.0, loss_normalizer=1.0):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            num_keypoints (int): number of keypoints to predict
+            loss_weight (float): weight to multiple on the keypoint loss
+            loss_normalizer (float or str):
+                If float, divide the loss by `loss_normalizer * #images`.
+                If 'visible', the loss is normalized by the total number of
+                    visible keypoints across images.
+        """
+        super().__init__()
+        self.num_keypoints = num_keypoints
+        self.loss_weight = loss_weight
+        assert loss_normalizer == "visible" or isinstance(loss_normalizer, float), loss_normalizer
+        self.loss_normalizer = loss_normalizer
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = {
+            "loss_weight": cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT,
+            "num_keypoints": cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS,
+        }
+        normalize_by_visible = (
+            cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS
+        )  # noqa
+        if not normalize_by_visible:
+            batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE
+            positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
+            ret["loss_normalizer"] = (
+                ret["num_keypoints"] * batch_size_per_image * positive_sample_fraction
+            )
+        else:
+            ret["loss_normalizer"] = "visible"
+        return ret
+
+    def forward(self, x, instances: List[Instances]):
+        """
+        Args:
+            x: input region feature(s) provided by :class:`ROIHeads`.
+            instances (list[Instances]): contains the boxes & labels corresponding
+                to the input features.
+                Exact format is up to its caller to decide.
+                Typically, this is the foreground instances in training, with
+                "proposal_boxes" field and other gt annotations.
+                In inference, it contains boxes that are already predicted.
+
+        Returns:
+            A dict of losses if in training. The predicted "instances" if in inference.
+        """
+        x = self.layers(x)
+        if self.training:
+            num_images = len(instances)
+            normalizer = (
+                None if self.loss_normalizer == "visible" else num_images * self.loss_normalizer
+            )
+            return {
+                "loss_keypoint": keypoint_rcnn_loss(x, instances, normalizer=normalizer)
+                * self.loss_weight
+            }
+        else:
+            keypoint_rcnn_inference(x, instances)
+            return instances
+
+    def layers(self, x):
+        """
+        Neural network layers that makes predictions from regional input features.
+        """
+        raise NotImplementedError
+
+
+@ROI_KEYPOINT_HEAD_REGISTRY.register()
+class KRCNNConvDeconvUpsampleHead(BaseKeypointRCNNHead):
+    """
+    A standard keypoint head containing a series of 3x3 convs, followed by
+    a transpose convolution and bilinear interpolation for upsampling.
+    """
+
+    @configurable
+    def __init__(self, input_shape, *, num_keypoints, conv_dims, **kwargs):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape (ShapeSpec): shape of the input feature
+            conv_dims: an iterable of output channel counts for each conv in the head
+                         e.g. (512, 512, 512) for three convs outputting 512 channels.
+        """
+        super().__init__(num_keypoints=num_keypoints, **kwargs)
+
+        # default up_scale to 2 (this can be made an option)
+        up_scale = 2
+        in_channels = input_shape.channels
+
+        self.blocks = []
+        for idx, layer_channels in enumerate(conv_dims, 1):
+            module = Conv2d(in_channels, layer_channels, 3, stride=1, padding=1)
+            self.add_module("conv_fcn{}".format(idx), module)
+            self.blocks.append(module)
+            in_channels = layer_channels
+
+        deconv_kernel = 4
+        self.score_lowres = ConvTranspose2d(
+            in_channels, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1
+        )
+        self.up_scale = up_scale
+
+        for name, param in self.named_parameters():
+            if "bias" in name:
+                nn.init.constant_(param, 0)
+            elif "weight" in name:
+                # Caffe2 implementation uses MSRAFill, which in fact
+                # corresponds to kaiming_normal_ in PyTorch
+                nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg, input_shape)
+        ret["input_shape"] = input_shape
+        ret["conv_dims"] = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS
+        return ret
+
+    def layers(self, x):
+        for layer in self.blocks:
+            x = F.relu(layer(x))
+        x = self.score_lowres(x)
+        x = interpolate(x, scale_factor=self.up_scale, mode="bilinear", align_corners=False)
+        return x
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/mask_head.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..5209722fb96b5e430bb5f30b3fce2b94b91f2b2e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/mask_head.py
@@ -0,0 +1,277 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from typing import List
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ConvTranspose2d, ShapeSpec, cat, get_norm
+from detectron2.structures import Instances
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+ROI_MASK_HEAD_REGISTRY = Registry("ROI_MASK_HEAD")
+ROI_MASK_HEAD_REGISTRY.__doc__ = """
+Registry for mask heads, which predicts instance masks given
+per-region features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+def mask_rcnn_loss(pred_mask_logits, instances, vis_period=0):
+    """
+    Compute the mask prediction loss defined in the Mask R-CNN paper.
+
+    Args:
+        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
+            for class-specific or class-agnostic, where B is the total number of predicted masks
+            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
+            and width of the mask predictions. The values are logits.
+        instances (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. These instances are in 1:1
+            correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask,
+            ...) associated with each instance are stored in fields.
+        vis_period (int): the period (in steps) to dump visualization.
+
+    Returns:
+        mask_loss (Tensor): A scalar tensor containing the loss.
+    """
+    cls_agnostic_mask = pred_mask_logits.size(1) == 1
+    total_num_masks = pred_mask_logits.size(0)
+    mask_side_len = pred_mask_logits.size(2)
+    assert pred_mask_logits.size(2) == pred_mask_logits.size(3), "Mask prediction must be square!"
+
+    gt_classes = []
+    gt_masks = []
+    for instances_per_image in instances:
+        if len(instances_per_image) == 0:
+            continue
+        if not cls_agnostic_mask:
+            gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
+            gt_classes.append(gt_classes_per_image)
+
+        gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
+            instances_per_image.proposal_boxes.tensor, mask_side_len
+        ).to(device=pred_mask_logits.device)
+        # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
+        gt_masks.append(gt_masks_per_image)
+
+    if len(gt_masks) == 0:
+        return pred_mask_logits.sum() * 0
+
+    gt_masks = cat(gt_masks, dim=0)
+
+    if cls_agnostic_mask:
+        pred_mask_logits = pred_mask_logits[:, 0]
+    else:
+        indices = torch.arange(total_num_masks)
+        gt_classes = cat(gt_classes, dim=0)
+        pred_mask_logits = pred_mask_logits[indices, gt_classes]
+
+    if gt_masks.dtype == torch.bool:
+        gt_masks_bool = gt_masks
+    else:
+        # Here we allow gt_masks to be float as well (depend on the implementation of rasterize())
+        gt_masks_bool = gt_masks > 0.5
+    gt_masks = gt_masks.to(dtype=torch.float32)
+
+    # Log the training accuracy (using gt classes and 0.5 threshold)
+    mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
+    mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0))
+    num_positive = gt_masks_bool.sum().item()
+    false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
+        gt_masks_bool.numel() - num_positive, 1.0
+    )
+    false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(num_positive, 1.0)
+
+    storage = get_event_storage()
+    storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
+    storage.put_scalar("mask_rcnn/false_positive", false_positive)
+    storage.put_scalar("mask_rcnn/false_negative", false_negative)
+    if vis_period > 0 and storage.iter % vis_period == 0:
+        pred_masks = pred_mask_logits.sigmoid()
+        vis_masks = torch.cat([pred_masks, gt_masks], axis=2)
+        name = "Left: mask prediction;   Right: mask GT"
+        for idx, vis_mask in enumerate(vis_masks):
+            vis_mask = torch.stack([vis_mask] * 3, axis=0)
+            storage.put_image(name + f" ({idx})", vis_mask)
+
+    mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits, gt_masks, reduction="mean")
+    return mask_loss
+
+
+def mask_rcnn_inference(pred_mask_logits, pred_instances):
+    """
+    Convert pred_mask_logits to estimated foreground probability masks while also
+    extracting only the masks for the predicted classes in pred_instances. For each
+    predicted box, the mask of the same class is attached to the instance by adding a
+    new "pred_masks" field to pred_instances.
+
+    Args:
+        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
+            for class-specific or class-agnostic, where B is the total number of predicted masks
+            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
+            and width of the mask predictions. The values are logits.
+        pred_instances (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. Each Instances must have field "pred_classes".
+
+    Returns:
+        None. pred_instances will contain an extra "pred_masks" field storing a mask of size (Hmask,
+            Wmask) for predicted class. Note that the masks are returned as a soft (non-quantized)
+            masks the resolution predicted by the network; post-processing steps, such as resizing
+            the predicted masks to the original image resolution and/or binarizing them, is left
+            to the caller.
+    """
+    cls_agnostic_mask = pred_mask_logits.size(1) == 1
+
+    if cls_agnostic_mask:
+        mask_probs_pred = pred_mask_logits.sigmoid()
+    else:
+        # Select masks corresponding to the predicted classes
+        num_masks = pred_mask_logits.shape[0]
+        class_pred = cat([i.pred_classes for i in pred_instances])
+        indices = torch.arange(num_masks, device=class_pred.device)
+        mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid()
+    # mask_probs_pred.shape: (B, 1, Hmask, Wmask)
+
+    num_boxes_per_image = [len(i) for i in pred_instances]
+    mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0)
+
+    for prob, instances in zip(mask_probs_pred, pred_instances):
+        instances.pred_masks = prob  # (1, Hmask, Wmask)
+
+
+class BaseMaskRCNNHead(nn.Module):
+    """
+    Implement the basic Mask R-CNN losses and inference logic described in :paper:`Mask R-CNN`
+    """
+
+    @configurable
+    def __init__(self, *, vis_period=0):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            vis_period (int): visualization period
+        """
+        super().__init__()
+        self.vis_period = vis_period
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {"vis_period": cfg.VIS_PERIOD}
+
+    def forward(self, x, instances: List[Instances]):
+        """
+        Args:
+            x: input region feature(s) provided by :class:`ROIHeads`.
+            instances (list[Instances]): contains the boxes & labels corresponding
+                to the input features.
+                Exact format is up to its caller to decide.
+                Typically, this is the foreground instances in training, with
+                "proposal_boxes" field and other gt annotations.
+                In inference, it contains boxes that are already predicted.
+
+        Returns:
+            A dict of losses in training. The predicted "instances" in inference.
+        """
+        x = self.layers(x)
+        if self.training:
+            return {"loss_mask": mask_rcnn_loss(x, instances, self.vis_period)}
+        else:
+            mask_rcnn_inference(x, instances)
+            return instances
+
+    def layers(self, x):
+        """
+        Neural network layers that makes predictions from input features.
+        """
+        raise NotImplementedError
+
+
+@ROI_MASK_HEAD_REGISTRY.register()
+class MaskRCNNConvUpsampleHead(BaseMaskRCNNHead):
+    """
+    A mask head with several conv layers, plus an upsample layer (with `ConvTranspose2d`).
+    Predictions are made with a final 1x1 conv layer.
+    """
+
+    @configurable
+    def __init__(self, input_shape: ShapeSpec, *, num_classes, conv_dims, conv_norm="", **kwargs):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            input_shape (ShapeSpec): shape of the input feature
+            num_classes (int): the number of classes. 1 if using class agnostic prediction.
+            conv_dims (list[int]): a list of N>0 integers representing the output dimensions
+                of N-1 conv layers and the last upsample layer.
+            conv_norm (str or callable): normalization for the conv layers.
+                See :func:`detectron2.layers.get_norm` for supported types.
+        """
+        super().__init__(**kwargs)
+        assert len(conv_dims) >= 1, "conv_dims have to be non-empty!"
+
+        self.conv_norm_relus = []
+
+        cur_channels = input_shape.channels
+        for k, conv_dim in enumerate(conv_dims[:-1]):
+            conv = Conv2d(
+                cur_channels,
+                conv_dim,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=not conv_norm,
+                norm=get_norm(conv_norm, conv_dim),
+                activation=F.relu,
+            )
+            self.add_module("mask_fcn{}".format(k + 1), conv)
+            self.conv_norm_relus.append(conv)
+            cur_channels = conv_dim
+
+        self.deconv = ConvTranspose2d(
+            cur_channels, conv_dims[-1], kernel_size=2, stride=2, padding=0
+        )
+        cur_channels = conv_dims[-1]
+
+        self.predictor = Conv2d(cur_channels, num_classes, kernel_size=1, stride=1, padding=0)
+
+        for layer in self.conv_norm_relus + [self.deconv]:
+            weight_init.c2_msra_fill(layer)
+        # use normal distribution initialization for mask prediction layer
+        nn.init.normal_(self.predictor.weight, std=0.001)
+        if self.predictor.bias is not None:
+            nn.init.constant_(self.predictor.bias, 0)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg, input_shape)
+        conv_dim = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
+        num_conv = cfg.MODEL.ROI_MASK_HEAD.NUM_CONV
+        ret.update(
+            conv_dims=[conv_dim] * (num_conv + 1),  # +1 for ConvTranspose
+            conv_norm=cfg.MODEL.ROI_MASK_HEAD.NORM,
+            input_shape=input_shape,
+        )
+        if cfg.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK:
+            ret["num_classes"] = 1
+        else:
+            ret["num_classes"] = cfg.MODEL.ROI_HEADS.NUM_CLASSES
+        return ret
+
+    def layers(self, x):
+        for layer in self.conv_norm_relus:
+            x = layer(x)
+        x = F.relu(self.deconv(x))
+        return self.predictor(x)
+
+
+def build_mask_head(cfg, input_shape):
+    """
+    Build a mask head defined by `cfg.MODEL.ROI_MASK_HEAD.NAME`.
+    """
+    name = cfg.MODEL.ROI_MASK_HEAD.NAME
+    return ROI_MASK_HEAD_REGISTRY.get(name)(cfg, input_shape)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/roi_heads.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/roi_heads.py
new file mode 100644
index 0000000000000000000000000000000000000000..f35588e474a1c3d938e5a3b2b8a8ae5e88006215
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/roi_heads.py
@@ -0,0 +1,812 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import inspect
+import logging
+import numpy as np
+from typing import Dict, List, Optional, Tuple, Union
+import torch
+from torch import nn
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec
+from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+from ..backbone.resnet import BottleneckBlock, make_stage
+from ..matcher import Matcher
+from ..poolers import ROIPooler
+from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals
+from ..sampling import subsample_labels
+from .box_head import build_box_head
+from .fast_rcnn import FastRCNNOutputLayers
+from .keypoint_head import build_keypoint_head
+from .mask_head import build_mask_head
+
+ROI_HEADS_REGISTRY = Registry("ROI_HEADS")
+ROI_HEADS_REGISTRY.__doc__ = """
+Registry for ROI heads in a generalized R-CNN model.
+ROIHeads take feature maps and region proposals, and
+perform per-region computation.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+The call is expected to return an :class:`ROIHeads`.
+"""
+
+logger = logging.getLogger(__name__)
+
+
+def build_roi_heads(cfg, input_shape):
+    """
+    Build ROIHeads defined by `cfg.MODEL.ROI_HEADS.NAME`.
+    """
+    name = cfg.MODEL.ROI_HEADS.NAME
+    return ROI_HEADS_REGISTRY.get(name)(cfg, input_shape)
+
+
+def select_foreground_proposals(
+    proposals: List[Instances], bg_label: int
+) -> Tuple[List[Instances], List[torch.Tensor]]:
+    """
+    Given a list of N Instances (for N images), each containing a `gt_classes` field,
+    return a list of Instances that contain only instances with `gt_classes != -1 &&
+    gt_classes != bg_label`.
+
+    Args:
+        proposals (list[Instances]): A list of N Instances, where N is the number of
+            images in the batch.
+        bg_label: label index of background class.
+
+    Returns:
+        list[Instances]: N Instances, each contains only the selected foreground instances.
+        list[Tensor]: N boolean vector, correspond to the selection mask of
+            each Instances object. True for selected instances.
+    """
+    assert isinstance(proposals, (list, tuple))
+    assert isinstance(proposals[0], Instances)
+    assert proposals[0].has("gt_classes")
+    fg_proposals = []
+    fg_selection_masks = []
+    for proposals_per_image in proposals:
+        gt_classes = proposals_per_image.gt_classes
+        fg_selection_mask = (gt_classes != -1) & (gt_classes != bg_label)
+        fg_idxs = fg_selection_mask.nonzero().squeeze(1)
+        fg_proposals.append(proposals_per_image[fg_idxs])
+        fg_selection_masks.append(fg_selection_mask)
+    return fg_proposals, fg_selection_masks
+
+
+def select_proposals_with_visible_keypoints(proposals: List[Instances]) -> List[Instances]:
+    """
+    Args:
+        proposals (list[Instances]): a list of N Instances, where N is the
+            number of images.
+
+    Returns:
+        proposals: only contains proposals with at least one visible keypoint.
+
+    Note that this is still slightly different from Detectron.
+    In Detectron, proposals for training keypoint head are re-sampled from
+    all the proposals with IOU>threshold & >=1 visible keypoint.
+
+    Here, the proposals are first sampled from all proposals with
+    IOU>threshold, then proposals with no visible keypoint are filtered out.
+    This strategy seems to make no difference on Detectron and is easier to implement.
+    """
+    ret = []
+    all_num_fg = []
+    for proposals_per_image in proposals:
+        # If empty/unannotated image (hard negatives), skip filtering for train
+        if len(proposals_per_image) == 0:
+            ret.append(proposals_per_image)
+            continue
+        gt_keypoints = proposals_per_image.gt_keypoints.tensor
+        # #fg x K x 3
+        vis_mask = gt_keypoints[:, :, 2] >= 1
+        xs, ys = gt_keypoints[:, :, 0], gt_keypoints[:, :, 1]
+        proposal_boxes = proposals_per_image.proposal_boxes.tensor.unsqueeze(dim=1)  # #fg x 1 x 4
+        kp_in_box = (
+            (xs >= proposal_boxes[:, :, 0])
+            & (xs <= proposal_boxes[:, :, 2])
+            & (ys >= proposal_boxes[:, :, 1])
+            & (ys <= proposal_boxes[:, :, 3])
+        )
+        selection = (kp_in_box & vis_mask).any(dim=1)
+        selection_idxs = torch.nonzero(selection, as_tuple=True)[0]
+        all_num_fg.append(selection_idxs.numel())
+        ret.append(proposals_per_image[selection_idxs])
+
+    storage = get_event_storage()
+    storage.put_scalar("keypoint_head/num_fg_samples", np.mean(all_num_fg))
+    return ret
+
+
+class ROIHeads(torch.nn.Module):
+    """
+    ROIHeads perform all per-region computation in an R-CNN.
+
+    It typically contains logic to
+    1. (in training only) match proposals with ground truth and sample them
+    2. crop the regions and extract per-region features using proposals
+    3. make per-region predictions with different heads
+
+    It can have many variants, implemented as subclasses of this class.
+    This base class contains the logic to match/sample proposals.
+    But it is not necessary to inherit this class if the sampling logic is not needed.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        num_classes,
+        batch_size_per_image,
+        positive_sample_fraction,
+        proposal_matcher,
+        proposal_append_gt=True
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            num_classes (int): number of classes. Used to label background proposals.
+            batch_size_per_image (int): number of proposals to use for training
+            positive_sample_fraction (float): fraction of positive (foreground) proposals
+                to use for training.
+            proposal_matcher (Matcher): matcher that matches proposals and ground truth
+            proposal_append_gt (bool): whether to include ground truth as proposals as well
+        """
+        super().__init__()
+        self.batch_size_per_image = batch_size_per_image
+        self.positive_sample_fraction = positive_sample_fraction
+        self.num_classes = num_classes
+        self.proposal_matcher = proposal_matcher
+        self.proposal_append_gt = proposal_append_gt
+
+    @classmethod
+    def from_config(cls, cfg):
+        return {
+            "batch_size_per_image": cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE,
+            "positive_sample_fraction": cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION,
+            "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES,
+            "proposal_append_gt": cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT,
+            # Matcher to assign box proposals to gt boxes
+            "proposal_matcher": Matcher(
+                cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS,
+                cfg.MODEL.ROI_HEADS.IOU_LABELS,
+                allow_low_quality_matches=False,
+            ),
+        }
+
+    def _sample_proposals(
+        self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Based on the matching between N proposals and M groundtruth,
+        sample the proposals and set their classification labels.
+
+        Args:
+            matched_idxs (Tensor): a vector of length N, each is the best-matched
+                gt index in [0, M) for each proposal.
+            matched_labels (Tensor): a vector of length N, the matcher's label
+                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
+            gt_classes (Tensor): a vector of length M.
+
+        Returns:
+            Tensor: a vector of indices of sampled proposals. Each is in [0, N).
+            Tensor: a vector of the same length, the classification label for
+                each sampled proposal. Each sample is labeled as either a category in
+                [0, num_classes) or the background (num_classes).
+        """
+        has_gt = gt_classes.numel() > 0
+        # Get the corresponding GT for each proposal
+        if has_gt:
+            gt_classes = gt_classes[matched_idxs]
+            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
+            gt_classes[matched_labels == 0] = self.num_classes
+            # Label ignore proposals (-1 label)
+            gt_classes[matched_labels == -1] = -1
+        else:
+            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
+
+        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
+            gt_classes, self.batch_size_per_image, self.positive_sample_fraction, self.num_classes
+        )
+
+        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
+        return sampled_idxs, gt_classes[sampled_idxs]
+
+    @torch.no_grad()
+    def label_and_sample_proposals(
+        self, proposals: List[Instances], targets: List[Instances]
+    ) -> List[Instances]:
+        """
+        Prepare some proposals to be used to train the ROI heads.
+        It performs box matching between `proposals` and `targets`, and assigns
+        training labels to the proposals.
+        It returns ``self.batch_size_per_image`` random samples from proposals and groundtruth
+        boxes, with a fraction of positives that is no larger than
+        ``self.positive_sample_fraction``.
+
+        Args:
+            See :meth:`ROIHeads.forward`
+
+        Returns:
+            list[Instances]:
+                length `N` list of `Instances`s containing the proposals
+                sampled for training. Each `Instances` has the following fields:
+
+                - proposal_boxes: the proposal boxes
+                - gt_boxes: the ground-truth box that the proposal is assigned to
+                  (this is only meaningful if the proposal has a label > 0; if label = 0
+                  then the ground-truth box is random)
+
+                Other fields such as "gt_classes", "gt_masks", that's included in `targets`.
+        """
+        gt_boxes = [x.gt_boxes for x in targets]
+        # Augment proposals with ground-truth boxes.
+        # In the case of learned proposals (e.g., RPN), when training starts
+        # the proposals will be low quality due to random initialization.
+        # It's possible that none of these initial
+        # proposals have high enough overlap with the gt objects to be used
+        # as positive examples for the second stage components (box head,
+        # cls head, mask head). Adding the gt boxes to the set of proposals
+        # ensures that the second stage components will have some positive
+        # examples from the start of training. For RPN, this augmentation improves
+        # convergence and empirically improves box AP on COCO by about 0.5
+        # points (under one tested configuration).
+        if self.proposal_append_gt:
+            proposals = add_ground_truth_to_proposals(gt_boxes, proposals)
+
+        proposals_with_gt = []
+
+        num_fg_samples = []
+        num_bg_samples = []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            has_gt = len(targets_per_image) > 0
+            match_quality_matrix = pairwise_iou(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
+            sampled_idxs, gt_classes = self._sample_proposals(
+                matched_idxs, matched_labels, targets_per_image.gt_classes
+            )
+
+            # Set target attributes of the sampled proposals:
+            proposals_per_image = proposals_per_image[sampled_idxs]
+            proposals_per_image.gt_classes = gt_classes
+
+            # We index all the attributes of targets that start with "gt_"
+            # and have not been added to proposals yet (="gt_classes").
+            if has_gt:
+                sampled_targets = matched_idxs[sampled_idxs]
+                # NOTE: here the indexing waste some compute, because heads
+                # like masks, keypoints, etc, will filter the proposals again,
+                # (by foreground/background, or number of keypoints in the image, etc)
+                # so we essentially index the data twice.
+                for (trg_name, trg_value) in targets_per_image.get_fields().items():
+                    if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
+                        proposals_per_image.set(trg_name, trg_value[sampled_targets])
+            else:
+                gt_boxes = Boxes(
+                    targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 4))
+                )
+                proposals_per_image.gt_boxes = gt_boxes
+
+            num_bg_samples.append((gt_classes == self.num_classes).sum().item())
+            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
+            proposals_with_gt.append(proposals_per_image)
+
+        # Log the number of fg/bg samples that are selected for training ROI heads
+        storage = get_event_storage()
+        storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
+        storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
+
+        return proposals_with_gt
+
+    def forward(
+        self,
+        images: ImageList,
+        features: Dict[str, torch.Tensor],
+        proposals: List[Instances],
+        targets: Optional[List[Instances]] = None,
+    ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]:
+        """
+        Args:
+            images (ImageList):
+            features (dict[str,Tensor]): input data as a mapping from feature
+                map name to tensor. Axis 0 represents the number of images `N` in
+                the input data; axes 1-3 are channels, height, and width, which may
+                vary between feature maps (e.g., if a feature pyramid is used).
+            proposals (list[Instances]): length `N` list of `Instances`. The i-th
+                `Instances` contains object proposals for the i-th input image,
+                with fields "proposal_boxes" and "objectness_logits".
+            targets (list[Instances], optional): length `N` list of `Instances`. The i-th
+                `Instances` contains the ground-truth per-instance annotations
+                for the i-th input image.  Specify `targets` during training only.
+                It may have the following fields:
+
+                - gt_boxes: the bounding box of each instance.
+                - gt_classes: the label for each instance with a category ranging in [0, #class].
+                - gt_masks: PolygonMasks or BitMasks, the ground-truth masks of each instance.
+                - gt_keypoints: NxKx3, the groud-truth keypoints for each instance.
+
+        Returns:
+            list[Instances]: length `N` list of `Instances` containing the
+            detected instances. Returned during inference only; may be [] during training.
+
+            dict[str->Tensor]:
+            mapping from a named loss to a tensor storing the loss. Used during training only.
+        """
+        raise NotImplementedError()
+
+
+@ROI_HEADS_REGISTRY.register()
+class Res5ROIHeads(ROIHeads):
+    """
+    The ROIHeads in a typical "C4" R-CNN model, where
+    the box and mask head share the cropping and
+    the per-region feature computation by a Res5 block.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super().__init__(cfg)
+
+        # fmt: off
+        self.in_features  = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        pooler_scales     = (1.0 / input_shape[self.in_features[0]].stride, )
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        self.mask_on      = cfg.MODEL.MASK_ON
+        # fmt: on
+        assert not cfg.MODEL.KEYPOINT_ON
+        assert len(self.in_features) == 1
+
+        self.pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+
+        self.res5, out_channels = self._build_res5_block(cfg)
+        self.box_predictor = FastRCNNOutputLayers(
+            cfg, ShapeSpec(channels=out_channels, height=1, width=1)
+        )
+
+        if self.mask_on:
+            self.mask_head = build_mask_head(
+                cfg,
+                ShapeSpec(channels=out_channels, width=pooler_resolution, height=pooler_resolution),
+            )
+
+    def _build_res5_block(self, cfg):
+        # fmt: off
+        stage_channel_factor = 2 ** 3  # res5 is 8x res2
+        num_groups           = cfg.MODEL.RESNETS.NUM_GROUPS
+        width_per_group      = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
+        bottleneck_channels  = num_groups * width_per_group * stage_channel_factor
+        out_channels         = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * stage_channel_factor
+        stride_in_1x1        = cfg.MODEL.RESNETS.STRIDE_IN_1X1
+        norm                 = cfg.MODEL.RESNETS.NORM
+        assert not cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE[-1], \
+            "Deformable conv is not yet supported in res5 head."
+        # fmt: on
+
+        blocks = make_stage(
+            BottleneckBlock,
+            3,
+            first_stride=2,
+            in_channels=out_channels // 2,
+            bottleneck_channels=bottleneck_channels,
+            out_channels=out_channels,
+            num_groups=num_groups,
+            norm=norm,
+            stride_in_1x1=stride_in_1x1,
+        )
+        return nn.Sequential(*blocks), out_channels
+
+    def _shared_roi_transform(self, features, boxes):
+        x = self.pooler(features, boxes)
+        return self.res5(x)
+
+    def forward(self, images, features, proposals, targets=None):
+        """
+        See :meth:`ROIHeads.forward`.
+        """
+        del images
+
+        if self.training:
+            assert targets
+            proposals = self.label_and_sample_proposals(proposals, targets)
+        del targets
+
+        proposal_boxes = [x.proposal_boxes for x in proposals]
+        box_features = self._shared_roi_transform(
+            [features[f] for f in self.in_features], proposal_boxes
+        )
+        predictions = self.box_predictor(box_features.mean(dim=[2, 3]))
+
+        if self.training:
+            del features
+            losses = self.box_predictor.losses(predictions, proposals)
+            if self.mask_on:
+                proposals, fg_selection_masks = select_foreground_proposals(
+                    proposals, self.num_classes
+                )
+                # Since the ROI feature transform is shared between boxes and masks,
+                # we don't need to recompute features. The mask loss is only defined
+                # on foreground proposals, so we need to select out the foreground
+                # features.
+                mask_features = box_features[torch.cat(fg_selection_masks, dim=0)]
+                del box_features
+                losses.update(self.mask_head(mask_features, proposals))
+            return [], losses
+        else:
+            pred_instances, _ = self.box_predictor.inference(predictions, proposals)
+            pred_instances = self.forward_with_given_boxes(features, pred_instances)
+            return pred_instances, {}
+
+    def forward_with_given_boxes(self, features, instances):
+        """
+        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
+
+        Args:
+            features: same as in `forward()`
+            instances (list[Instances]): instances to predict other outputs. Expect the keys
+                "pred_boxes" and "pred_classes" to exist.
+
+        Returns:
+            instances (Instances):
+                the same `Instances` object, with extra
+                fields such as `pred_masks` or `pred_keypoints`.
+        """
+        assert not self.training
+        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
+
+        if self.mask_on:
+            features = [features[f] for f in self.in_features]
+            x = self._shared_roi_transform(features, [x.pred_boxes for x in instances])
+            return self.mask_head(x, instances)
+        else:
+            return instances
+
+
+@ROI_HEADS_REGISTRY.register()
+class StandardROIHeads(ROIHeads):
+    """
+    It's "standard" in a sense that there is no ROI transform sharing
+    or feature sharing between tasks.
+    Each head independently processes the input features by each head's
+    own pooler and head.
+
+    This class is used by most models, such as FPN and C5.
+    To implement more models, you can subclass it and implement a different
+    :meth:`forward()` or a head.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        box_in_features: List[str],
+        box_pooler: ROIPooler,
+        box_head: nn.Module,
+        box_predictor: nn.Module,
+        mask_in_features: Optional[List[str]] = None,
+        mask_pooler: Optional[ROIPooler] = None,
+        mask_head: Optional[nn.Module] = None,
+        keypoint_in_features: Optional[List[str]] = None,
+        keypoint_pooler: Optional[ROIPooler] = None,
+        keypoint_head: Optional[nn.Module] = None,
+        train_on_pred_boxes: bool = False,
+        **kwargs
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            box_in_features (list[str]): list of feature names to use for the box head.
+            box_pooler (ROIPooler): pooler to extra region features for box head
+            box_head (nn.Module): transform features to make box predictions
+            box_predictor (nn.Module): make box predictions from the feature.
+                Should have the same interface as :class:`FastRCNNOutputLayers`.
+            mask_in_features (list[str]): list of feature names to use for the mask head.
+                None if not using mask head.
+            mask_pooler (ROIPooler): pooler to extra region features for mask head
+            mask_head (nn.Module): transform features to make mask predictions
+            keypoint_in_features, keypoint_pooler, keypoint_head: similar to ``mask*``.
+            train_on_pred_boxes (bool): whether to use proposal boxes or
+                predicted boxes from the box head to train other heads.
+        """
+        super().__init__(**kwargs)
+        # keep self.in_features for backward compatibility
+        self.in_features = self.box_in_features = box_in_features
+        self.box_pooler = box_pooler
+        self.box_head = box_head
+        self.box_predictor = box_predictor
+
+        self.mask_on = mask_in_features is not None
+        if self.mask_on:
+            self.mask_in_features = mask_in_features
+            self.mask_pooler = mask_pooler
+            self.mask_head = mask_head
+        self.keypoint_on = keypoint_in_features is not None
+        if self.keypoint_on:
+            self.keypoint_in_features = keypoint_in_features
+            self.keypoint_pooler = keypoint_pooler
+            self.keypoint_head = keypoint_head
+
+        self.train_on_pred_boxes = train_on_pred_boxes
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg)
+        ret["train_on_pred_boxes"] = cfg.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES
+        # Subclasses that have not been updated to use from_config style construction
+        # may have overridden _init_*_head methods. In this case, those overridden methods
+        # will not be classmethods and we need to avoid trying to call them here.
+        # We test for this with ismethod which only returns True for bound methods of cls.
+        # Such subclasses will need to handle calling their overridden _init_*_head methods.
+        if inspect.ismethod(cls._init_box_head):
+            ret.update(cls._init_box_head(cfg, input_shape))
+        if inspect.ismethod(cls._init_mask_head):
+            ret.update(cls._init_mask_head(cfg, input_shape))
+        if inspect.ismethod(cls._init_keypoint_head):
+            ret.update(cls._init_keypoint_head(cfg, input_shape))
+        return ret
+
+    @classmethod
+    def _init_box_head(cls, cfg, input_shape):
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        # fmt: on
+
+        # If StandardROIHeads is applied on multiple feature maps (as in FPN),
+        # then we share the same predictors and therefore the channel counts must be the same
+        in_channels = [input_shape[f].channels for f in in_features]
+        # Check all channel counts are equal
+        assert len(set(in_channels)) == 1, in_channels
+        in_channels = in_channels[0]
+
+        box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        # Here we split "box head" and "box predictor", which is mainly due to historical reasons.
+        # They are used together so the "box predictor" layers should be part of the "box head".
+        # New subclasses of ROIHeads do not need "box predictor"s.
+        box_head = build_box_head(
+            cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution)
+        )
+        box_predictor = FastRCNNOutputLayers(cfg, box_head.output_shape)
+        return {
+            "box_in_features": in_features,
+            "box_pooler": box_pooler,
+            "box_head": box_head,
+            "box_predictor": box_predictor,
+        }
+
+    @classmethod
+    def _init_mask_head(cls, cfg, input_shape):
+        if not cfg.MODEL.MASK_ON:
+            return {}
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
+        sampling_ratio    = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE
+        # fmt: on
+
+        in_channels = [input_shape[f].channels for f in in_features][0]
+
+        ret = {"mask_in_features": in_features}
+        ret["mask_pooler"] = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        ret["mask_head"] = build_mask_head(
+            cfg, ShapeSpec(channels=in_channels, width=pooler_resolution, height=pooler_resolution)
+        )
+        return ret
+
+    @classmethod
+    def _init_keypoint_head(cls, cfg, input_shape):
+        if not cfg.MODEL.KEYPOINT_ON:
+            return {}
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)  # noqa
+        sampling_ratio    = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE
+        # fmt: on
+
+        in_channels = [input_shape[f].channels for f in in_features][0]
+
+        ret = {"keypoint_in_features": in_features}
+        ret["keypoint_pooler"] = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        ret["keypoint_head"] = build_keypoint_head(
+            cfg, ShapeSpec(channels=in_channels, width=pooler_resolution, height=pooler_resolution)
+        )
+        return ret
+
+    def forward(
+        self,
+        images: ImageList,
+        features: Dict[str, torch.Tensor],
+        proposals: List[Instances],
+        targets: Optional[List[Instances]] = None,
+    ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]:
+        """
+        See :class:`ROIHeads.forward`.
+        """
+        del images
+        if self.training:
+            assert targets
+            proposals = self.label_and_sample_proposals(proposals, targets)
+        del targets
+
+        if self.training:
+            losses = self._forward_box(features, proposals)
+            # Usually the original proposals used by the box head are used by the mask, keypoint
+            # heads. But when `self.train_on_pred_boxes is True`, proposals will contain boxes
+            # predicted by the box head.
+            losses.update(self._forward_mask(features, proposals))
+            losses.update(self._forward_keypoint(features, proposals))
+            return proposals, losses
+        else:
+            pred_instances = self._forward_box(features, proposals)
+            # During inference cascaded prediction is used: the mask and keypoints heads are only
+            # applied to the top scoring box detections.
+            pred_instances = self.forward_with_given_boxes(features, pred_instances)
+            return pred_instances, {}
+
+    def forward_with_given_boxes(
+        self, features: Dict[str, torch.Tensor], instances: List[Instances]
+    ) -> List[Instances]:
+        """
+        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
+
+        This is useful for downstream tasks where a box is known, but need to obtain
+        other attributes (outputs of other heads).
+        Test-time augmentation also uses this.
+
+        Args:
+            features: same as in `forward()`
+            instances (list[Instances]): instances to predict other outputs. Expect the keys
+                "pred_boxes" and "pred_classes" to exist.
+
+        Returns:
+            instances (list[Instances]):
+                the same `Instances` objects, with extra
+                fields such as `pred_masks` or `pred_keypoints`.
+        """
+        assert not self.training
+        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
+
+        instances = self._forward_mask(features, instances)
+        instances = self._forward_keypoint(features, instances)
+        return instances
+
+    def _forward_box(
+        self, features: Dict[str, torch.Tensor], proposals: List[Instances]
+    ) -> Union[Dict[str, torch.Tensor], List[Instances]]:
+        """
+        Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`,
+            the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument.
+
+        Args:
+            features (dict[str, Tensor]): mapping from feature map names to tensor.
+                Same as in :meth:`ROIHeads.forward`.
+            proposals (list[Instances]): the per-image object proposals with
+                their matching ground truth.
+                Each has fields "proposal_boxes", and "objectness_logits",
+                "gt_classes", "gt_boxes".
+
+        Returns:
+            In training, a dict of losses.
+            In inference, a list of `Instances`, the predicted instances.
+        """
+        features = [features[f] for f in self.box_in_features]
+        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
+        box_features = self.box_head(box_features)
+        predictions = self.box_predictor(box_features)
+        del box_features
+
+        if self.training:
+            losses = self.box_predictor.losses(predictions, proposals)
+            # proposals is modified in-place below, so losses must be computed first.
+            if self.train_on_pred_boxes:
+                with torch.no_grad():
+                    pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
+                        predictions, proposals
+                    )
+                    for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
+                        proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image)
+            return losses
+        else:
+            pred_instances, _ = self.box_predictor.inference(predictions, proposals)
+            return pred_instances
+
+    def _forward_mask(
+        self, features: Dict[str, torch.Tensor], instances: List[Instances]
+    ) -> Union[Dict[str, torch.Tensor], List[Instances]]:
+        """
+        Forward logic of the mask prediction branch.
+
+        Args:
+            features (dict[str, Tensor]): mapping from feature map names to tensor.
+                Same as in :meth:`ROIHeads.forward`.
+            instances (list[Instances]): the per-image instances to train/predict masks.
+                In training, they can be the proposals.
+                In inference, they can be the predicted boxes.
+
+        Returns:
+            In training, a dict of losses.
+            In inference, update `instances` with new fields "pred_masks" and return it.
+        """
+        if not self.mask_on:
+            return {} if self.training else instances
+
+        features = [features[f] for f in self.mask_in_features]
+
+        if self.training:
+            # The loss is only defined on positive proposals.
+            proposals, _ = select_foreground_proposals(instances, self.num_classes)
+            proposal_boxes = [x.proposal_boxes for x in proposals]
+            mask_features = self.mask_pooler(features, proposal_boxes)
+            return self.mask_head(mask_features, proposals)
+        else:
+            pred_boxes = [x.pred_boxes for x in instances]
+            mask_features = self.mask_pooler(features, pred_boxes)
+            return self.mask_head(mask_features, instances)
+
+    def _forward_keypoint(
+        self, features: Dict[str, torch.Tensor], instances: List[Instances]
+    ) -> Union[Dict[str, torch.Tensor], List[Instances]]:
+        """
+        Forward logic of the keypoint prediction branch.
+
+        Args:
+            features (dict[str, Tensor]): mapping from feature map names to tensor.
+                Same as in :meth:`ROIHeads.forward`.
+            instances (list[Instances]): the per-image instances to train/predict keypoints.
+                In training, they can be the proposals.
+                In inference, they can be the predicted boxes.
+
+        Returns:
+            In training, a dict of losses.
+            In inference, update `instances` with new fields "pred_keypoints" and return it.
+        """
+        if not self.keypoint_on:
+            return {} if self.training else instances
+
+        features = [features[f] for f in self.keypoint_in_features]
+
+        if self.training:
+            # The loss is defined on positive proposals with >=1 visible keypoints.
+            proposals, _ = select_foreground_proposals(instances, self.num_classes)
+            proposals = select_proposals_with_visible_keypoints(proposals)
+            proposal_boxes = [x.proposal_boxes for x in proposals]
+
+            keypoint_features = self.keypoint_pooler(features, proposal_boxes)
+            return self.keypoint_head(keypoint_features, proposals)
+        else:
+            pred_boxes = [x.pred_boxes for x in instances]
+            keypoint_features = self.keypoint_pooler(features, pred_boxes)
+            return self.keypoint_head(keypoint_features, instances)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/rotated_fast_rcnn.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/rotated_fast_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d7362d93f9be8d3838c477406540603e81ee0be
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/roi_heads/rotated_fast_rcnn.py
@@ -0,0 +1,276 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+import torch
+
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec, batched_nms_rotated
+from detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated
+from detectron2.utils.events import get_event_storage
+
+from ..box_regression import Box2BoxTransformRotated
+from ..poolers import ROIPooler
+from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals
+from .box_head import build_box_head
+from .fast_rcnn import FastRCNNOutputLayers
+from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
+
+logger = logging.getLogger(__name__)
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    R: number of ROIs, combined over all images, in the minibatch
+    Ri: number of ROIs in image i
+    K: number of foreground classes. E.g.,there are 80 foreground classes in COCO.
+
+Naming convention:
+
+    deltas: refers to the 5-d (dx, dy, dw, dh, da) deltas that parameterize the box2box
+    transform (see :class:`box_regression.Box2BoxTransformRotated`).
+
+    pred_class_logits: predicted class scores in [-inf, +inf]; use
+        softmax(pred_class_logits) to estimate P(class).
+
+    gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
+        foreground object classes and K represents the background class.
+
+    pred_proposal_deltas: predicted rotated box2box transform deltas for transforming proposals
+        to detection box predictions.
+
+    gt_proposal_deltas: ground-truth rotated box2box transform deltas
+"""
+
+
+def fast_rcnn_inference_rotated(
+    boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image
+):
+    """
+    Call `fast_rcnn_inference_single_image_rotated` for all images.
+
+    Args:
+        boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
+            boxes for each image. Element i has shape (Ri, K * 5) if doing
+            class-specific regression, or (Ri, 5) if doing class-agnostic
+            regression, where Ri is the number of predicted objects for image i.
+            This is compatible with the output of :meth:`FastRCNNOutputs.predict_boxes`.
+        scores (list[Tensor]): A list of Tensors of predicted class scores for each image.
+            Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
+            for image i. Compatible with the output of :meth:`FastRCNNOutputs.predict_probs`.
+        image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch.
+        score_thresh (float): Only return detections with a confidence score exceeding this
+            threshold.
+        nms_thresh (float):  The threshold to use for box non-maximum suppression. Value in [0, 1].
+        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
+            all detections.
+
+    Returns:
+        instances: (list[Instances]): A list of N instances, one for each image in the batch,
+            that stores the topk most confidence detections.
+        kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates
+            the corresponding boxes/scores index in [0, Ri) from the input, for image i.
+    """
+    result_per_image = [
+        fast_rcnn_inference_single_image_rotated(
+            boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image
+        )
+        for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
+    ]
+    return [x[0] for x in result_per_image], [x[1] for x in result_per_image]
+
+
+def fast_rcnn_inference_single_image_rotated(
+    boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image
+):
+    """
+    Single-image inference. Return rotated bounding-box detection results by thresholding
+    on scores and applying rotated non-maximum suppression (Rotated NMS).
+
+    Args:
+        Same as `fast_rcnn_inference_rotated`, but with rotated boxes, scores, and image shapes
+        per image.
+
+    Returns:
+        Same as `fast_rcnn_inference_rotated`, but for only one image.
+    """
+    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
+    if not valid_mask.all():
+        boxes = boxes[valid_mask]
+        scores = scores[valid_mask]
+
+    B = 5  # box dimension
+    scores = scores[:, :-1]
+    num_bbox_reg_classes = boxes.shape[1] // B
+    # Convert to Boxes to use the `clip` function ...
+    boxes = RotatedBoxes(boxes.reshape(-1, B))
+    boxes.clip(image_shape)
+    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, B)  # R x C x B
+    # Filter results based on detection scores
+    filter_mask = scores > score_thresh  # R x K
+    # R' x 2. First column contains indices of the R predictions;
+    # Second column contains indices of classes.
+    filter_inds = filter_mask.nonzero()
+    if num_bbox_reg_classes == 1:
+        boxes = boxes[filter_inds[:, 0], 0]
+    else:
+        boxes = boxes[filter_mask]
+    scores = scores[filter_mask]
+
+    # Apply per-class Rotated NMS
+    keep = batched_nms_rotated(boxes, scores, filter_inds[:, 1], nms_thresh)
+    if topk_per_image >= 0:
+        keep = keep[:topk_per_image]
+    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]
+
+    result = Instances(image_shape)
+    result.pred_boxes = RotatedBoxes(boxes)
+    result.scores = scores
+    result.pred_classes = filter_inds[:, 1]
+
+    return result, filter_inds[:, 0]
+
+
+class RotatedFastRCNNOutputLayers(FastRCNNOutputLayers):
+    """
+    Two linear layers for predicting Rotated Fast R-CNN outputs.
+    """
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        args = super().from_config(cfg, input_shape)
+        args["box2box_transform"] = Box2BoxTransformRotated(
+            weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS
+        )
+        return args
+
+    def inference(self, predictions, proposals):
+        """
+        Returns:
+            list[Instances]: same as `fast_rcnn_inference_rotated`.
+            list[Tensor]: same as `fast_rcnn_inference_rotated`.
+        """
+        boxes = self.predict_boxes(predictions, proposals)
+        scores = self.predict_probs(predictions, proposals)
+        image_shapes = [x.image_size for x in proposals]
+
+        return fast_rcnn_inference_rotated(
+            boxes,
+            scores,
+            image_shapes,
+            self.test_score_thresh,
+            self.test_nms_thresh,
+            self.test_topk_per_image,
+        )
+
+
+@ROI_HEADS_REGISTRY.register()
+class RROIHeads(StandardROIHeads):
+    """
+    This class is used by Rotated Fast R-CNN to detect rotated boxes.
+    For now, it only supports box predictions but not mask or keypoints.
+    """
+
+    @configurable
+    def __init__(self, **kwargs):
+        """
+        NOTE: this interface is experimental.
+        """
+        super().__init__(**kwargs)
+        assert (
+            not self.mask_on and not self.keypoint_on
+        ), "Mask/Keypoints not supported in Rotated ROIHeads."
+        assert not self.train_on_pred_boxes, "train_on_pred_boxes not implemented for RROIHeads!"
+
+    @classmethod
+    def _init_box_head(cls, cfg, input_shape):
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        # fmt: on
+        assert pooler_type in ["ROIAlignRotated"], pooler_type
+        # assume all channel counts are equal
+        in_channels = [input_shape[f].channels for f in in_features][0]
+
+        box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        box_head = build_box_head(
+            cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution)
+        )
+        # This line is the only difference v.s. StandardROIHeads
+        box_predictor = RotatedFastRCNNOutputLayers(cfg, box_head.output_shape)
+        return {
+            "box_in_features": in_features,
+            "box_pooler": box_pooler,
+            "box_head": box_head,
+            "box_predictor": box_predictor,
+        }
+
+    @torch.no_grad()
+    def label_and_sample_proposals(self, proposals, targets):
+        """
+        Prepare some proposals to be used to train the RROI heads.
+        It performs box matching between `proposals` and `targets`, and assigns
+        training labels to the proposals.
+        It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes,
+        with a fraction of positives that is no larger than `self.positive_sample_fraction.
+
+        Args:
+            See :meth:`StandardROIHeads.forward`
+
+        Returns:
+            list[Instances]: length `N` list of `Instances`s containing the proposals
+                sampled for training. Each `Instances` has the following fields:
+                - proposal_boxes: the rotated proposal boxes
+                - gt_boxes: the ground-truth rotated boxes that the proposal is assigned to
+                  (this is only meaningful if the proposal has a label > 0; if label = 0
+                   then the ground-truth box is random)
+                - gt_classes: the ground-truth classification lable for each proposal
+        """
+        gt_boxes = [x.gt_boxes for x in targets]
+        if self.proposal_append_gt:
+            proposals = add_ground_truth_to_proposals(gt_boxes, proposals)
+
+        proposals_with_gt = []
+
+        num_fg_samples = []
+        num_bg_samples = []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            has_gt = len(targets_per_image) > 0
+            match_quality_matrix = pairwise_iou_rotated(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
+            sampled_idxs, gt_classes = self._sample_proposals(
+                matched_idxs, matched_labels, targets_per_image.gt_classes
+            )
+
+            proposals_per_image = proposals_per_image[sampled_idxs]
+            proposals_per_image.gt_classes = gt_classes
+
+            if has_gt:
+                sampled_targets = matched_idxs[sampled_idxs]
+                proposals_per_image.gt_boxes = targets_per_image.gt_boxes[sampled_targets]
+            else:
+                gt_boxes = RotatedBoxes(
+                    targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 5))
+                )
+                proposals_per_image.gt_boxes = gt_boxes
+
+            num_bg_samples.append((gt_classes == self.num_classes).sum().item())
+            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
+            proposals_with_gt.append(proposals_per_image)
+
+        # Log the number of fg/bg samples that are selected for training ROI heads
+        storage = get_event_storage()
+        storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
+        storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
+
+        return proposals_with_gt
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/sampling.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecf251a2fa301d9e31eee7d3ba5dc6eaab1732f8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/sampling.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+
+__all__ = ["subsample_labels"]
+
+
+def subsample_labels(labels, num_samples, positive_fraction, bg_label):
+    """
+    Return `num_samples` (or fewer, if not enough found)
+    random samples from `labels` which is a mixture of positives & negatives.
+    It will try to return as many positives as possible without
+    exceeding `positive_fraction * num_samples`, and then try to
+    fill the remaining slots with negatives.
+
+    Args:
+        labels (Tensor): (N, ) label vector with values:
+            * -1: ignore
+            * bg_label: background ("negative") class
+            * otherwise: one or more foreground ("positive") classes
+        num_samples (int): The total number of labels with value >= 0 to return.
+            Values that are not sampled will be filled with -1 (ignore).
+        positive_fraction (float): The number of subsampled labels with values > 0
+            is `min(num_positives, int(positive_fraction * num_samples))`. The number
+            of negatives sampled is `min(num_negatives, num_samples - num_positives_sampled)`.
+            In order words, if there are not enough positives, the sample is filled with
+            negatives. If there are also not enough negatives, then as many elements are
+            sampled as is possible.
+        bg_label (int): label index of background ("negative") class.
+
+    Returns:
+        pos_idx, neg_idx (Tensor):
+            1D vector of indices. The total length of both is `num_samples` or fewer.
+    """
+    positive = torch.nonzero((labels != -1) & (labels != bg_label), as_tuple=True)[0]
+    negative = torch.nonzero(labels == bg_label, as_tuple=True)[0]
+
+    num_pos = int(num_samples * positive_fraction)
+    # protect against not enough positive examples
+    num_pos = min(positive.numel(), num_pos)
+    num_neg = num_samples - num_pos
+    # protect against not enough negative examples
+    num_neg = min(negative.numel(), num_neg)
+
+    # randomly select positive and negative examples
+    perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
+    perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
+
+    pos_idx = positive[perm1]
+    neg_idx = negative[perm2]
+    return pos_idx, neg_idx
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/test_time_augmentation.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/test_time_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e5bcf02f655956f76eb78fb7de36d691de6a53c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/modeling/test_time_augmentation.py
@@ -0,0 +1,285 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import numpy as np
+from contextlib import contextmanager
+from itertools import count
+import torch
+from torch import nn
+from torch.nn.parallel import DistributedDataParallel
+
+from detectron2.data.detection_utils import read_image
+from detectron2.data.transforms import ResizeShortestEdge
+from detectron2.structures import Instances
+
+from .meta_arch import GeneralizedRCNN
+from .postprocessing import detector_postprocess
+from .roi_heads.fast_rcnn import fast_rcnn_inference_single_image
+
+__all__ = ["DatasetMapperTTA", "GeneralizedRCNNWithTTA"]
+
+
+class DatasetMapperTTA:
+    """
+    Implement test-time augmentation for detection data.
+    It is a callable which takes a dataset dict from a detection dataset,
+    and returns a list of dataset dicts where the images
+    are augmented from the input image by the transformations defined in the config.
+    This is used for test-time augmentation.
+    """
+
+    def __init__(self, cfg):
+        self.min_sizes = cfg.TEST.AUG.MIN_SIZES
+        self.max_size = cfg.TEST.AUG.MAX_SIZE
+        self.flip = cfg.TEST.AUG.FLIP
+        self.image_format = cfg.INPUT.FORMAT
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dict: a detection dataset dict
+
+        Returns:
+            list[dict]:
+                a list of dataset dicts, which contain augmented version of the input image.
+                The total number of dicts is ``len(min_sizes) * (2 if flip else 1)``.
+        """
+        ret = []
+        if "image" not in dataset_dict:
+            numpy_image = read_image(dataset_dict["file_name"], self.image_format)
+        else:
+            numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy().astype("uint8")
+        for min_size in self.min_sizes:
+            image = np.copy(numpy_image)
+            tfm = ResizeShortestEdge(min_size, self.max_size).get_transform(image)
+            resized = tfm.apply_image(image)
+            resized = torch.as_tensor(resized.transpose(2, 0, 1).astype("float32"))
+
+            dic = copy.deepcopy(dataset_dict)
+            dic["horiz_flip"] = False
+            dic["image"] = resized
+            ret.append(dic)
+
+            if self.flip:
+                dic = copy.deepcopy(dataset_dict)
+                dic["horiz_flip"] = True
+                dic["image"] = torch.flip(resized, dims=[2])
+                ret.append(dic)
+        return ret
+
+
+class GeneralizedRCNNWithTTA(nn.Module):
+    """
+    A GeneralizedRCNN with test-time augmentation enabled.
+    Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`.
+    """
+
+    def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
+        """
+        Args:
+            cfg (CfgNode):
+            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
+            tta_mapper (callable): takes a dataset dict and returns a list of
+                augmented versions of the dataset dict. Defaults to
+                `DatasetMapperTTA(cfg)`.
+            batch_size (int): batch the augmented images into this batch size for inference.
+        """
+        super().__init__()
+        if isinstance(model, DistributedDataParallel):
+            model = model.module
+        assert isinstance(
+            model, GeneralizedRCNN
+        ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
+        self.cfg = cfg.clone()
+        assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
+        assert (
+            not self.cfg.MODEL.LOAD_PROPOSALS
+        ), "TTA for pre-computed proposals is not supported yet"
+
+        self.model = model
+
+        if tta_mapper is None:
+            tta_mapper = DatasetMapperTTA(cfg)
+        self.tta_mapper = tta_mapper
+        self.batch_size = batch_size
+
+    @contextmanager
+    def _turn_off_roi_heads(self, attrs):
+        """
+        Open a context where some heads in `model.roi_heads` are temporarily turned off.
+        Args:
+            attr (list[str]): the attribute in `model.roi_heads` which can be used
+                to turn off a specific head, e.g., "mask_on", "keypoint_on".
+        """
+        roi_heads = self.model.roi_heads
+        old = {}
+        for attr in attrs:
+            try:
+                old[attr] = getattr(roi_heads, attr)
+            except AttributeError:
+                # The head may not be implemented in certain ROIHeads
+                pass
+
+        if len(old.keys()) == 0:
+            yield
+        else:
+            for attr in old.keys():
+                setattr(roi_heads, attr, False)
+            yield
+            for attr in old.keys():
+                setattr(roi_heads, attr, old[attr])
+
+    def _batch_inference(self, batched_inputs, detected_instances=None, do_postprocess=True):
+        """
+        Execute inference on a list of inputs,
+        using batch size = self.batch_size, instead of the length of the list.
+
+        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
+        """
+        if detected_instances is None:
+            detected_instances = [None] * len(batched_inputs)
+
+        outputs = []
+        inputs, instances = [], []
+        for idx, input, instance in zip(count(), batched_inputs, detected_instances):
+            inputs.append(input)
+            instances.append(instance)
+            if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
+                outputs.extend(
+                    self.model.inference(
+                        inputs,
+                        instances if instances[0] is not None else None,
+                        do_postprocess=do_postprocess,
+                    )
+                )
+                inputs, instances = [], []
+        return outputs
+
+    def __call__(self, batched_inputs):
+        """
+        Same input/output format as :meth:`GeneralizedRCNN.forward`
+        """
+        return [self._inference_one_image(x) for x in batched_inputs]
+
+    def _detector_postprocess(self, outputs, aug_vars):
+        return detector_postprocess(outputs, aug_vars["height"], aug_vars["width"])
+
+    def _inference_one_image(self, input):
+        """
+        Args:
+            input (dict): one dataset dict
+
+        Returns:
+            dict: one output dict
+        """
+
+        augmented_inputs, aug_vars = self._get_augmented_inputs(input)
+        # Detect boxes from all augmented versions
+        with self._turn_off_roi_heads(["mask_on", "keypoint_on"]):
+            # temporarily disable roi heads
+            all_boxes, all_scores, all_classes = self._get_augmented_boxes(
+                augmented_inputs, aug_vars
+            )
+        merged_instances = self._merge_detections(
+            all_boxes, all_scores, all_classes, (aug_vars["height"], aug_vars["width"])
+        )
+
+        if self.cfg.MODEL.MASK_ON:
+            # Use the detected boxes to obtain new fields
+            augmented_instances = self._rescale_detected_boxes(
+                augmented_inputs, merged_instances, aug_vars
+            )
+            # run forward on the detected boxes
+            outputs = self._batch_inference(
+                augmented_inputs, augmented_instances, do_postprocess=False
+            )
+            # Delete now useless variables to avoid being out of memory
+            del augmented_inputs, augmented_instances, merged_instances
+            # average the predictions
+            outputs[0].pred_masks = self._reduce_pred_masks(outputs, aug_vars)
+            # postprocess
+            output = self._detector_postprocess(outputs[0], aug_vars)
+            return {"instances": output}
+        else:
+            return {"instances": merged_instances}
+
+    def _get_augmented_inputs(self, input):
+        augmented_inputs = self.tta_mapper(input)
+
+        do_hflip = [k.pop("horiz_flip", False) for k in augmented_inputs]
+        heights = [k["height"] for k in augmented_inputs]
+        widths = [k["width"] for k in augmented_inputs]
+        assert (
+            len(set(heights)) == 1 and len(set(widths)) == 1
+        ), "Augmented version of the inputs should have the same original resolution!"
+        height = heights[0]
+        width = widths[0]
+        aug_vars = {"height": height, "width": width, "do_hflip": do_hflip}
+
+        return augmented_inputs, aug_vars
+
+    def _get_augmented_boxes(self, augmented_inputs, aug_vars):
+        # 1: forward with all augmented images
+        outputs = self._batch_inference(augmented_inputs, do_postprocess=False)
+        # 2: union the results
+        all_boxes = []
+        all_scores = []
+        all_classes = []
+        for idx, output in enumerate(outputs):
+            rescaled_output = self._detector_postprocess(output, aug_vars)
+            pred_boxes = rescaled_output.pred_boxes.tensor
+            if aug_vars["do_hflip"][idx]:
+                pred_boxes[:, [0, 2]] = aug_vars["width"] - pred_boxes[:, [2, 0]]
+            all_boxes.append(pred_boxes)
+            all_scores.extend(rescaled_output.scores)
+            all_classes.extend(rescaled_output.pred_classes)
+        all_boxes = torch.cat(all_boxes, dim=0).cpu()
+        return all_boxes, all_scores, all_classes
+
+    def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
+        # select from the union of all results
+        num_boxes = len(all_boxes)
+        num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
+        # +1 because fast_rcnn_inference expects background scores as well
+        all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
+        for idx, cls, score in zip(count(), all_classes, all_scores):
+            all_scores_2d[idx, cls] = score
+
+        merged_instances, _ = fast_rcnn_inference_single_image(
+            all_boxes,
+            all_scores_2d,
+            shape_hw,
+            1e-8,
+            self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
+            self.cfg.TEST.DETECTIONS_PER_IMAGE,
+        )
+
+        return merged_instances
+
+    def _rescale_detected_boxes(self, augmented_inputs, merged_instances, aug_vars):
+        augmented_instances = []
+        for idx, input in enumerate(augmented_inputs):
+            actual_height, actual_width = input["image"].shape[1:3]
+            scale_x = actual_width * 1.0 / aug_vars["width"]
+            scale_y = actual_height * 1.0 / aug_vars["height"]
+            pred_boxes = merged_instances.pred_boxes.clone()
+            pred_boxes.tensor[:, 0::2] *= scale_x
+            pred_boxes.tensor[:, 1::2] *= scale_y
+            if aug_vars["do_hflip"][idx]:
+                pred_boxes.tensor[:, [0, 2]] = actual_width - pred_boxes.tensor[:, [2, 0]]
+
+            aug_instances = Instances(
+                image_size=(actual_height, actual_width),
+                pred_boxes=pred_boxes,
+                pred_classes=merged_instances.pred_classes,
+                scores=merged_instances.scores,
+            )
+            augmented_instances.append(aug_instances)
+        return augmented_instances
+
+    def _reduce_pred_masks(self, outputs, aug_vars):
+        for idx, output in enumerate(outputs):
+            if aug_vars["do_hflip"][idx]:
+                output.pred_masks = output.pred_masks.flip(dims=[3])
+        all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
+        avg_pred_masks = torch.mean(all_pred_masks, dim=0)
+        return avg_pred_masks
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/solver/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/solver/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..10f84e12d029a07d5c7d3ac29e18b572a92ef03c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/solver/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .build import build_lr_scheduler, build_optimizer
+from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/solver/build.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/solver/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d9d0ee5df1a6135c1a3df0151dfe0e36aa9971a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/solver/build.py
@@ -0,0 +1,165 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from enum import Enum
+from typing import Any, Callable, Dict, Iterable, List, Set, Type, Union
+import torch
+
+from detectron2.config import CfgNode
+
+from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR
+
+_GradientClipperInput = Union[torch.Tensor, Iterable[torch.Tensor]]
+_GradientClipper = Callable[[_GradientClipperInput], None]
+
+
+class GradientClipType(Enum):
+    VALUE = "value"
+    NORM = "norm"
+
+
+def _create_gradient_clipper(cfg: CfgNode) -> _GradientClipper:
+    """
+    Creates gradient clipping closure to clip by value or by norm,
+    according to the provided config.
+    """
+    cfg = cfg.clone()
+
+    def clip_grad_norm(p: _GradientClipperInput):
+        torch.nn.utils.clip_grad_norm_(p, cfg.CLIP_VALUE, cfg.NORM_TYPE)
+
+    def clip_grad_value(p: _GradientClipperInput):
+        torch.nn.utils.clip_grad_value_(p, cfg.CLIP_VALUE)
+
+    _GRADIENT_CLIP_TYPE_TO_CLIPPER = {
+        GradientClipType.VALUE: clip_grad_value,
+        GradientClipType.NORM: clip_grad_norm,
+    }
+    return _GRADIENT_CLIP_TYPE_TO_CLIPPER[GradientClipType(cfg.CLIP_TYPE)]
+
+
+def _generate_optimizer_class_with_gradient_clipping(
+    optimizer_type: Type[torch.optim.Optimizer], gradient_clipper: _GradientClipper
+) -> Type[torch.optim.Optimizer]:
+    """
+    Dynamically creates a new type that inherits the type of a given instance
+    and overrides the `step` method to add gradient clipping
+    """
+
+    def optimizer_wgc_step(self, closure=None):
+        for group in self.param_groups:
+            for p in group["params"]:
+                gradient_clipper(p)
+        super(type(self), self).step(closure)
+
+    OptimizerWithGradientClip = type(
+        optimizer_type.__name__ + "WithGradientClip",
+        (optimizer_type,),
+        {"step": optimizer_wgc_step},
+    )
+    return OptimizerWithGradientClip
+
+
+def maybe_add_gradient_clipping(
+    cfg: CfgNode, optimizer: torch.optim.Optimizer
+) -> torch.optim.Optimizer:
+    """
+    If gradient clipping is enabled through config options, wraps the existing
+    optimizer instance of some type OptimizerType to become an instance
+    of the new dynamically created class OptimizerTypeWithGradientClip
+    that inherits OptimizerType and overrides the `step` method to
+    include gradient clipping.
+
+    Args:
+        cfg: CfgNode
+            configuration options
+        optimizer: torch.optim.Optimizer
+            existing optimizer instance
+
+    Return:
+        optimizer: torch.optim.Optimizer
+            either the unmodified optimizer instance (if gradient clipping is
+            disabled), or the same instance with adjusted __class__ to override
+            the `step` method and include gradient clipping
+    """
+    if not cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
+        return optimizer
+    grad_clipper = _create_gradient_clipper(cfg.SOLVER.CLIP_GRADIENTS)
+    OptimizerWithGradientClip = _generate_optimizer_class_with_gradient_clipping(
+        type(optimizer), grad_clipper
+    )
+    optimizer.__class__ = OptimizerWithGradientClip
+    return optimizer
+
+
+def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
+    """
+    Build an optimizer from config.
+    """
+    norm_module_types = (
+        torch.nn.BatchNorm1d,
+        torch.nn.BatchNorm2d,
+        torch.nn.BatchNorm3d,
+        torch.nn.SyncBatchNorm,
+        # NaiveSyncBatchNorm inherits from BatchNorm2d
+        torch.nn.GroupNorm,
+        torch.nn.InstanceNorm1d,
+        torch.nn.InstanceNorm2d,
+        torch.nn.InstanceNorm3d,
+        torch.nn.LayerNorm,
+        torch.nn.LocalResponseNorm,
+    )
+    params: List[Dict[str, Any]] = []
+    memo: Set[torch.nn.parameter.Parameter] = set()
+    for module in model.modules():
+        for key, value in module.named_parameters(recurse=False):
+            if not value.requires_grad:
+                continue
+            # Avoid duplicating parameters
+            if value in memo:
+                continue
+            memo.add(value)
+            lr = cfg.SOLVER.BASE_LR
+            weight_decay = cfg.SOLVER.WEIGHT_DECAY
+            if isinstance(module, norm_module_types):
+                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
+            elif key == "bias":
+                # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
+                # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
+                # hyperparameters are by default exactly the same as for regular
+                # weights.
+                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
+                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
+            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
+
+    optimizer = torch.optim.SGD(
+        params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV
+    )
+    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
+    return optimizer
+
+
+def build_lr_scheduler(
+    cfg: CfgNode, optimizer: torch.optim.Optimizer
+) -> torch.optim.lr_scheduler._LRScheduler:
+    """
+    Build a LR scheduler from config.
+    """
+    name = cfg.SOLVER.LR_SCHEDULER_NAME
+    if name == "WarmupMultiStepLR":
+        return WarmupMultiStepLR(
+            optimizer,
+            cfg.SOLVER.STEPS,
+            cfg.SOLVER.GAMMA,
+            warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
+            warmup_iters=cfg.SOLVER.WARMUP_ITERS,
+            warmup_method=cfg.SOLVER.WARMUP_METHOD,
+        )
+    elif name == "WarmupCosineLR":
+        return WarmupCosineLR(
+            optimizer,
+            cfg.SOLVER.MAX_ITER,
+            warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
+            warmup_iters=cfg.SOLVER.WARMUP_ITERS,
+            warmup_method=cfg.SOLVER.WARMUP_METHOD,
+        )
+    else:
+        raise ValueError("Unknown LR scheduler: {}".format(name))
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/solver/lr_scheduler.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/solver/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..6148d86785dae03ed2611792fb28da387d1103b8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/solver/lr_scheduler.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+from bisect import bisect_right
+from typing import List
+import torch
+
+# NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes
+# only on epoch boundaries. We typically use iteration based schedules instead.
+# As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean
+# "iteration" instead.
+
+# FIXME: ideally this would be achieved with a CombinedLRScheduler, separating
+# MultiStepLR with WarmupLR but the current LRScheduler design doesn't allow it.
+
+
+class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        milestones: List[int],
+        gamma: float = 0.1,
+        warmup_factor: float = 0.001,
+        warmup_iters: int = 1000,
+        warmup_method: str = "linear",
+        last_epoch: int = -1,
+    ):
+        if not list(milestones) == sorted(milestones):
+            raise ValueError(
+                "Milestones should be a list of" " increasing integers. Got {}", milestones
+            )
+        self.milestones = milestones
+        self.gamma = gamma
+        self.warmup_factor = warmup_factor
+        self.warmup_iters = warmup_iters
+        self.warmup_method = warmup_method
+        super().__init__(optimizer, last_epoch)
+
+    def get_lr(self) -> List[float]:
+        warmup_factor = _get_warmup_factor_at_iter(
+            self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
+        )
+        return [
+            base_lr * warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch)
+            for base_lr in self.base_lrs
+        ]
+
+    def _compute_values(self) -> List[float]:
+        # The new interface
+        return self.get_lr()
+
+
+class WarmupCosineLR(torch.optim.lr_scheduler._LRScheduler):
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        max_iters: int,
+        warmup_factor: float = 0.001,
+        warmup_iters: int = 1000,
+        warmup_method: str = "linear",
+        last_epoch: int = -1,
+    ):
+        self.max_iters = max_iters
+        self.warmup_factor = warmup_factor
+        self.warmup_iters = warmup_iters
+        self.warmup_method = warmup_method
+        super().__init__(optimizer, last_epoch)
+
+    def get_lr(self) -> List[float]:
+        warmup_factor = _get_warmup_factor_at_iter(
+            self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
+        )
+        # Different definitions of half-cosine with warmup are possible. For
+        # simplicity we multiply the standard half-cosine schedule by the warmup
+        # factor. An alternative is to start the period of the cosine at warmup_iters
+        # instead of at 0. In the case that warmup_iters << max_iters the two are
+        # very close to each other.
+        return [
+            base_lr
+            * warmup_factor
+            * 0.5
+            * (1.0 + math.cos(math.pi * self.last_epoch / self.max_iters))
+            for base_lr in self.base_lrs
+        ]
+
+    def _compute_values(self) -> List[float]:
+        # The new interface
+        return self.get_lr()
+
+
+def _get_warmup_factor_at_iter(
+    method: str, iter: int, warmup_iters: int, warmup_factor: float
+) -> float:
+    """
+    Return the learning rate warmup factor at a specific iteration.
+    See :paper:`in1k1h` for more details.
+
+    Args:
+        method (str): warmup method; either "constant" or "linear".
+        iter (int): iteration at which to calculate the warmup factor.
+        warmup_iters (int): the number of warmup iterations.
+        warmup_factor (float): the base warmup factor (the meaning changes according
+            to the method used).
+
+    Returns:
+        float: the effective warmup factor at the given iteration.
+    """
+    if iter >= warmup_iters:
+        return 1.0
+
+    if method == "constant":
+        return warmup_factor
+    elif method == "linear":
+        alpha = iter / warmup_iters
+        return warmup_factor * (1 - alpha) + alpha
+    else:
+        raise ValueError("Unknown warmup method: {}".format(method))
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..618f526753b5813b86645023271b67b421ea4cb5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .boxes import Boxes, BoxMode, pairwise_iou
+from .image_list import ImageList
+
+from .instances import Instances
+from .keypoints import Keypoints, heatmaps_to_keypoints
+from .masks import BitMasks, PolygonMasks, rasterize_polygons_within_box, polygons_to_bitmask
+from .rotated_boxes import RotatedBoxes
+from .rotated_boxes import pairwise_iou as pairwise_iou_rotated
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/boxes.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..e625803e23ec6c0f71ada847ba7bef8e15c8fa40
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/boxes.py
@@ -0,0 +1,367 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+import numpy as np
+from enum import IntEnum, unique
+from typing import Iterator, List, Tuple, Union
+import torch
+
+_RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray]
+
+
+@unique
+class BoxMode(IntEnum):
+    """
+    Enum of different ways to represent a box.
+    """
+
+    XYXY_ABS = 0
+    """
+    (x0, y0, x1, y1) in absolute floating points coordinates.
+    The coordinates in range [0, width or height].
+    """
+    XYWH_ABS = 1
+    """
+    (x0, y0, w, h) in absolute floating points coordinates.
+    """
+    XYXY_REL = 2
+    """
+    Not yet supported!
+    (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image.
+    """
+    XYWH_REL = 3
+    """
+    Not yet supported!
+    (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image.
+    """
+    XYWHA_ABS = 4
+    """
+    (xc, yc, w, h, a) in absolute floating points coordinates.
+    (xc, yc) is the center of the rotated box, and the angle a is in degrees ccw.
+    """
+
+    @staticmethod
+    def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType:
+        """
+        Args:
+            box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5
+            from_mode, to_mode (BoxMode)
+
+        Returns:
+            The converted box of the same type.
+        """
+        if from_mode == to_mode:
+            return box
+
+        original_type = type(box)
+        is_numpy = isinstance(box, np.ndarray)
+        single_box = isinstance(box, (list, tuple))
+        if single_box:
+            assert len(box) == 4 or len(box) == 5, (
+                "BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor,"
+                " where k == 4 or 5"
+            )
+            arr = torch.tensor(box)[None, :]
+        else:
+            # avoid modifying the input box
+            if is_numpy:
+                arr = torch.from_numpy(np.asarray(box)).clone()
+            else:
+                arr = box.clone()
+
+        assert to_mode.value not in [
+            BoxMode.XYXY_REL,
+            BoxMode.XYWH_REL,
+        ] and from_mode.value not in [
+            BoxMode.XYXY_REL,
+            BoxMode.XYWH_REL,
+        ], "Relative mode not yet supported!"
+
+        if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS:
+            assert (
+                arr.shape[-1] == 5
+            ), "The last dimension of input shape must be 5 for XYWHA format"
+            original_dtype = arr.dtype
+            arr = arr.double()
+
+            w = arr[:, 2]
+            h = arr[:, 3]
+            a = arr[:, 4]
+            c = torch.abs(torch.cos(a * math.pi / 180.0))
+            s = torch.abs(torch.sin(a * math.pi / 180.0))
+            # This basically computes the horizontal bounding rectangle of the rotated box
+            new_w = c * w + s * h
+            new_h = c * h + s * w
+
+            # convert center to top-left corner
+            arr[:, 0] -= new_w / 2.0
+            arr[:, 1] -= new_h / 2.0
+            # bottom-right corner
+            arr[:, 2] = arr[:, 0] + new_w
+            arr[:, 3] = arr[:, 1] + new_h
+
+            arr = arr[:, :4].to(dtype=original_dtype)
+        elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS:
+            original_dtype = arr.dtype
+            arr = arr.double()
+            arr[:, 0] += arr[:, 2] / 2.0
+            arr[:, 1] += arr[:, 3] / 2.0
+            angles = torch.zeros((arr.shape[0], 1), dtype=arr.dtype)
+            arr = torch.cat((arr, angles), axis=1).to(dtype=original_dtype)
+        else:
+            if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS:
+                arr[:, 2] += arr[:, 0]
+                arr[:, 3] += arr[:, 1]
+            elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS:
+                arr[:, 2] -= arr[:, 0]
+                arr[:, 3] -= arr[:, 1]
+            else:
+                raise NotImplementedError(
+                    "Conversion from BoxMode {} to {} is not supported yet".format(
+                        from_mode, to_mode
+                    )
+                )
+
+        if single_box:
+            return original_type(arr.flatten().tolist())
+        if is_numpy:
+            return arr.numpy()
+        else:
+            return arr
+
+
+class Boxes:
+    """
+    This structure stores a list of boxes as a Nx4 torch.Tensor.
+    It supports some common methods about boxes
+    (`area`, `clip`, `nonempty`, etc),
+    and also behaves like a Tensor
+    (support indexing, `to(device)`, `.device`, and iteration over all boxes)
+
+    Attributes:
+        tensor (torch.Tensor): float matrix of Nx4. Each row is (x1, y1, x2, y2).
+    """
+
+    BoxSizeType = Union[List[int], Tuple[int, int]]
+
+    def __init__(self, tensor: torch.Tensor):
+        """
+        Args:
+            tensor (Tensor[float]): a Nx4 matrix.  Each row is (x1, y1, x2, y2).
+        """
+        device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
+        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+        if tensor.numel() == 0:
+            # Use reshape, so we don't end up creating a new tensor that does not depend on
+            # the inputs (and consequently confuses jit)
+            tensor = tensor.reshape((0, 4)).to(dtype=torch.float32, device=device)
+        assert tensor.dim() == 2 and tensor.size(-1) == 4, tensor.size()
+
+        self.tensor = tensor
+
+    def clone(self) -> "Boxes":
+        """
+        Clone the Boxes.
+
+        Returns:
+            Boxes
+        """
+        return Boxes(self.tensor.clone())
+
+    def to(self, device: str) -> "Boxes":
+        return Boxes(self.tensor.to(device))
+
+    def area(self) -> torch.Tensor:
+        """
+        Computes the area of all the boxes.
+
+        Returns:
+            torch.Tensor: a vector with areas of each box.
+        """
+        box = self.tensor
+        area = (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1])
+        return area
+
+    def clip(self, box_size: BoxSizeType) -> None:
+        """
+        Clip (in place) the boxes by limiting x coordinates to the range [0, width]
+        and y coordinates to the range [0, height].
+
+        Args:
+            box_size (height, width): The clipping box's size.
+        """
+        assert torch.isfinite(self.tensor).all(), "Box tensor contains infinite or NaN!"
+        h, w = box_size
+        self.tensor[:, 0].clamp_(min=0, max=w)
+        self.tensor[:, 1].clamp_(min=0, max=h)
+        self.tensor[:, 2].clamp_(min=0, max=w)
+        self.tensor[:, 3].clamp_(min=0, max=h)
+
+    def nonempty(self, threshold: float = 0.0) -> torch.Tensor:
+        """
+        Find boxes that are non-empty.
+        A box is considered empty, if either of its side is no larger than threshold.
+
+        Returns:
+            Tensor:
+                a binary vector which represents whether each box is empty
+                (False) or non-empty (True).
+        """
+        box = self.tensor
+        widths = box[:, 2] - box[:, 0]
+        heights = box[:, 3] - box[:, 1]
+        keep = (widths > threshold) & (heights > threshold)
+        return keep
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Boxes":
+        """
+        Returns:
+            Boxes: Create a new :class:`Boxes` by indexing.
+
+        The following usage are allowed:
+
+        1. `new_boxes = boxes[3]`: return a `Boxes` which contains only one box.
+        2. `new_boxes = boxes[2:10]`: return a slice of boxes.
+        3. `new_boxes = boxes[vector]`, where vector is a torch.BoolTensor
+           with `length = len(boxes)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned Boxes might share storage with this Boxes,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return Boxes(self.tensor[item].view(1, -1))
+        b = self.tensor[item]
+        assert b.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item)
+        return Boxes(b)
+
+    def __len__(self) -> int:
+        return self.tensor.shape[0]
+
+    def __repr__(self) -> str:
+        return "Boxes(" + str(self.tensor) + ")"
+
+    def inside_box(self, box_size: BoxSizeType, boundary_threshold: int = 0) -> torch.Tensor:
+        """
+        Args:
+            box_size (height, width): Size of the reference box.
+            boundary_threshold (int): Boxes that extend beyond the reference box
+                boundary by more than boundary_threshold are considered "outside".
+
+        Returns:
+            a binary vector, indicating whether each box is inside the reference box.
+        """
+        height, width = box_size
+        inds_inside = (
+            (self.tensor[..., 0] >= -boundary_threshold)
+            & (self.tensor[..., 1] >= -boundary_threshold)
+            & (self.tensor[..., 2] < width + boundary_threshold)
+            & (self.tensor[..., 3] < height + boundary_threshold)
+        )
+        return inds_inside
+
+    def get_centers(self) -> torch.Tensor:
+        """
+        Returns:
+            The box centers in a Nx2 array of (x, y).
+        """
+        return (self.tensor[:, :2] + self.tensor[:, 2:]) / 2
+
+    def scale(self, scale_x: float, scale_y: float) -> None:
+        """
+        Scale the box with horizontal and vertical scaling factors
+        """
+        self.tensor[:, 0::2] *= scale_x
+        self.tensor[:, 1::2] *= scale_y
+
+    @classmethod
+    def cat(cls, boxes_list: List["Boxes"]) -> "Boxes":
+        """
+        Concatenates a list of Boxes into a single Boxes
+
+        Arguments:
+            boxes_list (list[Boxes])
+
+        Returns:
+            Boxes: the concatenated Boxes
+        """
+        assert isinstance(boxes_list, (list, tuple))
+        if len(boxes_list) == 0:
+            return cls(torch.empty(0))
+        assert all(isinstance(box, Boxes) for box in boxes_list)
+
+        # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input
+        cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0))
+        return cat_boxes
+
+    @property
+    def device(self) -> torch.device:
+        return self.tensor.device
+
+    def __iter__(self) -> Iterator[torch.Tensor]:
+        """
+        Yield a box as a Tensor of shape (4,) at a time.
+        """
+        yield from self.tensor
+
+
+# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
+# with slight modifications
+def pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
+    """
+    Given two lists of boxes of size N and M,
+    compute the IoU (intersection over union)
+    between __all__ N x M pairs of boxes.
+    The box order must be (xmin, ymin, xmax, ymax).
+
+    Args:
+        boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.
+
+    Returns:
+        Tensor: IoU, sized [N,M].
+    """
+    area1 = boxes1.area()
+    area2 = boxes2.area()
+
+    boxes1, boxes2 = boxes1.tensor, boxes2.tensor
+
+    width_height = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) - torch.max(
+        boxes1[:, None, :2], boxes2[:, :2]
+    )  # [N,M,2]
+
+    width_height.clamp_(min=0)  # [N,M,2]
+    inter = width_height.prod(dim=2)  # [N,M]
+    del width_height
+
+    # handle empty boxes
+    iou = torch.where(
+        inter > 0,
+        inter / (area1[:, None] + area2 - inter),
+        torch.zeros(1, dtype=inter.dtype, device=inter.device),
+    )
+    return iou
+
+
+def matched_boxlist_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
+    """
+    Compute pairwise intersection over union (IOU) of two sets of matched
+    boxes. The box order must be (xmin, ymin, xmax, ymax).
+    Similar to boxlist_iou, but computes only diagonal elements of the matrix
+    Arguments:
+        boxes1: (Boxes) bounding boxes, sized [N,4].
+        boxes2: (Boxes) bounding boxes, sized [N,4].
+    Returns:
+        (tensor) iou, sized [N].
+    """
+    assert len(boxes1) == len(
+        boxes2
+    ), "boxlists should have the same" "number of entries, got {}, {}".format(
+        len(boxes1), len(boxes2)
+    )
+    area1 = boxes1.area()  # [N]
+    area2 = boxes2.area()  # [N]
+    box1, box2 = boxes1.tensor, boxes2.tensor
+    lt = torch.max(box1[:, :2], box2[:, :2])  # [N,2]
+    rb = torch.min(box1[:, 2:], box2[:, 2:])  # [N,2]
+    wh = (rb - lt).clamp(min=0)  # [N,2]
+    inter = wh[:, 0] * wh[:, 1]  # [N]
+    iou = inter / (area1 + area2 - inter)  # [N]
+    return iou
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/image_list.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/image_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d89224b64402badf7f0b113188b5f653df912ac
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/image_list.py
@@ -0,0 +1,113 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from __future__ import division
+from typing import Any, List, Sequence, Tuple, Union
+import torch
+from torch.nn import functional as F
+
+
+class ImageList(object):
+    """
+    Structure that holds a list of images (of possibly
+    varying sizes) as a single tensor.
+    This works by padding the images to the same size,
+    and storing in a field the original sizes of each image
+
+    Attributes:
+        image_sizes (list[tuple[int, int]]): each tuple is (h, w)
+    """
+
+    def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]]):
+        """
+        Arguments:
+            tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1
+            image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can
+                be smaller than (H, W) due to padding.
+        """
+        self.tensor = tensor
+        self.image_sizes = image_sizes
+
+    def __len__(self) -> int:
+        return len(self.image_sizes)
+
+    def __getitem__(self, idx: Union[int, slice]) -> torch.Tensor:
+        """
+        Access the individual image in its original size.
+
+        Returns:
+            Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1
+        """
+        size = self.image_sizes[idx]
+        return self.tensor[idx, ..., : size[0], : size[1]]  # type: ignore
+
+    def to(self, *args: Any, **kwargs: Any) -> "ImageList":
+        cast_tensor = self.tensor.to(*args, **kwargs)
+        return ImageList(cast_tensor, self.image_sizes)
+
+    @property
+    def device(self) -> torch.device:
+        return self.tensor.device
+
+    @staticmethod
+    def from_tensors(
+        tensors: Sequence[torch.Tensor], size_divisibility: int = 0, pad_value: float = 0.0
+    ) -> "ImageList":
+        """
+        Args:
+            tensors: a tuple or list of `torch.Tensors`, each of shape (Hi, Wi) or
+                (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded
+                to the same shape with `pad_value`.
+            size_divisibility (int): If `size_divisibility > 0`, add padding to ensure
+                the common height and width is divisible by `size_divisibility`.
+                This depends on the model and many models need a divisibility of 32.
+            pad_value (float): value to pad
+
+        Returns:
+            an `ImageList`.
+        """
+        assert len(tensors) > 0
+        assert isinstance(tensors, (tuple, list))
+        for t in tensors:
+            assert isinstance(t, torch.Tensor), type(t)
+            assert t.shape[1:-2] == tensors[0].shape[1:-2], t.shape
+        # per dimension maximum (H, W) or (C_1, ..., C_K, H, W) where K >= 1 among all tensors
+        max_size = (
+            # In tracing mode, x.shape[i] is Tensor, and should not be converted
+            # to int: this will cause the traced graph to have hard-coded shapes.
+            # Instead we should make max_size a Tensor that depends on these tensors.
+            # Using torch.stack twice seems to be the best way to convert
+            # list[list[ScalarTensor]] to a Tensor
+            torch.stack(
+                [
+                    torch.stack([torch.as_tensor(dim) for dim in size])
+                    for size in [tuple(img.shape) for img in tensors]
+                ]
+            )
+            .max(0)
+            .values
+        )
+
+        if size_divisibility > 0:
+            stride = size_divisibility
+            # the last two dims are H,W, both subject to divisibility requirement
+            max_size = torch.cat([max_size[:-2], (max_size[-2:] + (stride - 1)) // stride * stride])
+
+        image_sizes = [tuple(im.shape[-2:]) for im in tensors]
+
+        if len(tensors) == 1:
+            # This seems slightly (2%) faster.
+            # TODO: check whether it's faster for multiple images as well
+            image_size = image_sizes[0]
+            padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]]
+            if all(x == 0 for x in padding_size):  # https://github.com/pytorch/pytorch/issues/31734
+                batched_imgs = tensors[0].unsqueeze(0)
+            else:
+                padded = F.pad(tensors[0], padding_size, value=pad_value)
+                batched_imgs = padded.unsqueeze_(0)
+        else:
+            # max_size can be a tensor in tracing mode, therefore use tuple()
+            batch_shape = (len(tensors),) + tuple(max_size)
+            batched_imgs = tensors[0].new_full(batch_shape, pad_value)
+            for img, pad_img in zip(tensors, batched_imgs):
+                pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
+
+        return ImageList(batched_imgs.contiguous(), image_sizes)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/instances.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/instances.py
new file mode 100644
index 0000000000000000000000000000000000000000..373de08c01517c0f78b14d94da7ff702daaf375d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/instances.py
@@ -0,0 +1,185 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+from typing import Any, Dict, List, Tuple, Union
+import torch
+
+
+class Instances:
+    """
+    This class represents a list of instances in an image.
+    It stores the attributes of instances (e.g., boxes, masks, labels, scores) as "fields".
+    All fields must have the same ``__len__`` which is the number of instances.
+
+    All other (non-field) attributes of this class are considered private:
+    they must start with '_' and are not modifiable by a user.
+
+    Some basic usage:
+
+    1. Set/Get a field:
+
+       .. code-block:: python
+
+          instances.gt_boxes = Boxes(...)
+          print(instances.pred_masks)  # a tensor of shape (N, H, W)
+          print('gt_masks' in instances)
+
+    2. ``len(instances)`` returns the number of instances
+    3. Indexing: ``instances[indices]`` will apply the indexing on all the fields
+       and returns a new :class:`Instances`.
+       Typically, ``indices`` is a integer vector of indices,
+       or a binary mask of length ``num_instances``,
+    """
+
+    def __init__(self, image_size: Tuple[int, int], **kwargs: Any):
+        """
+        Args:
+            image_size (height, width): the spatial size of the image.
+            kwargs: fields to add to this `Instances`.
+        """
+        self._image_size = image_size
+        self._fields: Dict[str, Any] = {}
+        for k, v in kwargs.items():
+            self.set(k, v)
+
+    @property
+    def image_size(self) -> Tuple[int, int]:
+        """
+        Returns:
+            tuple: height, width
+        """
+        return self._image_size
+
+    def __setattr__(self, name: str, val: Any) -> None:
+        if name.startswith("_"):
+            super().__setattr__(name, val)
+        else:
+            self.set(name, val)
+
+    def __getattr__(self, name: str) -> Any:
+        if name == "_fields" or name not in self._fields:
+            raise AttributeError("Cannot find field '{}' in the given Instances!".format(name))
+        return self._fields[name]
+
+    def set(self, name: str, value: Any) -> None:
+        """
+        Set the field named `name` to `value`.
+        The length of `value` must be the number of instances,
+        and must agree with other existing fields in this object.
+        """
+        data_len = len(value)
+        if len(self._fields):
+            assert (
+                len(self) == data_len
+            ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self))
+        self._fields[name] = value
+
+    def has(self, name: str) -> bool:
+        """
+        Returns:
+            bool: whether the field called `name` exists.
+        """
+        return name in self._fields
+
+    def remove(self, name: str) -> None:
+        """
+        Remove the field called `name`.
+        """
+        del self._fields[name]
+
+    def get(self, name: str) -> Any:
+        """
+        Returns the field called `name`.
+        """
+        return self._fields[name]
+
+    def get_fields(self) -> Dict[str, Any]:
+        """
+        Returns:
+            dict: a dict which maps names (str) to data of the fields
+
+        Modifying the returned dict will modify this instance.
+        """
+        return self._fields
+
+    # Tensor-like methods
+    def to(self, device: str) -> "Instances":
+        """
+        Returns:
+            Instances: all fields are called with a `to(device)`, if the field has this method.
+        """
+        ret = Instances(self._image_size)
+        for k, v in self._fields.items():
+            if hasattr(v, "to"):
+                v = v.to(device)
+            ret.set(k, v)
+        return ret
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Instances":
+        """
+        Args:
+            item: an index-like object and will be used to index all the fields.
+
+        Returns:
+            If `item` is a string, return the data in the corresponding field.
+            Otherwise, returns an `Instances` where all fields are indexed by `item`.
+        """
+        if type(item) == int:
+            if item >= len(self) or item < -len(self):
+                raise IndexError("Instances index out of range!")
+            else:
+                item = slice(item, None, len(self))
+
+        ret = Instances(self._image_size)
+        for k, v in self._fields.items():
+            ret.set(k, v[item])
+        return ret
+
+    def __len__(self) -> int:
+        for v in self._fields.values():
+            return len(v)
+        raise NotImplementedError("Empty Instances does not support __len__!")
+
+    def __iter__(self):
+        raise NotImplementedError("`Instances` object is not iterable!")
+
+    @staticmethod
+    def cat(instance_lists: List["Instances"]) -> "Instances":
+        """
+        Args:
+            instance_lists (list[Instances])
+
+        Returns:
+            Instances
+        """
+        assert all(isinstance(i, Instances) for i in instance_lists)
+        assert len(instance_lists) > 0
+        if len(instance_lists) == 1:
+            return instance_lists[0]
+
+        image_size = instance_lists[0].image_size
+        for i in instance_lists[1:]:
+            assert i.image_size == image_size
+        ret = Instances(image_size)
+        for k in instance_lists[0]._fields.keys():
+            values = [i.get(k) for i in instance_lists]
+            v0 = values[0]
+            if isinstance(v0, torch.Tensor):
+                values = torch.cat(values, dim=0)
+            elif isinstance(v0, list):
+                values = list(itertools.chain(*values))
+            elif hasattr(type(v0), "cat"):
+                values = type(v0).cat(values)
+            else:
+                raise ValueError("Unsupported type {} for concatenation".format(type(v0)))
+            ret.set(k, values)
+        return ret
+
+    def __str__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={}, ".format(len(self))
+        s += "image_height={}, ".format(self._image_size[0])
+        s += "image_width={}, ".format(self._image_size[1])
+        s += "fields=[{}])".format(", ".join((f"{k}: {v}" for k, v in self._fields.items())))
+        return s
+
+    __repr__ = __str__
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/keypoints.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/keypoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..2242815f31dfe88aaabbf4b49f724c999a71912d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/keypoints.py
@@ -0,0 +1,209 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+from typing import Any, List, Tuple, Union
+import torch
+
+from detectron2.layers import interpolate
+
+
+class Keypoints:
+    """
+    Stores keypoint annotation data. GT Instances have a `gt_keypoints` property
+    containing the x,y location and visibility flag of each keypoint. This tensor has shape
+    (N, K, 3) where N is the number of instances and K is the number of keypoints per instance.
+
+    The visibility flag follows the COCO format and must be one of three integers:
+    * v=0: not labeled (in which case x=y=0)
+    * v=1: labeled but not visible
+    * v=2: labeled and visible
+    """
+
+    def __init__(self, keypoints: Union[torch.Tensor, np.ndarray, List[List[float]]]):
+        """
+        Arguments:
+            keypoints: A Tensor, numpy array, or list of the x, y, and visibility of each keypoint.
+                The shape should be (N, K, 3) where N is the number of
+                instances, and K is the number of keypoints per instance.
+        """
+        device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device("cpu")
+        keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device)
+        assert keypoints.dim() == 3 and keypoints.shape[2] == 3, keypoints.shape
+        self.tensor = keypoints
+
+    def __len__(self) -> int:
+        return self.tensor.size(0)
+
+    def to(self, *args: Any, **kwargs: Any) -> "Keypoints":
+        return type(self)(self.tensor.to(*args, **kwargs))
+
+    @property
+    def device(self) -> torch.device:
+        return self.tensor.device
+
+    def to_heatmap(self, boxes: torch.Tensor, heatmap_size: int) -> torch.Tensor:
+        """
+        Arguments:
+            boxes: Nx4 tensor, the boxes to draw the keypoints to
+
+        Returns:
+            heatmaps:
+                A tensor of shape (N, K) containing an integer spatial label
+                in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
+            valid:
+                A tensor of shape (N, K) containing whether each keypoint is in the roi or not.
+        """
+        return _keypoints_to_heatmap(self.tensor, boxes, heatmap_size)
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Keypoints":
+        """
+        Create a new `Keypoints` by indexing on this `Keypoints`.
+
+        The following usage are allowed:
+
+        1. `new_kpts = kpts[3]`: return a `Keypoints` which contains only one instance.
+        2. `new_kpts = kpts[2:10]`: return a slice of key points.
+        3. `new_kpts = kpts[vector]`, where vector is a torch.ByteTensor
+           with `length = len(kpts)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned Keypoints might share storage with this Keypoints,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return Keypoints([self.tensor[item]])
+        return Keypoints(self.tensor[item])
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={})".format(len(self.tensor))
+        return s
+
+
+# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop)
+def _keypoints_to_heatmap(
+    keypoints: torch.Tensor, rois: torch.Tensor, heatmap_size: int
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Encode keypoint locations into a target heatmap for use in SoftmaxWithLoss across space.
+
+    Maps keypoints from the half-open interval [x1, x2) on continuous image coordinates to the
+    closed interval [0, heatmap_size - 1] on discrete image coordinates. We use the
+    continuous-discrete conversion from Heckbert 1990 ("What is the coordinate of a pixel?"):
+    d = floor(c) and c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
+
+    Arguments:
+        keypoints: tensor of keypoint locations in of shape (N, K, 3).
+        rois: Nx4 tensor of rois in xyxy format
+        heatmap_size: integer side length of square heatmap.
+
+    Returns:
+        heatmaps: A tensor of shape (N, K) containing an integer spatial label
+            in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
+        valid: A tensor of shape (N, K) containing whether each keypoint is in
+            the roi or not.
+    """
+
+    if rois.numel() == 0:
+        return rois.new().long(), rois.new().long()
+    offset_x = rois[:, 0]
+    offset_y = rois[:, 1]
+    scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
+    scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])
+
+    offset_x = offset_x[:, None]
+    offset_y = offset_y[:, None]
+    scale_x = scale_x[:, None]
+    scale_y = scale_y[:, None]
+
+    x = keypoints[..., 0]
+    y = keypoints[..., 1]
+
+    x_boundary_inds = x == rois[:, 2][:, None]
+    y_boundary_inds = y == rois[:, 3][:, None]
+
+    x = (x - offset_x) * scale_x
+    x = x.floor().long()
+    y = (y - offset_y) * scale_y
+    y = y.floor().long()
+
+    x[x_boundary_inds] = heatmap_size - 1
+    y[y_boundary_inds] = heatmap_size - 1
+
+    valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
+    vis = keypoints[..., 2] > 0
+    valid = (valid_loc & vis).long()
+
+    lin_ind = y * heatmap_size + x
+    heatmaps = lin_ind * valid
+
+    return heatmaps, valid
+
+
+@torch.no_grad()
+def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
+    """
+    Extract predicted keypoint locations from heatmaps.
+
+    Args:
+        maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of logits for
+            each ROI and each keypoint.
+        rois (Tensor): (#ROIs, 4). The box of each ROI.
+
+    Returns:
+        Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to
+        (x, y, logit, score) for each keypoint.
+
+    When converting discrete pixel indices in an NxN image to a continuous keypoint coordinate,
+    we maintain consistency with :meth:`Keypoints.to_heatmap` by using the conversion from
+    Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
+    """
+    offset_x = rois[:, 0]
+    offset_y = rois[:, 1]
+
+    widths = (rois[:, 2] - rois[:, 0]).clamp(min=1)
+    heights = (rois[:, 3] - rois[:, 1]).clamp(min=1)
+    widths_ceil = widths.ceil()
+    heights_ceil = heights.ceil()
+
+    num_rois, num_keypoints = maps.shape[:2]
+    xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4)
+
+    width_corrections = widths / widths_ceil
+    height_corrections = heights / heights_ceil
+
+    keypoints_idx = torch.arange(num_keypoints, device=maps.device)
+
+    for i in range(num_rois):
+        outsize = (int(heights_ceil[i]), int(widths_ceil[i]))
+        roi_map = interpolate(maps[[i]], size=outsize, mode="bicubic", align_corners=False).squeeze(
+            0
+        )  # #keypoints x H x W
+
+        # softmax over the spatial region
+        max_score, _ = roi_map.view(num_keypoints, -1).max(1)
+        max_score = max_score.view(num_keypoints, 1, 1)
+        tmp_full_resolution = (roi_map - max_score).exp_()
+        tmp_pool_resolution = (maps[i] - max_score).exp_()
+        # Produce scores over the region H x W, but normalize with POOL_H x POOL_W,
+        # so that the scores of objects of different absolute sizes will be more comparable
+        roi_map_scores = tmp_full_resolution / tmp_pool_resolution.sum((1, 2), keepdim=True)
+
+        w = roi_map.shape[2]
+        pos = roi_map.view(num_keypoints, -1).argmax(1)
+
+        x_int = pos % w
+        y_int = (pos - x_int) // w
+
+        assert (
+            roi_map_scores[keypoints_idx, y_int, x_int]
+            == roi_map_scores.view(num_keypoints, -1).max(1)[0]
+        ).all()
+
+        x = (x_int.float() + 0.5) * width_corrections[i]
+        y = (y_int.float() + 0.5) * height_corrections[i]
+
+        xy_preds[i, :, 0] = x + offset_x[i]
+        xy_preds[i, :, 1] = y + offset_y[i]
+        xy_preds[i, :, 2] = roi_map[keypoints_idx, y_int, x_int]
+        xy_preds[i, :, 3] = roi_map_scores[keypoints_idx, y_int, x_int]
+
+    return xy_preds
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/masks.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/masks.py
new file mode 100644
index 0000000000000000000000000000000000000000..e363baf3d8cfc4694558fc12bbd2e9d65507b9d9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/masks.py
@@ -0,0 +1,424 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import itertools
+import numpy as np
+from typing import Any, Iterator, List, Union
+import pycocotools.mask as mask_utils
+import torch
+
+from detectron2.layers.roi_align import ROIAlign
+
+from .boxes import Boxes
+
+
+def polygon_area(x, y):
+    # Using the shoelace formula
+    # https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
+    return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
+
+
+def polygons_to_bitmask(polygons: List[np.ndarray], height: int, width: int) -> np.ndarray:
+    """
+    Args:
+        polygons (list[ndarray]): each array has shape (Nx2,)
+        height, width (int)
+
+    Returns:
+        ndarray: a bool mask of shape (height, width)
+    """
+    assert len(polygons) > 0, "COCOAPI does not support empty polygons"
+    rles = mask_utils.frPyObjects(polygons, height, width)
+    rle = mask_utils.merge(rles)
+    return mask_utils.decode(rle).astype(np.bool)
+
+
+def rasterize_polygons_within_box(
+    polygons: List[np.ndarray], box: np.ndarray, mask_size: int
+) -> torch.Tensor:
+    """
+    Rasterize the polygons into a mask image and
+    crop the mask content in the given box.
+    The cropped mask is resized to (mask_size, mask_size).
+
+    This function is used when generating training targets for mask head in Mask R-CNN.
+    Given original ground-truth masks for an image, new ground-truth mask
+    training targets in the size of `mask_size x mask_size`
+    must be provided for each predicted box. This function will be called to
+    produce such targets.
+
+    Args:
+        polygons (list[ndarray[float]]): a list of polygons, which represents an instance.
+        box: 4-element numpy array
+        mask_size (int):
+
+    Returns:
+        Tensor: BoolTensor of shape (mask_size, mask_size)
+    """
+    # 1. Shift the polygons w.r.t the boxes
+    w, h = box[2] - box[0], box[3] - box[1]
+
+    polygons = copy.deepcopy(polygons)
+    for p in polygons:
+        p[0::2] = p[0::2] - box[0]
+        p[1::2] = p[1::2] - box[1]
+
+    # 2. Rescale the polygons to the new box size
+    # max() to avoid division by small number
+    ratio_h = mask_size / max(h, 0.1)
+    ratio_w = mask_size / max(w, 0.1)
+
+    if ratio_h == ratio_w:
+        for p in polygons:
+            p *= ratio_h
+    else:
+        for p in polygons:
+            p[0::2] *= ratio_w
+            p[1::2] *= ratio_h
+
+    # 3. Rasterize the polygons with coco api
+    mask = polygons_to_bitmask(polygons, mask_size, mask_size)
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+class BitMasks:
+    """
+    This class stores the segmentation masks for all objects in one image, in
+    the form of bitmaps.
+
+    Attributes:
+        tensor: bool Tensor of N,H,W, representing N instances in the image.
+    """
+
+    def __init__(self, tensor: Union[torch.Tensor, np.ndarray]):
+        """
+        Args:
+            tensor: bool Tensor of N,H,W, representing N instances in the image.
+        """
+        device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
+        tensor = torch.as_tensor(tensor, dtype=torch.bool, device=device)
+        assert tensor.dim() == 3, tensor.size()
+        self.image_size = tensor.shape[1:]
+        self.tensor = tensor
+
+    def to(self, device: str) -> "BitMasks":
+        return BitMasks(self.tensor.to(device))
+
+    @property
+    def device(self) -> torch.device:
+        return self.tensor.device
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "BitMasks":
+        """
+        Returns:
+            BitMasks: Create a new :class:`BitMasks` by indexing.
+
+        The following usage are allowed:
+
+        1. `new_masks = masks[3]`: return a `BitMasks` which contains only one mask.
+        2. `new_masks = masks[2:10]`: return a slice of masks.
+        3. `new_masks = masks[vector]`, where vector is a torch.BoolTensor
+           with `length = len(masks)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned object might share storage with this object,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return BitMasks(self.tensor[item].view(1, -1))
+        m = self.tensor[item]
+        assert m.dim() == 3, "Indexing on BitMasks with {} returns a tensor with shape {}!".format(
+            item, m.shape
+        )
+        return BitMasks(m)
+
+    def __iter__(self) -> torch.Tensor:
+        yield from self.tensor
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={})".format(len(self.tensor))
+        return s
+
+    def __len__(self) -> int:
+        return self.tensor.shape[0]
+
+    def nonempty(self) -> torch.Tensor:
+        """
+        Find masks that are non-empty.
+
+        Returns:
+            Tensor: a BoolTensor which represents
+                whether each mask is empty (False) or non-empty (True).
+        """
+        return self.tensor.flatten(1).any(dim=1)
+
+    @staticmethod
+    def from_polygon_masks(
+        polygon_masks: Union["PolygonMasks", List[List[np.ndarray]]], height: int, width: int
+    ) -> "BitMasks":
+        """
+        Args:
+            polygon_masks (list[list[ndarray]] or PolygonMasks)
+            height, width (int)
+        """
+        if isinstance(polygon_masks, PolygonMasks):
+            polygon_masks = polygon_masks.polygons
+        masks = [polygons_to_bitmask(p, height, width) for p in polygon_masks]
+        return BitMasks(torch.stack([torch.from_numpy(x) for x in masks]))
+
+    def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
+        """
+        Crop each bitmask by the given box, and resize results to (mask_size, mask_size).
+        This can be used to prepare training targets for Mask R-CNN.
+        It has less reconstruction error compared to rasterization with polygons.
+        However we observe no difference in accuracy,
+        but BitMasks requires more memory to store all the masks.
+
+        Args:
+            boxes (Tensor): Nx4 tensor storing the boxes for each mask
+            mask_size (int): the size of the rasterized mask.
+
+        Returns:
+            Tensor:
+                A bool tensor of shape (N, mask_size, mask_size), where
+                N is the number of predicted boxes for this image.
+        """
+        assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
+        device = self.tensor.device
+
+        batch_inds = torch.arange(len(boxes), device=device).to(dtype=boxes.dtype)[:, None]
+        rois = torch.cat([batch_inds, boxes], dim=1)  # Nx5
+
+        bit_masks = self.tensor.to(dtype=torch.float32)
+        rois = rois.to(device=device)
+        output = (
+            ROIAlign((mask_size, mask_size), 1.0, 0, aligned=True)
+            .forward(bit_masks[:, None, :, :], rois)
+            .squeeze(1)
+        )
+        output = output >= 0.5
+        return output
+
+    def get_bounding_boxes(self) -> None:
+        # not needed now
+        raise NotImplementedError
+
+    @staticmethod
+    def cat(bitmasks_list: List["BitMasks"]) -> "BitMasks":
+        """
+        Concatenates a list of BitMasks into a single BitMasks
+
+        Arguments:
+            bitmasks_list (list[BitMasks])
+
+        Returns:
+            BitMasks: the concatenated BitMasks
+        """
+        assert isinstance(bitmasks_list, (list, tuple))
+        assert len(bitmasks_list) > 0
+        assert all(isinstance(bitmask, BitMasks) for bitmask in bitmasks_list)
+
+        cat_bitmasks = type(bitmasks_list[0])(torch.cat([bm.tensor for bm in bitmasks_list], dim=0))
+        return cat_bitmasks
+
+
+class PolygonMasks:
+    """
+    This class stores the segmentation masks for all objects in one image, in the form of polygons.
+
+    Attributes:
+        polygons: list[list[ndarray]]. Each ndarray is a float64 vector representing a polygon.
+    """
+
+    def __init__(self, polygons: List[List[Union[torch.Tensor, np.ndarray]]]):
+        """
+        Arguments:
+            polygons (list[list[np.ndarray]]): The first
+                level of the list correspond to individual instances,
+                the second level to all the polygons that compose the
+                instance, and the third level to the polygon coordinates.
+                The third level array should have the format of
+                [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
+        """
+        assert isinstance(polygons, list), (
+            "Cannot create PolygonMasks: Expect a list of list of polygons per image. "
+            "Got '{}' instead.".format(type(polygons))
+        )
+
+        def _make_array(t: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
+            # Use float64 for higher precision, because why not?
+            # Always put polygons on CPU (self.to is a no-op) since they
+            # are supposed to be small tensors.
+            # May need to change this assumption if GPU placement becomes useful
+            if isinstance(t, torch.Tensor):
+                t = t.cpu().numpy()
+            return np.asarray(t).astype("float64")
+
+        def process_polygons(
+            polygons_per_instance: List[Union[torch.Tensor, np.ndarray]]
+        ) -> List[np.ndarray]:
+            assert isinstance(polygons_per_instance, list), (
+                "Cannot create polygons: Expect a list of polygons per instance. "
+                "Got '{}' instead.".format(type(polygons_per_instance))
+            )
+            # transform the polygon to a tensor
+            polygons_per_instance = [_make_array(p) for p in polygons_per_instance]
+            for polygon in polygons_per_instance:
+                assert len(polygon) % 2 == 0 and len(polygon) >= 6
+            return polygons_per_instance
+
+        self.polygons: List[List[np.ndarray]] = [
+            process_polygons(polygons_per_instance) for polygons_per_instance in polygons
+        ]
+
+    def to(self, *args: Any, **kwargs: Any) -> "PolygonMasks":
+        return self
+
+    @property
+    def device(self) -> torch.device:
+        return torch.device("cpu")
+
+    def get_bounding_boxes(self) -> Boxes:
+        """
+        Returns:
+            Boxes: tight bounding boxes around polygon masks.
+        """
+        boxes = torch.zeros(len(self.polygons), 4, dtype=torch.float32)
+        for idx, polygons_per_instance in enumerate(self.polygons):
+            minxy = torch.as_tensor([float("inf"), float("inf")], dtype=torch.float32)
+            maxxy = torch.zeros(2, dtype=torch.float32)
+            for polygon in polygons_per_instance:
+                coords = torch.from_numpy(polygon).view(-1, 2).to(dtype=torch.float32)
+                minxy = torch.min(minxy, torch.min(coords, dim=0).values)
+                maxxy = torch.max(maxxy, torch.max(coords, dim=0).values)
+            boxes[idx, :2] = minxy
+            boxes[idx, 2:] = maxxy
+        return Boxes(boxes)
+
+    def nonempty(self) -> torch.Tensor:
+        """
+        Find masks that are non-empty.
+
+        Returns:
+            Tensor:
+                a BoolTensor which represents whether each mask is empty (False) or not (True).
+        """
+        keep = [1 if len(polygon) > 0 else 0 for polygon in self.polygons]
+        return torch.from_numpy(np.asarray(keep, dtype=np.bool))
+
+    def __getitem__(self, item: Union[int, slice, List[int], torch.BoolTensor]) -> "PolygonMasks":
+        """
+        Support indexing over the instances and return a `PolygonMasks` object.
+        `item` can be:
+
+        1. An integer. It will return an object with only one instance.
+        2. A slice. It will return an object with the selected instances.
+        3. A list[int]. It will return an object with the selected instances,
+           correpsonding to the indices in the list.
+        4. A vector mask of type BoolTensor, whose length is num_instances.
+           It will return an object with the instances whose mask is nonzero.
+        """
+        if isinstance(item, int):
+            selected_polygons = [self.polygons[item]]
+        elif isinstance(item, slice):
+            selected_polygons = self.polygons[item]
+        elif isinstance(item, list):
+            selected_polygons = [self.polygons[i] for i in item]
+        elif isinstance(item, torch.Tensor):
+            # Polygons is a list, so we have to move the indices back to CPU.
+            if item.dtype == torch.bool:
+                assert item.dim() == 1, item.shape
+                item = item.nonzero().squeeze(1).cpu().numpy().tolist()
+            elif item.dtype in [torch.int32, torch.int64]:
+                item = item.cpu().numpy().tolist()
+            else:
+                raise ValueError("Unsupported tensor dtype={} for indexing!".format(item.dtype))
+            selected_polygons = [self.polygons[i] for i in item]
+        return PolygonMasks(selected_polygons)
+
+    def __iter__(self) -> Iterator[List[np.ndarray]]:
+        """
+        Yields:
+            list[ndarray]: the polygons for one instance.
+            Each Tensor is a float64 vector representing a polygon.
+        """
+        return iter(self.polygons)
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={})".format(len(self.polygons))
+        return s
+
+    def __len__(self) -> int:
+        return len(self.polygons)
+
+    def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
+        """
+        Crop each mask by the given box, and resize results to (mask_size, mask_size).
+        This can be used to prepare training targets for Mask R-CNN.
+
+        Args:
+            boxes (Tensor): Nx4 tensor storing the boxes for each mask
+            mask_size (int): the size of the rasterized mask.
+
+        Returns:
+            Tensor: A bool tensor of shape (N, mask_size, mask_size), where
+            N is the number of predicted boxes for this image.
+        """
+        assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
+
+        device = boxes.device
+        # Put boxes on the CPU, as the polygon representation is not efficient GPU-wise
+        # (several small tensors for representing a single instance mask)
+        boxes = boxes.to(torch.device("cpu"))
+
+        results = [
+            rasterize_polygons_within_box(poly, box.numpy(), mask_size)
+            for poly, box in zip(self.polygons, boxes)
+        ]
+        """
+        poly: list[list[float]], the polygons for one instance
+        box: a tensor of shape (4,)
+        """
+        if len(results) == 0:
+            return torch.empty(0, mask_size, mask_size, dtype=torch.bool, device=device)
+        return torch.stack(results, dim=0).to(device=device)
+
+    def area(self):
+        """
+        Computes area of the mask.
+        Only works with Polygons, using the shoelace formula:
+        https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
+
+        Returns:
+            Tensor: a vector, area for each instance
+        """
+
+        area = []
+        for polygons_per_instance in self.polygons:
+            area_per_instance = 0
+            for p in polygons_per_instance:
+                area_per_instance += polygon_area(p[0::2], p[1::2])
+            area.append(area_per_instance)
+
+        return torch.tensor(area)
+
+    @staticmethod
+    def cat(polymasks_list: List["PolygonMasks"]) -> "PolygonMasks":
+        """
+        Concatenates a list of PolygonMasks into a single PolygonMasks
+
+        Arguments:
+            polymasks_list (list[PolygonMasks])
+
+        Returns:
+            PolygonMasks: the concatenated PolygonMasks
+        """
+        assert isinstance(polymasks_list, (list, tuple))
+        assert len(polymasks_list) > 0
+        assert all(isinstance(polymask, PolygonMasks) for polymask in polymasks_list)
+
+        cat_polymasks = type(polymasks_list[0])(
+            list(itertools.chain.from_iterable(pm.polygons for pm in polymasks_list))
+        )
+        return cat_polymasks
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/rotated_boxes.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/rotated_boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..823cfb62a13d0ff060099d1b930bc900a4ca009b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/structures/rotated_boxes.py
@@ -0,0 +1,481 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+from typing import Iterator, Union
+import torch
+
+from detectron2.layers.rotated_boxes import pairwise_iou_rotated
+
+from .boxes import Boxes
+
+
+class RotatedBoxes(Boxes):
+    """
+    This structure stores a list of rotated boxes as a Nx5 torch.Tensor.
+    It supports some common methods about boxes
+    (`area`, `clip`, `nonempty`, etc),
+    and also behaves like a Tensor
+    (support indexing, `to(device)`, `.device`, and iteration over all boxes)
+    """
+
+    def __init__(self, tensor: torch.Tensor):
+        """
+        Args:
+            tensor (Tensor[float]): a Nx5 matrix.  Each row is
+                (x_center, y_center, width, height, angle),
+                in which angle is represented in degrees.
+                While there's no strict range restriction for it,
+                the recommended principal range is between [-180, 180) degrees.
+
+        Assume we have a horizontal box B = (x_center, y_center, width, height),
+        where width is along the x-axis and height is along the y-axis.
+        The rotated box B_rot (x_center, y_center, width, height, angle)
+        can be seen as:
+
+        1. When angle == 0:
+           B_rot == B
+        2. When angle > 0:
+           B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CCW;
+        3. When angle < 0:
+           B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CW.
+
+        Mathematically, since the right-handed coordinate system for image space
+        is (y, x), where y is top->down and x is left->right, the 4 vertices of the
+        rotated rectangle :math:`(yr_i, xr_i)` (i = 1, 2, 3, 4) can be obtained from
+        the vertices of the horizontal rectangle (y_i, x_i) (i = 1, 2, 3, 4)
+        in the following way (:math:`\\theta = angle*\\pi/180` is the angle in radians,
+        (y_c, x_c) is the center of the rectangle):
+
+        .. math::
+
+            yr_i = \\cos(\\theta) (y_i - y_c) - \\sin(\\theta) (x_i - x_c) + y_c,
+
+            xr_i = \\sin(\\theta) (y_i - y_c) + \\cos(\\theta) (x_i - x_c) + x_c,
+
+        which is the standard rigid-body rotation transformation.
+
+        Intuitively, the angle is
+        (1) the rotation angle from y-axis in image space
+        to the height vector (top->down in the box's local coordinate system)
+        of the box in CCW, and
+        (2) the rotation angle from x-axis in image space
+        to the width vector (left->right in the box's local coordinate system)
+        of the box in CCW.
+
+        More intuitively, consider the following horizontal box ABCD represented
+        in (x1, y1, x2, y2): (3, 2, 7, 4),
+        covering the [3, 7] x [2, 4] region of the continuous coordinate system
+        which looks like this:
+
+        .. code:: none
+
+            O--------> x
+            |
+            |  A---B
+            |  |   |
+            |  D---C
+            |
+            v y
+
+        Note that each capital letter represents one 0-dimensional geometric point
+        instead of a 'square pixel' here.
+
+        In the example above, using (x, y) to represent a point we have:
+
+        .. math::
+
+            O = (0, 0), A = (3, 2), B = (7, 2), C = (7, 4), D = (3, 4)
+
+        We name vector AB = vector DC as the width vector in box's local coordinate system, and
+        vector AD = vector BC as the height vector in box's local coordinate system. Initially,
+        when angle = 0 degree, they're aligned with the positive directions of x-axis and y-axis
+        in the image space, respectively.
+
+        For better illustration, we denote the center of the box as E,
+
+        .. code:: none
+
+            O--------> x
+            |
+            |  A---B
+            |  | E |
+            |  D---C
+            |
+            v y
+
+        where the center E = ((3+7)/2, (2+4)/2) = (5, 3).
+
+        Also,
+
+        .. math::
+
+            width = |AB| = |CD| = 7 - 3 = 4,
+            height = |AD| = |BC| = 4 - 2 = 2.
+
+        Therefore, the corresponding representation for the same shape in rotated box in
+        (x_center, y_center, width, height, angle) format is:
+
+        (5, 3, 4, 2, 0),
+
+        Now, let's consider (5, 3, 4, 2, 90), which is rotated by 90 degrees
+        CCW (counter-clockwise) by definition. It looks like this:
+
+        .. code:: none
+
+            O--------> x
+            |   B-C
+            |   | |
+            |   |E|
+            |   | |
+            |   A-D
+            v y
+
+        The center E is still located at the same point (5, 3), while the vertices
+        ABCD are rotated by 90 degrees CCW with regard to E:
+        A = (4, 5), B = (4, 1), C = (6, 1), D = (6, 5)
+
+        Here, 90 degrees can be seen as the CCW angle to rotate from y-axis to
+        vector AD or vector BC (the top->down height vector in box's local coordinate system),
+        or the CCW angle to rotate from x-axis to vector AB or vector DC (the left->right
+        width vector in box's local coordinate system).
+
+        .. math::
+
+            width = |AB| = |CD| = 5 - 1 = 4,
+            height = |AD| = |BC| = 6 - 4 = 2.
+
+        Next, how about (5, 3, 4, 2, -90), which is rotated by 90 degrees CW (clockwise)
+        by definition? It looks like this:
+
+        .. code:: none
+
+            O--------> x
+            |   D-A
+            |   | |
+            |   |E|
+            |   | |
+            |   C-B
+            v y
+
+        The center E is still located at the same point (5, 3), while the vertices
+        ABCD are rotated by 90 degrees CW with regard to E:
+        A = (6, 1), B = (6, 5), C = (4, 5), D = (4, 1)
+
+        .. math::
+
+            width = |AB| = |CD| = 5 - 1 = 4,
+            height = |AD| = |BC| = 6 - 4 = 2.
+
+        This covers exactly the same region as (5, 3, 4, 2, 90) does, and their IoU
+        will be 1. However, these two will generate different RoI Pooling results and
+        should not be treated as an identical box.
+
+        On the other hand, it's easy to see that (X, Y, W, H, A) is identical to
+        (X, Y, W, H, A+360N), for any integer N. For example (5, 3, 4, 2, 270) would be
+        identical to (5, 3, 4, 2, -90), because rotating the shape 270 degrees CCW is
+        equivalent to rotating the same shape 90 degrees CW.
+
+        We could rotate further to get (5, 3, 4, 2, 180), or (5, 3, 4, 2, -180):
+
+        .. code:: none
+
+            O--------> x
+            |
+            |  C---D
+            |  | E |
+            |  B---A
+            |
+            v y
+
+        .. math::
+
+            A = (7, 4), B = (3, 4), C = (3, 2), D = (7, 2),
+
+            width = |AB| = |CD| = 7 - 3 = 4,
+            height = |AD| = |BC| = 4 - 2 = 2.
+
+        Finally, this is a very inaccurate (heavily quantized) illustration of
+        how (5, 3, 4, 2, 60) looks like in case anyone wonders:
+
+        .. code:: none
+
+            O--------> x
+            |     B\
+            |    /  C
+            |   /E /
+            |  A  /
+            |   `D
+            v y
+
+        It's still a rectangle with center of (5, 3), width of 4 and height of 2,
+        but its angle (and thus orientation) is somewhere between
+        (5, 3, 4, 2, 0) and (5, 3, 4, 2, 90).
+        """
+        device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
+        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+        if tensor.numel() == 0:
+            # Use reshape, so we don't end up creating a new tensor that does not depend on
+            # the inputs (and consequently confuses jit)
+            tensor = tensor.reshape((0, 5)).to(dtype=torch.float32, device=device)
+        assert tensor.dim() == 2 and tensor.size(-1) == 5, tensor.size()
+
+        self.tensor = tensor
+
+    def clone(self) -> "RotatedBoxes":
+        """
+        Clone the RotatedBoxes.
+
+        Returns:
+            RotatedBoxes
+        """
+        return RotatedBoxes(self.tensor.clone())
+
+    def to(self, device: str) -> "RotatedBoxes":
+        return RotatedBoxes(self.tensor.to(device))
+
+    def area(self) -> torch.Tensor:
+        """
+        Computes the area of all the boxes.
+
+        Returns:
+            torch.Tensor: a vector with areas of each box.
+        """
+        box = self.tensor
+        area = box[:, 2] * box[:, 3]
+        return area
+
+    def normalize_angles(self) -> None:
+        """
+        Restrict angles to the range of [-180, 180) degrees
+        """
+        self.tensor[:, 4] = (self.tensor[:, 4] + 180.0) % 360.0 - 180.0
+
+    def clip(self, box_size: Boxes.BoxSizeType, clip_angle_threshold: float = 1.0) -> None:
+        """
+        Clip (in place) the boxes by limiting x coordinates to the range [0, width]
+        and y coordinates to the range [0, height].
+
+        For RRPN:
+        Only clip boxes that are almost horizontal with a tolerance of
+        clip_angle_threshold to maintain backward compatibility.
+
+        Rotated boxes beyond this threshold are not clipped for two reasons:
+
+        1. There are potentially multiple ways to clip a rotated box to make it
+           fit within the image.
+        2. It's tricky to make the entire rectangular box fit within the image
+           and still be able to not leave out pixels of interest.
+
+        Therefore we rely on ops like RoIAlignRotated to safely handle this.
+
+        Args:
+            box_size (height, width): The clipping box's size.
+            clip_angle_threshold:
+                Iff. abs(normalized(angle)) <= clip_angle_threshold (in degrees),
+                we do the clipping as horizontal boxes.
+        """
+        h, w = box_size
+
+        # normalize angles to be within (-180, 180] degrees
+        self.normalize_angles()
+
+        idx = torch.where(torch.abs(self.tensor[:, 4]) <= clip_angle_threshold)[0]
+
+        # convert to (x1, y1, x2, y2)
+        x1 = self.tensor[idx, 0] - self.tensor[idx, 2] / 2.0
+        y1 = self.tensor[idx, 1] - self.tensor[idx, 3] / 2.0
+        x2 = self.tensor[idx, 0] + self.tensor[idx, 2] / 2.0
+        y2 = self.tensor[idx, 1] + self.tensor[idx, 3] / 2.0
+
+        # clip
+        x1.clamp_(min=0, max=w)
+        y1.clamp_(min=0, max=h)
+        x2.clamp_(min=0, max=w)
+        y2.clamp_(min=0, max=h)
+
+        # convert back to (xc, yc, w, h)
+        self.tensor[idx, 0] = (x1 + x2) / 2.0
+        self.tensor[idx, 1] = (y1 + y2) / 2.0
+        # make sure widths and heights do not increase due to numerical errors
+        self.tensor[idx, 2] = torch.min(self.tensor[idx, 2], x2 - x1)
+        self.tensor[idx, 3] = torch.min(self.tensor[idx, 3], y2 - y1)
+
+    def nonempty(self, threshold: float = 0.0) -> torch.Tensor:
+        """
+        Find boxes that are non-empty.
+        A box is considered empty, if either of its side is no larger than threshold.
+
+        Returns:
+            Tensor: a binary vector which represents
+            whether each box is empty (False) or non-empty (True).
+        """
+        box = self.tensor
+        widths = box[:, 2]
+        heights = box[:, 3]
+        keep = (widths > threshold) & (heights > threshold)
+        return keep
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "RotatedBoxes":
+        """
+        Returns:
+            RotatedBoxes: Create a new :class:`RotatedBoxes` by indexing.
+
+        The following usage are allowed:
+
+        1. `new_boxes = boxes[3]`: return a `RotatedBoxes` which contains only one box.
+        2. `new_boxes = boxes[2:10]`: return a slice of boxes.
+        3. `new_boxes = boxes[vector]`, where vector is a torch.ByteTensor
+           with `length = len(boxes)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned RotatedBoxes might share storage with this RotatedBoxes,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return RotatedBoxes(self.tensor[item].view(1, -1))
+        b = self.tensor[item]
+        assert b.dim() == 2, "Indexing on RotatedBoxes with {} failed to return a matrix!".format(
+            item
+        )
+        return RotatedBoxes(b)
+
+    def __len__(self) -> int:
+        return self.tensor.shape[0]
+
+    def __repr__(self) -> str:
+        return "RotatedBoxes(" + str(self.tensor) + ")"
+
+    def inside_box(self, box_size: Boxes.BoxSizeType, boundary_threshold: int = 0) -> torch.Tensor:
+        """
+        Args:
+            box_size (height, width): Size of the reference box covering
+                [0, width] x [0, height]
+            boundary_threshold (int): Boxes that extend beyond the reference box
+                boundary by more than boundary_threshold are considered "outside".
+
+        For RRPN, it might not be necessary to call this function since it's common
+        for rotated box to extend to outside of the image boundaries
+        (the clip function only clips the near-horizontal boxes)
+
+        Returns:
+            a binary vector, indicating whether each box is inside the reference box.
+        """
+        height, width = box_size
+
+        cnt_x = self.tensor[..., 0]
+        cnt_y = self.tensor[..., 1]
+        half_w = self.tensor[..., 2] / 2.0
+        half_h = self.tensor[..., 3] / 2.0
+        a = self.tensor[..., 4]
+        c = torch.abs(torch.cos(a * math.pi / 180.0))
+        s = torch.abs(torch.sin(a * math.pi / 180.0))
+        # This basically computes the horizontal bounding rectangle of the rotated box
+        max_rect_dx = c * half_w + s * half_h
+        max_rect_dy = c * half_h + s * half_w
+
+        inds_inside = (
+            (cnt_x - max_rect_dx >= -boundary_threshold)
+            & (cnt_y - max_rect_dy >= -boundary_threshold)
+            & (cnt_x + max_rect_dx < width + boundary_threshold)
+            & (cnt_y + max_rect_dy < height + boundary_threshold)
+        )
+
+        return inds_inside
+
+    def get_centers(self) -> torch.Tensor:
+        """
+        Returns:
+            The box centers in a Nx2 array of (x, y).
+        """
+        return self.tensor[:, :2]
+
+    def scale(self, scale_x: float, scale_y: float) -> None:
+        """
+        Scale the rotated box with horizontal and vertical scaling factors
+        Note: when scale_factor_x != scale_factor_y,
+        the rotated box does not preserve the rectangular shape when the angle
+        is not a multiple of 90 degrees under resize transformation.
+        Instead, the shape is a parallelogram (that has skew)
+        Here we make an approximation by fitting a rotated rectangle to the parallelogram.
+        """
+        self.tensor[:, 0] *= scale_x
+        self.tensor[:, 1] *= scale_y
+        theta = self.tensor[:, 4] * math.pi / 180.0
+        c = torch.cos(theta)
+        s = torch.sin(theta)
+
+        # In image space, y is top->down and x is left->right
+        # Consider the local coordintate system for the rotated box,
+        # where the box center is located at (0, 0), and the four vertices ABCD are
+        # A(-w / 2, -h / 2), B(w / 2, -h / 2), C(w / 2, h / 2), D(-w / 2, h / 2)
+        # the midpoint of the left edge AD of the rotated box E is:
+        # E = (A+D)/2 = (-w / 2, 0)
+        # the midpoint of the top edge AB of the rotated box F is:
+        # F(0, -h / 2)
+        # To get the old coordinates in the global system, apply the rotation transformation
+        # (Note: the right-handed coordinate system for image space is yOx):
+        # (old_x, old_y) = (s * y + c * x, c * y - s * x)
+        # E(old) = (s * 0 + c * (-w/2), c * 0 - s * (-w/2)) = (-c * w / 2, s * w / 2)
+        # F(old) = (s * (-h / 2) + c * 0, c * (-h / 2) - s * 0) = (-s * h / 2, -c * h / 2)
+        # After applying the scaling factor (sfx, sfy):
+        # E(new) = (-sfx * c * w / 2, sfy * s * w / 2)
+        # F(new) = (-sfx * s * h / 2, -sfy * c * h / 2)
+        # The new width after scaling tranformation becomes:
+
+        # w(new) = |E(new) - O| * 2
+        #        = sqrt[(sfx * c * w / 2)^2 + (sfy * s * w / 2)^2] * 2
+        #        = sqrt[(sfx * c)^2 + (sfy * s)^2] * w
+        # i.e., scale_factor_w = sqrt[(sfx * c)^2 + (sfy * s)^2]
+        #
+        # For example,
+        # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_w == scale_factor_x;
+        # when |angle| = 90, c = 0, |s| = 1, scale_factor_w == scale_factor_y
+        self.tensor[:, 2] *= torch.sqrt((scale_x * c) ** 2 + (scale_y * s) ** 2)
+
+        # h(new) = |F(new) - O| * 2
+        #        = sqrt[(sfx * s * h / 2)^2 + (sfy * c * h / 2)^2] * 2
+        #        = sqrt[(sfx * s)^2 + (sfy * c)^2] * h
+        # i.e., scale_factor_h = sqrt[(sfx * s)^2 + (sfy * c)^2]
+        #
+        # For example,
+        # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_h == scale_factor_y;
+        # when |angle| = 90, c = 0, |s| = 1, scale_factor_h == scale_factor_x
+        self.tensor[:, 3] *= torch.sqrt((scale_x * s) ** 2 + (scale_y * c) ** 2)
+
+        # The angle is the rotation angle from y-axis in image space to the height
+        # vector (top->down in the box's local coordinate system) of the box in CCW.
+        #
+        # angle(new) = angle_yOx(O - F(new))
+        #            = angle_yOx( (sfx * s * h / 2, sfy * c * h / 2) )
+        #            = atan2(sfx * s * h / 2, sfy * c * h / 2)
+        #            = atan2(sfx * s, sfy * c)
+        #
+        # For example,
+        # when sfx == sfy, angle(new) == atan2(s, c) == angle(old)
+        self.tensor[:, 4] = torch.atan2(scale_x * s, scale_y * c) * 180 / math.pi
+
+    @property
+    def device(self) -> str:
+        return self.tensor.device
+
+    def __iter__(self) -> Iterator[torch.Tensor]:
+        """
+        Yield a box as a Tensor of shape (5,) at a time.
+        """
+        yield from self.tensor
+
+
+def pairwise_iou(boxes1: RotatedBoxes, boxes2: RotatedBoxes) -> None:
+    """
+    Given two lists of rotated boxes of size N and M,
+    compute the IoU (intersection over union)
+    between __all__ N x M pairs of boxes.
+    The box order must be (x_center, y_center, width, height, angle).
+
+    Args:
+        boxes1, boxes2 (RotatedBoxes):
+            two `RotatedBoxes`. Contains N & M rotated boxes, respectively.
+
+    Returns:
+        Tensor: IoU, sized [N,M].
+    """
+
+    return pairwise_iou_rotated(boxes1.tensor, boxes2.tensor)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/README.md b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9765b24a730b77556104187ac3ef5439ab0859fd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/README.md
@@ -0,0 +1,5 @@
+# Utility functions
+
+This folder contain utility functions that are not used in the
+core library, but are useful for building models or training
+code using the config system.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..168f9979a4623806934b0ff1102ac166704e7dec
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/analysis.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..c48e376c242f57f480280538ae770520d14110f8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/analysis.py
@@ -0,0 +1,164 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# -*- coding: utf-8 -*-
+
+import logging
+import typing
+import torch
+from fvcore.nn import activation_count, flop_count, parameter_count, parameter_count_table
+from torch import nn
+
+from detectron2.structures import BitMasks, Boxes, ImageList, Instances
+
+from .logger import log_first_n
+
+__all__ = [
+    "activation_count_operators",
+    "flop_count_operators",
+    "parameter_count_table",
+    "parameter_count",
+]
+
+FLOPS_MODE = "flops"
+ACTIVATIONS_MODE = "activations"
+
+
+# some extra ops to ignore from counting.
+_IGNORED_OPS = [
+    "aten::add",
+    "aten::add_",
+    "aten::batch_norm",
+    "aten::constant_pad_nd",
+    "aten::div",
+    "aten::div_",
+    "aten::exp",
+    "aten::log2",
+    "aten::max_pool2d",
+    "aten::meshgrid",
+    "aten::mul",
+    "aten::mul_",
+    "aten::nonzero_numpy",
+    "aten::relu",
+    "aten::relu_",
+    "aten::rsub",
+    "aten::sigmoid",
+    "aten::sigmoid_",
+    "aten::softmax",
+    "aten::sort",
+    "aten::sqrt",
+    "aten::sub",
+    "aten::upsample_nearest2d",
+    "prim::PythonOp",
+    "torchvision::nms",
+]
+
+
+def flop_count_operators(
+    model: nn.Module, inputs: list, **kwargs
+) -> typing.DefaultDict[str, float]:
+    """
+    Implement operator-level flops counting using jit.
+    This is a wrapper of fvcore.nn.flop_count, that supports standard detection models
+    in detectron2.
+
+    Note:
+        The function runs the input through the model to compute flops.
+        The flops of a detection model is often input-dependent, for example,
+        the flops of box & mask head depends on the number of proposals &
+        the number of detected objects.
+        Therefore, the flops counting using a single input may not accurately
+        reflect the computation cost of a model.
+
+    Args:
+        model: a detectron2 model that takes `list[dict]` as input.
+        inputs (list[dict]): inputs to model, in detectron2's standard format.
+    """
+    return _wrapper_count_operators(model=model, inputs=inputs, mode=FLOPS_MODE, **kwargs)
+
+
+def activation_count_operators(
+    model: nn.Module, inputs: list, **kwargs
+) -> typing.DefaultDict[str, float]:
+    """
+    Implement operator-level activations counting using jit.
+    This is a wrapper of fvcore.nn.activation_count, that supports standard detection models
+    in detectron2.
+
+    Note:
+        The function runs the input through the model to compute activations.
+        The activations of a detection model is often input-dependent, for example,
+        the activations of box & mask head depends on the number of proposals &
+        the number of detected objects.
+
+    Args:
+        model: a detectron2 model that takes `list[dict]` as input.
+        inputs (list[dict]): inputs to model, in detectron2's standard format.
+    """
+    return _wrapper_count_operators(model=model, inputs=inputs, mode=ACTIVATIONS_MODE, **kwargs)
+
+
+def _flatten_to_tuple(outputs):
+    result = []
+    if isinstance(outputs, torch.Tensor):
+        result.append(outputs)
+    elif isinstance(outputs, (list, tuple)):
+        for v in outputs:
+            result.extend(_flatten_to_tuple(v))
+    elif isinstance(outputs, dict):
+        for _, v in outputs.items():
+            result.extend(_flatten_to_tuple(v))
+    elif isinstance(outputs, Instances):
+        result.extend(_flatten_to_tuple(outputs.get_fields()))
+    elif isinstance(outputs, (Boxes, BitMasks, ImageList)):
+        result.append(outputs.tensor)
+    else:
+        log_first_n(
+            logging.WARN,
+            f"Output of type {type(outputs)} not included in flops/activations count.",
+            n=10,
+        )
+    return tuple(result)
+
+
+def _wrapper_count_operators(
+    model: nn.Module, inputs: list, mode: str, **kwargs
+) -> typing.DefaultDict[str, float]:
+
+    # ignore some ops
+    supported_ops = {k: lambda *args, **kwargs: {} for k in _IGNORED_OPS}
+    supported_ops.update(kwargs.pop("supported_ops", {}))
+    kwargs["supported_ops"] = supported_ops
+
+    assert len(inputs) == 1, "Please use batch size=1"
+    tensor_input = inputs[0]["image"]
+
+    class WrapModel(nn.Module):
+        def __init__(self, model):
+            super().__init__()
+            if isinstance(
+                model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)
+            ):
+                self.model = model.module
+            else:
+                self.model = model
+
+        def forward(self, image):
+            # jit requires the input/output to be Tensors
+            inputs = [{"image": image}]
+            outputs = self.model.forward(inputs)
+            # Only the subgraph that computes the returned tuple of tensor will be
+            # counted. So we flatten everything we found to tuple of tensors.
+            return _flatten_to_tuple(outputs)
+
+    old_train = model.training
+    with torch.no_grad():
+        if mode == FLOPS_MODE:
+            ret = flop_count(WrapModel(model).train(False), (tensor_input,), **kwargs)
+        elif mode == ACTIVATIONS_MODE:
+            ret = activation_count(WrapModel(model).train(False), (tensor_input,), **kwargs)
+        else:
+            raise NotImplementedError("Count for mode {} is not supported yet.".format(mode))
+    # compatible with change in fvcore
+    if isinstance(ret, tuple):
+        ret = ret[0]
+    model.train(old_train)
+    return ret
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/collect_env.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/collect_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..c25b99cb0ab626cc4f4dabca5eb81f710011f2e3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/collect_env.py
@@ -0,0 +1,160 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import importlib
+import numpy as np
+import os
+import re
+import subprocess
+import sys
+from collections import defaultdict
+import PIL
+import torch
+import torchvision
+from tabulate import tabulate
+
+__all__ = ["collect_env_info"]
+
+
+def collect_torch_env():
+    try:
+        import torch.__config__
+
+        return torch.__config__.show()
+    except ImportError:
+        # compatible with older versions of pytorch
+        from torch.utils.collect_env import get_pretty_env_info
+
+        return get_pretty_env_info()
+
+
+def get_env_module():
+    var_name = "DETECTRON2_ENV_MODULE"
+    return var_name, os.environ.get(var_name, "<not set>")
+
+
+def detect_compute_compatibility(CUDA_HOME, so_file):
+    try:
+        cuobjdump = os.path.join(CUDA_HOME, "bin", "cuobjdump")
+        if os.path.isfile(cuobjdump):
+            output = subprocess.check_output(
+                "'{}' --list-elf '{}'".format(cuobjdump, so_file), shell=True
+            )
+            output = output.decode("utf-8").strip().split("\n")
+            sm = []
+            for line in output:
+                line = re.findall(r"\.sm_[0-9]*\.", line)[0]
+                sm.append(line.strip("."))
+            sm = sorted(set(sm))
+            return ", ".join(sm)
+        else:
+            return so_file + "; cannot find cuobjdump"
+    except Exception:
+        # unhandled failure
+        return so_file
+
+
+def collect_env_info():
+    has_cuda = torch.cuda.is_available()
+    # NOTE: the use of CUDA_HOME requires the CUDA build deps, though in
+    # theory detectron2 should be made runnable with only the CUDA runtime
+    from torch.utils.cpp_extension import CUDA_HOME
+
+    data = []
+    data.append(("sys.platform", sys.platform))
+    data.append(("Python", sys.version.replace("\n", "")))
+    data.append(("numpy", np.__version__))
+
+    try:
+        import detectron2  # noqa
+
+        data.append(
+            ("detectron2", detectron2.__version__ + " @" + os.path.dirname(detectron2.__file__))
+        )
+    except ImportError:
+        data.append(("detectron2", "failed to import"))
+    else:
+        try:
+            from detectron2 import _C
+        except ImportError:
+            data.append(("detectron2._C", "failed to import"))
+        else:
+            data.append(("detectron2 compiler", _C.get_compiler_version()))
+            data.append(("detectron2 CUDA compiler", _C.get_cuda_version()))
+            if has_cuda:
+                data.append(
+                    ("detectron2 arch flags", detect_compute_compatibility(CUDA_HOME, _C.__file__))
+                )
+
+    data.append(get_env_module())
+    data.append(("PyTorch", torch.__version__ + " @" + os.path.dirname(torch.__file__)))
+    data.append(("PyTorch debug build", torch.version.debug))
+
+    data.append(("CUDA available", has_cuda))
+    if has_cuda:
+        devices = defaultdict(list)
+        for k in range(torch.cuda.device_count()):
+            devices[torch.cuda.get_device_name(k)].append(str(k))
+        for name, devids in devices.items():
+            data.append(("GPU " + ",".join(devids), name))
+
+        from torch.utils.cpp_extension import CUDA_HOME
+
+        data.append(("CUDA_HOME", str(CUDA_HOME)))
+
+        if CUDA_HOME is not None and os.path.isdir(CUDA_HOME):
+            try:
+                nvcc = os.path.join(CUDA_HOME, "bin", "nvcc")
+                nvcc = subprocess.check_output("'{}' -V | tail -n1".format(nvcc), shell=True)
+                nvcc = nvcc.decode("utf-8").strip()
+            except subprocess.SubprocessError:
+                nvcc = "Not Available"
+            data.append(("NVCC", nvcc))
+
+        cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
+        if cuda_arch_list:
+            data.append(("TORCH_CUDA_ARCH_LIST", cuda_arch_list))
+    data.append(("Pillow", PIL.__version__))
+
+    try:
+        data.append(
+            (
+                "torchvision",
+                str(torchvision.__version__) + " @" + os.path.dirname(torchvision.__file__),
+            )
+        )
+        if has_cuda:
+            try:
+                torchvision_C = importlib.util.find_spec("torchvision._C").origin
+                msg = detect_compute_compatibility(CUDA_HOME, torchvision_C)
+                data.append(("torchvision arch flags", msg))
+            except ImportError:
+                data.append(("torchvision._C", "failed to find"))
+    except AttributeError:
+        data.append(("torchvision", "unknown"))
+
+    try:
+        import fvcore
+
+        data.append(("fvcore", fvcore.__version__))
+    except ImportError:
+        pass
+
+    try:
+        import cv2
+
+        data.append(("cv2", cv2.__version__))
+    except ImportError:
+        pass
+    env_str = tabulate(data) + "\n"
+    env_str += collect_torch_env()
+    return env_str
+
+
+if __name__ == "__main__":
+    try:
+        import detectron2  # noqa
+    except ImportError:
+        print(collect_env_info())
+    else:
+        from detectron2.utils.collect_env import collect_env_info
+
+        print(collect_env_info())
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/colormap.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/colormap.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf1455e4ce9e077961143c8d734a7298d28476d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/colormap.py
@@ -0,0 +1,140 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+"""
+An awesome colormap for really neat visualizations.
+Copied from Detectron, and removed gray colors.
+"""
+
+import numpy as np
+
+__all__ = ["colormap", "random_color"]
+
+# fmt: off
+# RGB:
+_COLORS = np.array(
+    [
+        0.000, 0.447, 0.741,
+        0.850, 0.325, 0.098,
+        0.929, 0.694, 0.125,
+        0.494, 0.184, 0.556,
+        0.466, 0.674, 0.188,
+        0.301, 0.745, 0.933,
+        0.635, 0.078, 0.184,
+        0.300, 0.300, 0.300,
+        0.600, 0.600, 0.600,
+        1.000, 0.000, 0.000,
+        1.000, 0.500, 0.000,
+        0.749, 0.749, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.333, 0.333, 0.000,
+        0.333, 0.667, 0.000,
+        0.333, 1.000, 0.000,
+        0.667, 0.333, 0.000,
+        0.667, 0.667, 0.000,
+        0.667, 1.000, 0.000,
+        1.000, 0.333, 0.000,
+        1.000, 0.667, 0.000,
+        1.000, 1.000, 0.000,
+        0.000, 0.333, 0.500,
+        0.000, 0.667, 0.500,
+        0.000, 1.000, 0.500,
+        0.333, 0.000, 0.500,
+        0.333, 0.333, 0.500,
+        0.333, 0.667, 0.500,
+        0.333, 1.000, 0.500,
+        0.667, 0.000, 0.500,
+        0.667, 0.333, 0.500,
+        0.667, 0.667, 0.500,
+        0.667, 1.000, 0.500,
+        1.000, 0.000, 0.500,
+        1.000, 0.333, 0.500,
+        1.000, 0.667, 0.500,
+        1.000, 1.000, 0.500,
+        0.000, 0.333, 1.000,
+        0.000, 0.667, 1.000,
+        0.000, 1.000, 1.000,
+        0.333, 0.000, 1.000,
+        0.333, 0.333, 1.000,
+        0.333, 0.667, 1.000,
+        0.333, 1.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.667, 0.333, 1.000,
+        0.667, 0.667, 1.000,
+        0.667, 1.000, 1.000,
+        1.000, 0.000, 1.000,
+        1.000, 0.333, 1.000,
+        1.000, 0.667, 1.000,
+        0.333, 0.000, 0.000,
+        0.500, 0.000, 0.000,
+        0.667, 0.000, 0.000,
+        0.833, 0.000, 0.000,
+        1.000, 0.000, 0.000,
+        0.000, 0.167, 0.000,
+        0.000, 0.333, 0.000,
+        0.000, 0.500, 0.000,
+        0.000, 0.667, 0.000,
+        0.000, 0.833, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 0.167,
+        0.000, 0.000, 0.333,
+        0.000, 0.000, 0.500,
+        0.000, 0.000, 0.667,
+        0.000, 0.000, 0.833,
+        0.000, 0.000, 1.000,
+        0.000, 0.000, 0.000,
+        0.143, 0.143, 0.143,
+        0.857, 0.857, 0.857,
+        1.000, 1.000, 1.000
+    ]
+).astype(np.float32).reshape(-1, 3)
+# fmt: on
+
+
+def colormap(rgb=False, maximum=255):
+    """
+    Args:
+        rgb (bool): whether to return RGB colors or BGR colors.
+        maximum (int): either 255 or 1
+
+    Returns:
+        ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1]
+    """
+    assert maximum in [255, 1], maximum
+    c = _COLORS * maximum
+    if not rgb:
+        c = c[:, ::-1]
+    return c
+
+
+def random_color(rgb=False, maximum=255):
+    """
+    Args:
+        rgb (bool): whether to return RGB colors or BGR colors.
+        maximum (int): either 255 or 1
+
+    Returns:
+        ndarray: a vector of 3 numbers
+    """
+    idx = np.random.randint(0, len(_COLORS))
+    ret = _COLORS[idx] * maximum
+    if not rgb:
+        ret = ret[::-1]
+    return ret
+
+
+if __name__ == "__main__":
+    import cv2
+
+    size = 100
+    H, W = 10, 10
+    canvas = np.random.rand(H * size, W * size, 3).astype("float32")
+    for h in range(H):
+        for w in range(W):
+            idx = h * W + w
+            if idx >= len(_COLORS):
+                break
+            canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx]
+    cv2.imshow("a", canvas)
+    cv2.waitKey(0)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/comm.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/comm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cc7b3dac5a45db87fa91ac86fce50805ecf1bad
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/comm.py
@@ -0,0 +1,263 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+This file contains primitives for multi-gpu communication.
+This is useful when doing distributed training.
+"""
+
+import functools
+import logging
+import numpy as np
+import pickle
+import torch
+import torch.distributed as dist
+
+_LOCAL_PROCESS_GROUP = None
+"""
+A torch process group which only includes processes that on the same machine as the current process.
+This variable is set when processes are spawned by `launch()` in "engine/launch.py".
+"""
+
+
+def get_world_size() -> int:
+    if not dist.is_available():
+        return 1
+    if not dist.is_initialized():
+        return 1
+    return dist.get_world_size()
+
+
+def get_rank() -> int:
+    if not dist.is_available():
+        return 0
+    if not dist.is_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def get_local_rank() -> int:
+    """
+    Returns:
+        The rank of the current process within the local (per-machine) process group.
+    """
+    if not dist.is_available():
+        return 0
+    if not dist.is_initialized():
+        return 0
+    assert _LOCAL_PROCESS_GROUP is not None
+    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
+
+
+def get_local_size() -> int:
+    """
+    Returns:
+        The size of the per-machine process group,
+        i.e. the number of processes per machine.
+    """
+    if not dist.is_available():
+        return 1
+    if not dist.is_initialized():
+        return 1
+    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
+
+
+def is_main_process() -> bool:
+    return get_rank() == 0
+
+
+def synchronize():
+    """
+    Helper function to synchronize (barrier) among all processes when
+    using distributed training
+    """
+    if not dist.is_available():
+        return
+    if not dist.is_initialized():
+        return
+    world_size = dist.get_world_size()
+    if world_size == 1:
+        return
+    dist.barrier()
+
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+    """
+    Return a process group based on gloo backend, containing all the ranks
+    The result is cached.
+    """
+    if dist.get_backend() == "nccl":
+        return dist.new_group(backend="gloo")
+    else:
+        return dist.group.WORLD
+
+
+def _serialize_to_tensor(data, group):
+    backend = dist.get_backend(group)
+    assert backend in ["gloo", "nccl"]
+    device = torch.device("cpu" if backend == "gloo" else "cuda")
+
+    buffer = pickle.dumps(data)
+    if len(buffer) > 1024 ** 3:
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
+                get_rank(), len(buffer) / (1024 ** 3), device
+            )
+        )
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to(device=device)
+    return tensor
+
+
+def _pad_to_largest_tensor(tensor, group):
+    """
+    Returns:
+        list[int]: size of the tensor, on each rank
+        Tensor: padded tensor that has the max size
+    """
+    world_size = dist.get_world_size(group=group)
+    assert (
+        world_size >= 1
+    ), "comm.gather/all_gather must be called from ranks within the given group!"
+    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
+    size_list = [
+        torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size)
+    ]
+    dist.all_gather(size_list, local_size, group=group)
+    size_list = [int(size.item()) for size in size_list]
+
+    max_size = max(size_list)
+
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    if local_size != max_size:
+        padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device)
+        tensor = torch.cat((tensor, padding), dim=0)
+    return size_list, tensor
+
+
+def all_gather(data, group=None):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors).
+
+    Args:
+        data: any picklable object
+        group: a torch process group. By default, will use a group which
+            contains all ranks on gloo backend.
+
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+    if get_world_size() == 1:
+        return [data]
+    if group is None:
+        group = _get_global_gloo_group()
+    if dist.get_world_size(group) == 1:
+        return [data]
+
+    tensor = _serialize_to_tensor(data, group)
+
+    size_list, tensor = _pad_to_largest_tensor(tensor, group)
+    max_size = max(size_list)
+
+    # receiving Tensor from all ranks
+    tensor_list = [
+        torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list
+    ]
+    dist.all_gather(tensor_list, tensor, group=group)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        buffer = tensor.cpu().numpy().tobytes()[:size]
+        data_list.append(pickle.loads(buffer))
+
+    return data_list
+
+
+def gather(data, dst=0, group=None):
+    """
+    Run gather on arbitrary picklable data (not necessarily tensors).
+
+    Args:
+        data: any picklable object
+        dst (int): destination rank
+        group: a torch process group. By default, will use a group which
+            contains all ranks on gloo backend.
+
+    Returns:
+        list[data]: on dst, a list of data gathered from each rank. Otherwise,
+            an empty list.
+    """
+    if get_world_size() == 1:
+        return [data]
+    if group is None:
+        group = _get_global_gloo_group()
+    if dist.get_world_size(group=group) == 1:
+        return [data]
+    rank = dist.get_rank(group=group)
+
+    tensor = _serialize_to_tensor(data, group)
+    size_list, tensor = _pad_to_largest_tensor(tensor, group)
+
+    # receiving Tensor from all ranks
+    if rank == dst:
+        max_size = max(size_list)
+        tensor_list = [
+            torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list
+        ]
+        dist.gather(tensor, tensor_list, dst=dst, group=group)
+
+        data_list = []
+        for size, tensor in zip(size_list, tensor_list):
+            buffer = tensor.cpu().numpy().tobytes()[:size]
+            data_list.append(pickle.loads(buffer))
+        return data_list
+    else:
+        dist.gather(tensor, [], dst=dst, group=group)
+        return []
+
+
+def shared_random_seed():
+    """
+    Returns:
+        int: a random number that is the same across all workers.
+            If workers need a shared RNG, they can use this shared seed to
+            create one.
+
+    All workers must call this function, otherwise it will deadlock.
+    """
+    ints = np.random.randint(2 ** 31)
+    all_ints = all_gather(ints)
+    return all_ints[0]
+
+
+def reduce_dict(input_dict, average=True):
+    """
+    Reduce the values in the dictionary from all processes so that process with rank
+    0 has the reduced results.
+
+    Args:
+        input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor.
+        average (bool): whether to do average or sum
+
+    Returns:
+        a dict with the same keys as input_dict, after reduction.
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return input_dict
+    with torch.no_grad():
+        names = []
+        values = []
+        # sort the keys so that they are consistent across processes
+        for k in sorted(input_dict.keys()):
+            names.append(k)
+            values.append(input_dict[k])
+        values = torch.stack(values, dim=0)
+        dist.reduce(values, dst=0)
+        if dist.get_rank() == 0 and average:
+            # only main process gets accumulated, so only divide by
+            # world_size in this case
+            values /= world_size
+        reduced_dict = {k: v for k, v in zip(names, values)}
+    return reduced_dict
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/env.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..6769cae4cfb71ae05c605cb9e30eb12ee58c6ee7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/env.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import importlib
+import importlib.util
+import logging
+import numpy as np
+import os
+import random
+import sys
+from datetime import datetime
+import torch
+
+__all__ = ["seed_all_rng"]
+
+
+def seed_all_rng(seed=None):
+    """
+    Set the random seed for the RNG in torch, numpy and python.
+
+    Args:
+        seed (int): if None, will use a strong random seed.
+    """
+    if seed is None:
+        seed = (
+            os.getpid()
+            + int(datetime.now().strftime("%S%f"))
+            + int.from_bytes(os.urandom(2), "big")
+        )
+        logger = logging.getLogger(__name__)
+        logger.info("Using a generated random seed {}".format(seed))
+    np.random.seed(seed)
+    torch.set_rng_state(torch.manual_seed(seed).get_state())
+    random.seed(seed)
+
+
+# from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
+def _import_file(module_name, file_path, make_importable=False):
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    if make_importable:
+        sys.modules[module_name] = module
+    return module
+
+
+def _configure_libraries():
+    """
+    Configurations for some libraries.
+    """
+    # An environment option to disable `import cv2` globally,
+    # in case it leads to negative performance impact
+    disable_cv2 = int(os.environ.get("DETECTRON2_DISABLE_CV2", False))
+    if disable_cv2:
+        sys.modules["cv2"] = None
+    else:
+        # Disable opencl in opencv since its interaction with cuda often has negative effects
+        # This envvar is supported after OpenCV 3.4.0
+        os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
+        try:
+            import cv2
+
+            if int(cv2.__version__.split(".")[0]) >= 3:
+                cv2.ocl.setUseOpenCL(False)
+        except ImportError:
+            pass
+
+    def get_version(module, digit=2):
+        return tuple(map(int, module.__version__.split(".")[:digit]))
+
+    # fmt: off
+    assert get_version(torch) >= (1, 4), "Requires torch>=1.4"
+    import fvcore
+    assert get_version(fvcore, 3) >= (0, 1, 1), "Requires fvcore>=0.1.1"
+    import yaml
+    assert get_version(yaml) >= (5, 1), "Requires pyyaml>=5.1"
+    # fmt: on
+
+
+_ENV_SETUP_DONE = False
+
+
+def setup_environment():
+    """Perform environment setup work. The default setup is a no-op, but this
+    function allows the user to specify a Python source file or a module in
+    the $DETECTRON2_ENV_MODULE environment variable, that performs
+    custom setup work that may be necessary to their computing environment.
+    """
+    global _ENV_SETUP_DONE
+    if _ENV_SETUP_DONE:
+        return
+    _ENV_SETUP_DONE = True
+
+    _configure_libraries()
+
+    custom_module_path = os.environ.get("DETECTRON2_ENV_MODULE")
+
+    if custom_module_path:
+        setup_custom_environment(custom_module_path)
+    else:
+        # The default setup is a no-op
+        pass
+
+
+def setup_custom_environment(custom_module):
+    """
+    Load custom environment setup by importing a Python source file or a
+    module, and run the setup function.
+    """
+    if custom_module.endswith(".py"):
+        module = _import_file("detectron2.utils.env.custom_module", custom_module)
+    else:
+        module = importlib.import_module(custom_module)
+    assert hasattr(module, "setup_environment") and callable(module.setup_environment), (
+        "Custom environment module defined in {} does not have the "
+        "required callable attribute 'setup_environment'."
+    ).format(custom_module)
+    module.setup_environment()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/events.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/events.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3c57edb05016d2df041d756f59e90dfabddd718
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/events.py
@@ -0,0 +1,432 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import datetime
+import json
+import logging
+import os
+import time
+from collections import defaultdict
+from contextlib import contextmanager
+import torch
+from fvcore.common.file_io import PathManager
+from fvcore.common.history_buffer import HistoryBuffer
+
+_CURRENT_STORAGE_STACK = []
+
+
+def get_event_storage():
+    """
+    Returns:
+        The :class:`EventStorage` object that's currently being used.
+        Throws an error if no :class:`EventStorage` is currently enabled.
+    """
+    assert len(
+        _CURRENT_STORAGE_STACK
+    ), "get_event_storage() has to be called inside a 'with EventStorage(...)' context!"
+    return _CURRENT_STORAGE_STACK[-1]
+
+
+class EventWriter:
+    """
+    Base class for writers that obtain events from :class:`EventStorage` and process them.
+    """
+
+    def write(self):
+        raise NotImplementedError
+
+    def close(self):
+        pass
+
+
+class JSONWriter(EventWriter):
+    """
+    Write scalars to a json file.
+
+    It saves scalars as one json per line (instead of a big json) for easy parsing.
+
+    Examples parsing such a json file:
+
+    .. code-block:: none
+
+        $ cat metrics.json | jq -s '.[0:2]'
+        [
+          {
+            "data_time": 0.008433341979980469,
+            "iteration": 20,
+            "loss": 1.9228371381759644,
+            "loss_box_reg": 0.050025828182697296,
+            "loss_classifier": 0.5316952466964722,
+            "loss_mask": 0.7236229181289673,
+            "loss_rpn_box": 0.0856662318110466,
+            "loss_rpn_cls": 0.48198649287223816,
+            "lr": 0.007173333333333333,
+            "time": 0.25401854515075684
+          },
+          {
+            "data_time": 0.007216215133666992,
+            "iteration": 40,
+            "loss": 1.282649278640747,
+            "loss_box_reg": 0.06222952902317047,
+            "loss_classifier": 0.30682939291000366,
+            "loss_mask": 0.6970193982124329,
+            "loss_rpn_box": 0.038663312792778015,
+            "loss_rpn_cls": 0.1471673548221588,
+            "lr": 0.007706666666666667,
+            "time": 0.2490077018737793
+          }
+        ]
+
+        $ cat metrics.json | jq '.loss_mask'
+        0.7126231789588928
+        0.689423680305481
+        0.6776131987571716
+        ...
+
+    """
+
+    def __init__(self, json_file, window_size=20):
+        """
+        Args:
+            json_file (str): path to the json file. New data will be appended if the file exists.
+            window_size (int): the window size of median smoothing for the scalars whose
+                `smoothing_hint` are True.
+        """
+        self._file_handle = PathManager.open(json_file, "a")
+        self._window_size = window_size
+
+    def write(self):
+        storage = get_event_storage()
+        to_save = {"iteration": storage.iter}
+        to_save.update(storage.latest_with_smoothing_hint(self._window_size))
+        self._file_handle.write(json.dumps(to_save, sort_keys=True) + "\n")
+        self._file_handle.flush()
+        try:
+            os.fsync(self._file_handle.fileno())
+        except AttributeError:
+            pass
+
+    def close(self):
+        self._file_handle.close()
+
+
+class TensorboardXWriter(EventWriter):
+    """
+    Write all scalars to a tensorboard file.
+    """
+
+    def __init__(self, log_dir: str, window_size: int = 20, **kwargs):
+        """
+        Args:
+            log_dir (str): the directory to save the output events
+            window_size (int): the scalars will be median-smoothed by this window size
+
+            kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)`
+        """
+        self._window_size = window_size
+        from torch.utils.tensorboard import SummaryWriter
+
+        self._writer = SummaryWriter(log_dir, **kwargs)
+
+    def write(self):
+        storage = get_event_storage()
+        for k, v in storage.latest_with_smoothing_hint(self._window_size).items():
+            self._writer.add_scalar(k, v, storage.iter)
+
+        # storage.put_{image,histogram} is only meant to be used by
+        # tensorboard writer. So we access its internal fields directly from here.
+        if len(storage._vis_data) >= 1:
+            for img_name, img, step_num in storage._vis_data:
+                self._writer.add_image(img_name, img, step_num)
+            # Storage stores all image data and rely on this writer to clear them.
+            # As a result it assumes only one writer will use its image data.
+            # An alternative design is to let storage store limited recent
+            # data (e.g. only the most recent image) that all writers can access.
+            # In that case a writer may not see all image data if its period is long.
+            storage.clear_images()
+
+        if len(storage._histograms) >= 1:
+            for params in storage._histograms:
+                self._writer.add_histogram_raw(**params)
+            storage.clear_histograms()
+
+    def close(self):
+        if hasattr(self, "_writer"):  # doesn't exist when the code fails at import
+            self._writer.close()
+
+
+class CommonMetricPrinter(EventWriter):
+    """
+    Print **common** metrics to the terminal, including
+    iteration time, ETA, memory, all losses, and the learning rate.
+
+    To print something different, please implement a similar printer by yourself.
+    """
+
+    def __init__(self, max_iter):
+        """
+        Args:
+            max_iter (int): the maximum number of iterations to train.
+                Used to compute ETA.
+        """
+        self.logger = logging.getLogger(__name__)
+        self._max_iter = max_iter
+        self._last_write = None
+
+    def write(self):
+        storage = get_event_storage()
+        iteration = storage.iter
+
+        try:
+            data_time = storage.history("data_time").avg(20)
+        except KeyError:
+            # they may not exist in the first few iterations (due to warmup)
+            # or when SimpleTrainer is not used
+            data_time = None
+
+        eta_string = None
+        try:
+            iter_time = storage.history("time").global_avg()
+            eta_seconds = storage.history("time").median(1000) * (self._max_iter - iteration)
+            storage.put_scalar("eta_seconds", eta_seconds, smoothing_hint=False)
+            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+        except KeyError:
+            iter_time = None
+            # estimate eta on our own - more noisy
+            if self._last_write is not None:
+                estimate_iter_time = (time.perf_counter() - self._last_write[1]) / (
+                    iteration - self._last_write[0]
+                )
+                eta_seconds = estimate_iter_time * (self._max_iter - iteration)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+            self._last_write = (iteration, time.perf_counter())
+
+        try:
+            lr = "{:.6f}".format(storage.history("lr").latest())
+        except KeyError:
+            lr = "N/A"
+
+        if torch.cuda.is_available():
+            max_mem_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
+        else:
+            max_mem_mb = None
+
+        # NOTE: max_mem is parsed by grep in "dev/parse_results.sh"
+        self.logger.info(
+            " {eta}iter: {iter}  {losses}  {time}{data_time}lr: {lr}  {memory}".format(
+                eta=f"eta: {eta_string}  " if eta_string else "",
+                iter=iteration,
+                losses="  ".join(
+                    [
+                        "{}: {:.3f}".format(k, v.median(20))
+                        for k, v in storage.histories().items()
+                        if "loss" in k
+                    ]
+                ),
+                time="time: {:.4f}  ".format(iter_time) if iter_time is not None else "",
+                data_time="data_time: {:.4f}  ".format(data_time) if data_time is not None else "",
+                lr=lr,
+                memory="max_mem: {:.0f}M".format(max_mem_mb) if max_mem_mb is not None else "",
+            )
+        )
+
+
+class EventStorage:
+    """
+    The user-facing class that provides metric storage functionalities.
+
+    In the future we may add support for storing / logging other types of data if needed.
+    """
+
+    def __init__(self, start_iter=0):
+        """
+        Args:
+            start_iter (int): the iteration number to start with
+        """
+        self._history = defaultdict(HistoryBuffer)
+        self._smoothing_hints = {}
+        self._latest_scalars = {}
+        self._iter = start_iter
+        self._current_prefix = ""
+        self._vis_data = []
+        self._histograms = []
+
+    def put_image(self, img_name, img_tensor):
+        """
+        Add an `img_tensor` associated with `img_name`, to be shown on
+        tensorboard.
+
+        Args:
+            img_name (str): The name of the image to put into tensorboard.
+            img_tensor (torch.Tensor or numpy.array): An `uint8` or `float`
+                Tensor of shape `[channel, height, width]` where `channel` is
+                3. The image format should be RGB. The elements in img_tensor
+                can either have values in [0, 1] (float32) or [0, 255] (uint8).
+                The `img_tensor` will be visualized in tensorboard.
+        """
+        self._vis_data.append((img_name, img_tensor, self._iter))
+
+    def put_scalar(self, name, value, smoothing_hint=True):
+        """
+        Add a scalar `value` to the `HistoryBuffer` associated with `name`.
+
+        Args:
+            smoothing_hint (bool): a 'hint' on whether this scalar is noisy and should be
+                smoothed when logged. The hint will be accessible through
+                :meth:`EventStorage.smoothing_hints`.  A writer may ignore the hint
+                and apply custom smoothing rule.
+
+                It defaults to True because most scalars we save need to be smoothed to
+                provide any useful signal.
+        """
+        name = self._current_prefix + name
+        history = self._history[name]
+        value = float(value)
+        history.update(value, self._iter)
+        self._latest_scalars[name] = value
+
+        existing_hint = self._smoothing_hints.get(name)
+        if existing_hint is not None:
+            assert (
+                existing_hint == smoothing_hint
+            ), "Scalar {} was put with a different smoothing_hint!".format(name)
+        else:
+            self._smoothing_hints[name] = smoothing_hint
+
+    def put_scalars(self, *, smoothing_hint=True, **kwargs):
+        """
+        Put multiple scalars from keyword arguments.
+
+        Examples:
+
+            storage.put_scalars(loss=my_loss, accuracy=my_accuracy, smoothing_hint=True)
+        """
+        for k, v in kwargs.items():
+            self.put_scalar(k, v, smoothing_hint=smoothing_hint)
+
+    def put_histogram(self, hist_name, hist_tensor, bins=1000):
+        """
+        Create a histogram from a tensor.
+
+        Args:
+            hist_name (str): The name of the histogram to put into tensorboard.
+            hist_tensor (torch.Tensor): A Tensor of arbitrary shape to be converted
+                into a histogram.
+            bins (int): Number of histogram bins.
+        """
+        ht_min, ht_max = hist_tensor.min().item(), hist_tensor.max().item()
+
+        # Create a histogram with PyTorch
+        hist_counts = torch.histc(hist_tensor, bins=bins)
+        hist_edges = torch.linspace(start=ht_min, end=ht_max, steps=bins + 1, dtype=torch.float32)
+
+        # Parameter for the add_histogram_raw function of SummaryWriter
+        hist_params = dict(
+            tag=hist_name,
+            min=ht_min,
+            max=ht_max,
+            num=len(hist_tensor),
+            sum=float(hist_tensor.sum()),
+            sum_squares=float(torch.sum(hist_tensor ** 2)),
+            bucket_limits=hist_edges[1:].tolist(),
+            bucket_counts=hist_counts.tolist(),
+            global_step=self._iter,
+        )
+        self._histograms.append(hist_params)
+
+    def history(self, name):
+        """
+        Returns:
+            HistoryBuffer: the scalar history for name
+        """
+        ret = self._history.get(name, None)
+        if ret is None:
+            raise KeyError("No history metric available for {}!".format(name))
+        return ret
+
+    def histories(self):
+        """
+        Returns:
+            dict[name -> HistoryBuffer]: the HistoryBuffer for all scalars
+        """
+        return self._history
+
+    def latest(self):
+        """
+        Returns:
+            dict[name -> number]: the scalars that's added in the current iteration.
+        """
+        return self._latest_scalars
+
+    def latest_with_smoothing_hint(self, window_size=20):
+        """
+        Similar to :meth:`latest`, but the returned values
+        are either the un-smoothed original latest value,
+        or a median of the given window_size,
+        depend on whether the smoothing_hint is True.
+
+        This provides a default behavior that other writers can use.
+        """
+        result = {}
+        for k, v in self._latest_scalars.items():
+            result[k] = self._history[k].median(window_size) if self._smoothing_hints[k] else v
+        return result
+
+    def smoothing_hints(self):
+        """
+        Returns:
+            dict[name -> bool]: the user-provided hint on whether the scalar
+                is noisy and needs smoothing.
+        """
+        return self._smoothing_hints
+
+    def step(self):
+        """
+        User should call this function at the beginning of each iteration, to
+        notify the storage of the start of a new iteration.
+        The storage will then be able to associate the new data with the
+        correct iteration number.
+        """
+        self._iter += 1
+        self._latest_scalars = {}
+
+    @property
+    def iter(self):
+        return self._iter
+
+    @property
+    def iteration(self):
+        # for backward compatibility
+        return self._iter
+
+    def __enter__(self):
+        _CURRENT_STORAGE_STACK.append(self)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        assert _CURRENT_STORAGE_STACK[-1] == self
+        _CURRENT_STORAGE_STACK.pop()
+
+    @contextmanager
+    def name_scope(self, name):
+        """
+        Yields:
+            A context within which all the events added to this storage
+            will be prefixed by the name scope.
+        """
+        old_prefix = self._current_prefix
+        self._current_prefix = name.rstrip("/") + "/"
+        yield
+        self._current_prefix = old_prefix
+
+    def clear_images(self):
+        """
+        Delete all the stored images for visualization. This should be called
+        after images are written to tensorboard.
+        """
+        self._vis_data = []
+
+    def clear_histograms(self):
+        """
+        Delete all the stored histograms for visualization.
+        This should be called after histograms are written to tensorboard.
+        """
+        self._histograms = []
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/logger.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6496d9d6096f557ffa684be80342ec220c6014c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/logger.py
@@ -0,0 +1,221 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import functools
+import logging
+import os
+import sys
+import time
+from collections import Counter
+from fvcore.common.file_io import PathManager
+from tabulate import tabulate
+from termcolor import colored
+
+
+class _ColorfulFormatter(logging.Formatter):
+    def __init__(self, *args, **kwargs):
+        self._root_name = kwargs.pop("root_name") + "."
+        self._abbrev_name = kwargs.pop("abbrev_name", "")
+        if len(self._abbrev_name):
+            self._abbrev_name = self._abbrev_name + "."
+        super(_ColorfulFormatter, self).__init__(*args, **kwargs)
+
+    def formatMessage(self, record):
+        record.name = record.name.replace(self._root_name, self._abbrev_name)
+        log = super(_ColorfulFormatter, self).formatMessage(record)
+        if record.levelno == logging.WARNING:
+            prefix = colored("WARNING", "red", attrs=["blink"])
+        elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL:
+            prefix = colored("ERROR", "red", attrs=["blink", "underline"])
+        else:
+            return log
+        return prefix + " " + log
+
+
+@functools.lru_cache()  # so that calling setup_logger multiple times won't add many handlers
+def setup_logger(
+    output=None, distributed_rank=0, *, color=True, name="detectron2", abbrev_name=None
+):
+    """
+    Initialize the detectron2 logger and set its verbosity level to "DEBUG".
+
+    Args:
+        output (str): a file name or a directory to save log. If None, will not save log file.
+            If ends with ".txt" or ".log", assumed to be a file name.
+            Otherwise, logs will be saved to `output/log.txt`.
+        name (str): the root module name of this logger
+        abbrev_name (str): an abbreviation of the module, to avoid long names in logs.
+            Set to "" to not log the root module in logs.
+            By default, will abbreviate "detectron2" to "d2" and leave other
+            modules unchanged.
+
+    Returns:
+        logging.Logger: a logger
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    logger.propagate = False
+
+    if abbrev_name is None:
+        abbrev_name = "d2" if name == "detectron2" else name
+
+    plain_formatter = logging.Formatter(
+        "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S"
+    )
+    # stdout logging: master only
+    if distributed_rank == 0:
+        ch = logging.StreamHandler(stream=sys.stdout)
+        ch.setLevel(logging.DEBUG)
+        if color:
+            formatter = _ColorfulFormatter(
+                colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s",
+                datefmt="%m/%d %H:%M:%S",
+                root_name=name,
+                abbrev_name=str(abbrev_name),
+            )
+        else:
+            formatter = plain_formatter
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+
+    # file logging: all workers
+    if output is not None:
+        if output.endswith(".txt") or output.endswith(".log"):
+            filename = output
+        else:
+            filename = os.path.join(output, "log.txt")
+        if distributed_rank > 0:
+            filename = filename + ".rank{}".format(distributed_rank)
+        PathManager.mkdirs(os.path.dirname(filename))
+
+        fh = logging.StreamHandler(_cached_log_stream(filename))
+        fh.setLevel(logging.DEBUG)
+        fh.setFormatter(plain_formatter)
+        logger.addHandler(fh)
+
+    return logger
+
+
+# cache the opened file object, so that different calls to `setup_logger`
+# with the same file name can safely write to the same file.
+@functools.lru_cache(maxsize=None)
+def _cached_log_stream(filename):
+    return PathManager.open(filename, "a")
+
+
+"""
+Below are some other convenient logging methods.
+They are mainly adopted from
+https://github.com/abseil/abseil-py/blob/master/absl/logging/__init__.py
+"""
+
+
+def _find_caller():
+    """
+    Returns:
+        str: module name of the caller
+        tuple: a hashable key to be used to identify different callers
+    """
+    frame = sys._getframe(2)
+    while frame:
+        code = frame.f_code
+        if os.path.join("utils", "logger.") not in code.co_filename:
+            mod_name = frame.f_globals["__name__"]
+            if mod_name == "__main__":
+                mod_name = "detectron2"
+            return mod_name, (code.co_filename, frame.f_lineno, code.co_name)
+        frame = frame.f_back
+
+
+_LOG_COUNTER = Counter()
+_LOG_TIMER = {}
+
+
+def log_first_n(lvl, msg, n=1, *, name=None, key="caller"):
+    """
+    Log only for the first n times.
+
+    Args:
+        lvl (int): the logging level
+        msg (str):
+        n (int):
+        name (str): name of the logger to use. Will use the caller's module by default.
+        key (str or tuple[str]): the string(s) can be one of "caller" or
+            "message", which defines how to identify duplicated logs.
+            For example, if called with `n=1, key="caller"`, this function
+            will only log the first call from the same caller, regardless of
+            the message content.
+            If called with `n=1, key="message"`, this function will log the
+            same content only once, even if they are called from different places.
+            If called with `n=1, key=("caller", "message")`, this function
+            will not log only if the same caller has logged the same message before.
+    """
+    if isinstance(key, str):
+        key = (key,)
+    assert len(key) > 0
+
+    caller_module, caller_key = _find_caller()
+    hash_key = ()
+    if "caller" in key:
+        hash_key = hash_key + caller_key
+    if "message" in key:
+        hash_key = hash_key + (msg,)
+
+    _LOG_COUNTER[hash_key] += 1
+    if _LOG_COUNTER[hash_key] <= n:
+        logging.getLogger(name or caller_module).log(lvl, msg)
+
+
+def log_every_n(lvl, msg, n=1, *, name=None):
+    """
+    Log once per n times.
+
+    Args:
+        lvl (int): the logging level
+        msg (str):
+        n (int):
+        name (str): name of the logger to use. Will use the caller's module by default.
+    """
+    caller_module, key = _find_caller()
+    _LOG_COUNTER[key] += 1
+    if n == 1 or _LOG_COUNTER[key] % n == 1:
+        logging.getLogger(name or caller_module).log(lvl, msg)
+
+
+def log_every_n_seconds(lvl, msg, n=1, *, name=None):
+    """
+    Log no more than once per n seconds.
+
+    Args:
+        lvl (int): the logging level
+        msg (str):
+        n (int):
+        name (str): name of the logger to use. Will use the caller's module by default.
+    """
+    caller_module, key = _find_caller()
+    last_logged = _LOG_TIMER.get(key, None)
+    current_time = time.time()
+    if last_logged is None or current_time - last_logged >= n:
+        logging.getLogger(name or caller_module).log(lvl, msg)
+        _LOG_TIMER[key] = current_time
+
+
+def create_small_table(small_dict):
+    """
+    Create a small table using the keys of small_dict as headers. This is only
+    suitable for small dictionaries.
+
+    Args:
+        small_dict (dict): a result dictionary of only a few items.
+
+    Returns:
+        str: the table as a string.
+    """
+    keys, values = tuple(zip(*small_dict.items()))
+    table = tabulate(
+        [values],
+        headers=keys,
+        tablefmt="pipe",
+        floatfmt=".3f",
+        stralign="center",
+        numalign="center",
+    )
+    return table
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/memory.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..d495a1681f460668c96f64454e31e7f2fca8137a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/memory.py
@@ -0,0 +1,86 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import logging
+from contextlib import contextmanager
+from functools import wraps
+import torch
+
+__all__ = ["retry_if_cuda_oom"]
+
+
+@contextmanager
+def _ignore_torch_cuda_oom():
+    """
+    A context which ignores CUDA OOM exception from pytorch.
+    """
+    try:
+        yield
+    except RuntimeError as e:
+        # NOTE: the string may change?
+        if "CUDA out of memory. " in str(e):
+            pass
+        else:
+            raise
+
+
+def retry_if_cuda_oom(func):
+    """
+    Makes a function retry itself after encountering
+    pytorch's CUDA OOM error.
+    It will first retry after calling `torch.cuda.empty_cache()`.
+
+    If that still fails, it will then retry by trying to convert inputs to CPUs.
+    In this case, it expects the function to dispatch to CPU implementation.
+    The return values may become CPU tensors as well and it's user's
+    responsibility to convert it back to CUDA tensor if needed.
+
+    Args:
+        func: a stateless callable that takes tensor-like objects as arguments
+
+    Returns:
+        a callable which retries `func` if OOM is encountered.
+
+    Examples:
+
+    .. code-block:: python
+
+        output = retry_if_cuda_oom(some_torch_function)(input1, input2)
+        # output may be on CPU even if inputs are on GPU
+
+    Note:
+        1. When converting inputs to CPU, it will only look at each argument and check
+           if it has `.device` and `.to` for conversion. Nested structures of tensors
+           are not supported.
+
+        2. Since the function might be called more than once, it has to be
+           stateless.
+    """
+
+    def maybe_to_cpu(x):
+        try:
+            like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to")
+        except AttributeError:
+            like_gpu_tensor = False
+        if like_gpu_tensor:
+            return x.to(device="cpu")
+        else:
+            return x
+
+    @wraps(func)
+    def wrapped(*args, **kwargs):
+        with _ignore_torch_cuda_oom():
+            return func(*args, **kwargs)
+
+        # Clear cache and retry
+        torch.cuda.empty_cache()
+        with _ignore_torch_cuda_oom():
+            return func(*args, **kwargs)
+
+        # Try on CPU. This slows down the code significantly, therefore print a notice.
+        logger = logging.getLogger(__name__)
+        logger.info("Attempting to copy inputs of {} to CPU due to CUDA OOM".format(str(func)))
+        new_args = (maybe_to_cpu(x) for x in args)
+        new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()}
+        return func(*new_args, **new_kwargs)
+
+    return wrapped
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/registry.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..fea1de961f0dbdacc934e11b9af5647b2a008051
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/registry.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Keep this module for backward compatibility.
+from fvcore.common.registry import Registry  # noqa
+
+__all__ = ["Registry"]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/serialize.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/serialize.py
new file mode 100644
index 0000000000000000000000000000000000000000..734a62c2c4ecfd520eb9e8b941857b6f7e17d4c8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/serialize.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import cloudpickle
+
+
+class PicklableWrapper(object):
+    """
+    Wrap an object to make it more picklable, note that it uses
+    heavy weight serialization libraries that are slower than pickle.
+    It's best to use it only on closures (which are usually not picklable).
+
+    This is a simplified version of
+    https://github.com/joblib/joblib/blob/master/joblib/externals/loky/cloudpickle_wrapper.py
+    """
+
+    def __init__(self, obj):
+        self._obj = obj
+
+    def __reduce__(self):
+        s = cloudpickle.dumps(self._obj)
+        return cloudpickle.loads, (s,)
+
+    def __call__(self, *args, **kwargs):
+        return self._obj(*args, **kwargs)
+
+    def __getattr__(self, attr):
+        # Ensure that the wrapped object can be used seamlessly as the previous object.
+        if attr not in ["_obj"]:
+            return getattr(self._obj, attr)
+        return getattr(self, attr)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/video_visualizer.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/video_visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0144b679d09bbb8049c30eb849099422355b492c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/video_visualizer.py
@@ -0,0 +1,235 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import pycocotools.mask as mask_util
+
+from detectron2.utils.visualizer import (
+    ColorMode,
+    Visualizer,
+    _create_text_labels,
+    _PanopticPrediction,
+)
+
+from .colormap import random_color
+
+
+class _DetectedInstance:
+    """
+    Used to store data about detected objects in video frame,
+    in order to transfer color to objects in the future frames.
+
+    Attributes:
+        label (int):
+        bbox (tuple[float]):
+        mask_rle (dict):
+        color (tuple[float]): RGB colors in range (0, 1)
+        ttl (int): time-to-live for the instance. For example, if ttl=2,
+            the instance color can be transferred to objects in the next two frames.
+    """
+
+    __slots__ = ["label", "bbox", "mask_rle", "color", "ttl"]
+
+    def __init__(self, label, bbox, mask_rle, color, ttl):
+        self.label = label
+        self.bbox = bbox
+        self.mask_rle = mask_rle
+        self.color = color
+        self.ttl = ttl
+
+
+class VideoVisualizer:
+    def __init__(self, metadata, instance_mode=ColorMode.IMAGE):
+        """
+        Args:
+            metadata (MetadataCatalog): image metadata.
+        """
+        self.metadata = metadata
+        self._old_instances = []
+        assert instance_mode in [
+            ColorMode.IMAGE,
+            ColorMode.IMAGE_BW,
+        ], "Other mode not supported yet."
+        self._instance_mode = instance_mode
+
+    def draw_instance_predictions(self, frame, predictions):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            frame (ndarray): an RGB image of shape (H, W, C), in the range [0, 255].
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        frame_visualizer = Visualizer(frame, self.metadata)
+        num_instances = len(predictions)
+        if num_instances == 0:
+            return frame_visualizer.output
+
+        boxes = predictions.pred_boxes.tensor.numpy() if predictions.has("pred_boxes") else None
+        scores = predictions.scores if predictions.has("scores") else None
+        classes = predictions.pred_classes.numpy() if predictions.has("pred_classes") else None
+        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
+
+        if predictions.has("pred_masks"):
+            masks = predictions.pred_masks
+            # mask IOU is not yet enabled
+            # masks_rles = mask_util.encode(np.asarray(masks.permute(1, 2, 0), order="F"))
+            # assert len(masks_rles) == num_instances
+        else:
+            masks = None
+
+        detected = [
+            _DetectedInstance(classes[i], boxes[i], mask_rle=None, color=None, ttl=8)
+            for i in range(num_instances)
+        ]
+        colors = self._assign_colors(detected)
+
+        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            # any() returns uint8 tensor
+            frame_visualizer.output.img = frame_visualizer._create_grayscale_image(
+                (masks.any(dim=0) > 0).numpy() if masks is not None else None
+            )
+            alpha = 0.3
+        else:
+            alpha = 0.5
+
+        frame_visualizer.overlay_instances(
+            boxes=None if masks is not None else boxes,  # boxes are a bit distracting
+            masks=masks,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+
+        return frame_visualizer.output
+
+    def draw_sem_seg(self, frame, sem_seg, area_threshold=None):
+        """
+        Args:
+            sem_seg (ndarray or Tensor): semantic segmentation of shape (H, W),
+                each value is the integer label.
+            area_threshold (Optional[int]): only draw segmentations larger than the threshold
+        """
+        # don't need to do anything special
+        frame_visualizer = Visualizer(frame, self.metadata)
+        frame_visualizer.draw_sem_seg(sem_seg, area_threshold=None)
+        return frame_visualizer.output
+
+    def draw_panoptic_seg_predictions(
+        self, frame, panoptic_seg, segments_info, area_threshold=None, alpha=0.5
+    ):
+        frame_visualizer = Visualizer(frame, self.metadata)
+        pred = _PanopticPrediction(panoptic_seg, segments_info)
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            frame_visualizer.output.img = frame_visualizer._create_grayscale_image(
+                pred.non_empty_mask()
+            )
+
+        # draw mask for all semantic segments first i.e. "stuff"
+        for mask, sinfo in pred.semantic_masks():
+            category_idx = sinfo["category_id"]
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
+            except AttributeError:
+                mask_color = None
+
+            frame_visualizer.draw_binary_mask(
+                mask,
+                color=mask_color,
+                text=self.metadata.stuff_classes[category_idx],
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+
+        all_instances = list(pred.instance_masks())
+        if len(all_instances) == 0:
+            return frame_visualizer.output
+        # draw mask for all instances second
+        masks, sinfo = list(zip(*all_instances))
+        num_instances = len(masks)
+        masks_rles = mask_util.encode(
+            np.asarray(np.asarray(masks).transpose(1, 2, 0), dtype=np.uint8, order="F")
+        )
+        assert len(masks_rles) == num_instances
+
+        category_ids = [x["category_id"] for x in sinfo]
+        detected = [
+            _DetectedInstance(category_ids[i], bbox=None, mask_rle=masks_rles[i], color=None, ttl=8)
+            for i in range(num_instances)
+        ]
+        colors = self._assign_colors(detected)
+        labels = [self.metadata.thing_classes[k] for k in category_ids]
+
+        frame_visualizer.overlay_instances(
+            boxes=None,
+            masks=masks,
+            labels=labels,
+            keypoints=None,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return frame_visualizer.output
+
+    def _assign_colors(self, instances):
+        """
+        Naive tracking heuristics to assign same color to the same instance,
+        will update the internal state of tracked instances.
+
+        Returns:
+            list[tuple[float]]: list of colors.
+        """
+
+        # Compute iou with either boxes or masks:
+        is_crowd = np.zeros((len(instances),), dtype=np.bool)
+        if instances[0].bbox is None:
+            assert instances[0].mask_rle is not None
+            # use mask iou only when box iou is None
+            # because box seems good enough
+            rles_old = [x.mask_rle for x in self._old_instances]
+            rles_new = [x.mask_rle for x in instances]
+            ious = mask_util.iou(rles_old, rles_new, is_crowd)
+            threshold = 0.5
+        else:
+            boxes_old = [x.bbox for x in self._old_instances]
+            boxes_new = [x.bbox for x in instances]
+            ious = mask_util.iou(boxes_old, boxes_new, is_crowd)
+            threshold = 0.6
+        if len(ious) == 0:
+            ious = np.zeros((len(self._old_instances), len(instances)), dtype="float32")
+
+        # Only allow matching instances of the same label:
+        for old_idx, old in enumerate(self._old_instances):
+            for new_idx, new in enumerate(instances):
+                if old.label != new.label:
+                    ious[old_idx, new_idx] = 0
+
+        matched_new_per_old = np.asarray(ious).argmax(axis=1)
+        max_iou_per_old = np.asarray(ious).max(axis=1)
+
+        # Try to find match for each old instance:
+        extra_instances = []
+        for idx, inst in enumerate(self._old_instances):
+            if max_iou_per_old[idx] > threshold:
+                newidx = matched_new_per_old[idx]
+                if instances[newidx].color is None:
+                    instances[newidx].color = inst.color
+                    continue
+            # If an old instance does not match any new instances,
+            # keep it for the next frame in case it is just missed by the detector
+            inst.ttl -= 1
+            if inst.ttl > 0:
+                extra_instances.append(inst)
+
+        # Assign random color to newly-detected instances:
+        for inst in instances:
+            if inst.color is None:
+                inst.color = random_color(rgb=True, maximum=1)
+        self._old_instances = instances[:] + extra_instances
+        return [d.color for d in instances]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/visualizer.py b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ffcbdbd19518bce877a776582a7caeddc18108e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/detectron2/utils/visualizer.py
@@ -0,0 +1,1143 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import colorsys
+import logging
+import math
+import numpy as np
+from enum import Enum, unique
+import cv2
+import matplotlib as mpl
+import matplotlib.colors as mplc
+import matplotlib.figure as mplfigure
+import pycocotools.mask as mask_util
+import torch
+from fvcore.common.file_io import PathManager
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from PIL import Image
+
+from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
+
+from .colormap import random_color
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["ColorMode", "VisImage", "Visualizer"]
+
+
+_SMALL_OBJECT_AREA_THRESH = 1000
+_LARGE_MASK_AREA_THRESH = 120000
+_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
+_BLACK = (0, 0, 0)
+_RED = (1.0, 0, 0)
+
+_KEYPOINT_THRESHOLD = 0.05
+
+
+@unique
+class ColorMode(Enum):
+    """
+    Enum of different color modes to use for instance visualizations.
+    """
+
+    IMAGE = 0
+    """
+    Picks a random color for every instance and overlay segmentations with low opacity.
+    """
+    SEGMENTATION = 1
+    """
+    Let instances of the same category have similar colors
+    (from metadata.thing_colors), and overlay them with
+    high opacity. This provides more attention on the quality of segmentation.
+    """
+    IMAGE_BW = 2
+    """
+    Same as IMAGE, but convert all areas without masks to gray-scale.
+    Only available for drawing per-instance mask predictions.
+    """
+
+
+class GenericMask:
+    """
+    Attribute:
+        polygons (list[ndarray]): list[ndarray]: polygons for this mask.
+            Each ndarray has format [x, y, x, y, ...]
+        mask (ndarray): a binary mask
+    """
+
+    def __init__(self, mask_or_polygons, height, width):
+        self._mask = self._polygons = self._has_holes = None
+        self.height = height
+        self.width = width
+
+        m = mask_or_polygons
+        if isinstance(m, dict):
+            # RLEs
+            assert "counts" in m and "size" in m
+            if isinstance(m["counts"], list):  # uncompressed RLEs
+                h, w = m["size"]
+                assert h == height and w == width
+                m = mask_util.frPyObjects(m, h, w)
+            self._mask = mask_util.decode(m)[:, :]
+            return
+
+        if isinstance(m, list):  # list[ndarray]
+            self._polygons = [np.asarray(x).reshape(-1) for x in m]
+            return
+
+        if isinstance(m, np.ndarray):  # assumed to be a binary mask
+            assert m.shape[1] != 2, m.shape
+            assert m.shape == (height, width), m.shape
+            self._mask = m.astype("uint8")
+            return
+
+        raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
+
+    @property
+    def mask(self):
+        if self._mask is None:
+            self._mask = self.polygons_to_mask(self._polygons)
+        return self._mask
+
+    @property
+    def polygons(self):
+        if self._polygons is None:
+            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+        return self._polygons
+
+    @property
+    def has_holes(self):
+        if self._has_holes is None:
+            if self._mask is not None:
+                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+            else:
+                self._has_holes = False  # if original format is polygon, does not have holes
+        return self._has_holes
+
+    def mask_to_polygons(self, mask):
+        # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
+        # hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
+        # Internal contours (holes) are placed in hierarchy-2.
+        # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
+        mask = np.ascontiguousarray(mask)  # some versions of cv2 does not support incontiguous arr
+        res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
+        hierarchy = res[-1]
+        if hierarchy is None:  # empty mask
+            return [], False
+        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
+        res = res[-2]
+        res = [x.flatten() for x in res]
+        res = [x for x in res if len(x) >= 6]
+        return res, has_holes
+
+    def polygons_to_mask(self, polygons):
+        rle = mask_util.frPyObjects(polygons, self.height, self.width)
+        rle = mask_util.merge(rle)
+        return mask_util.decode(rle)[:, :]
+
+    def area(self):
+        return self.mask.sum()
+
+    def bbox(self):
+        p = mask_util.frPyObjects(self.polygons, self.height, self.width)
+        p = mask_util.merge(p)
+        bbox = mask_util.toBbox(p)
+        bbox[2] += bbox[0]
+        bbox[3] += bbox[1]
+        return bbox
+
+
+class _PanopticPrediction:
+    def __init__(self, panoptic_seg, segments_info):
+        self._seg = panoptic_seg
+
+        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
+        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
+        areas = areas.numpy()
+        sorted_idxs = np.argsort(-areas)
+        self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
+        self._seg_ids = self._seg_ids.tolist()
+        for sid, area in zip(self._seg_ids, self._seg_areas):
+            if sid in self._sinfo:
+                self._sinfo[sid]["area"] = float(area)
+
+    def non_empty_mask(self):
+        """
+        Returns:
+            (H, W) array, a mask for all pixels that have a prediction
+        """
+        empty_ids = []
+        for id in self._seg_ids:
+            if id not in self._sinfo:
+                empty_ids.append(id)
+        if len(empty_ids) == 0:
+            return np.zeros(self._seg.shape, dtype=np.uint8)
+        assert (
+            len(empty_ids) == 1
+        ), ">1 ids corresponds to no labels. This is currently not supported"
+        return (self._seg != empty_ids[0]).numpy().astype(np.bool)
+
+    def semantic_masks(self):
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or sinfo["isthing"]:
+                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
+                continue
+            yield (self._seg == sid).numpy().astype(np.bool), sinfo
+
+    def instance_masks(self):
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or not sinfo["isthing"]:
+                continue
+            mask = (self._seg == sid).numpy().astype(np.bool)
+            if mask.sum() > 0:
+                yield mask, sinfo
+
+
+def _create_text_labels(classes, scores, class_names):
+    """
+    Args:
+        classes (list[int] or None):
+        scores (list[float] or None):
+        class_names (list[str] or None):
+
+    Returns:
+        list[str] or None
+    """
+    labels = None
+    if classes is not None and class_names is not None and len(class_names) > 1:
+        labels = [class_names[i] for i in classes]
+    if scores is not None:
+        if labels is None:
+            labels = ["{:.0f}%".format(s * 100) for s in scores]
+        else:
+            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
+    return labels
+
+
+class VisImage:
+    def __init__(self, img, scale=1.0):
+        """
+        Args:
+            img (ndarray): an RGB image of shape (H, W, 3).
+            scale (float): scale the input image
+        """
+        self.img = img
+        self.scale = scale
+        self.width, self.height = img.shape[1], img.shape[0]
+        self._setup_figure(img)
+
+    def _setup_figure(self, img):
+        """
+        Args:
+            Same as in :meth:`__init__()`.
+
+        Returns:
+            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
+            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
+        """
+        fig = mplfigure.Figure(frameon=False)
+        self.dpi = fig.get_dpi()
+        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
+        # (https://github.com/matplotlib/matplotlib/issues/15363)
+        fig.set_size_inches(
+            (self.width * self.scale + 1e-2) / self.dpi,
+            (self.height * self.scale + 1e-2) / self.dpi,
+        )
+        self.canvas = FigureCanvasAgg(fig)
+        # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
+        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
+        ax.axis("off")
+        ax.set_xlim(0.0, self.width)
+        ax.set_ylim(self.height)
+
+        self.fig = fig
+        self.ax = ax
+
+    def save(self, filepath):
+        """
+        Args:
+            filepath (str): a string that contains the absolute path, including the file name, where
+                the visualized image will be saved.
+        """
+        if filepath.lower().endswith(".jpg") or filepath.lower().endswith(".png"):
+            # faster than matplotlib's imshow
+            cv2.imwrite(filepath, self.get_image()[:, :, ::-1])
+        else:
+            # support general formats (e.g. pdf)
+            self.ax.imshow(self.img, interpolation="nearest")
+            self.fig.savefig(filepath)
+
+    def get_image(self):
+        """
+        Returns:
+            ndarray:
+                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
+                The shape is scaled w.r.t the input image using the given `scale` argument.
+        """
+        canvas = self.canvas
+        s, (width, height) = canvas.print_to_buffer()
+        if (self.width, self.height) != (width, height):
+            img = cv2.resize(self.img, (width, height))
+        else:
+            img = self.img
+
+        # buf = io.BytesIO()  # works for cairo backend
+        # canvas.print_rgba(buf)
+        # width, height = self.width, self.height
+        # s = buf.getvalue()
+
+        buffer = np.frombuffer(s, dtype="uint8")
+
+        # imshow is slow. blend manually (still quite slow)
+        img_rgba = buffer.reshape(height, width, 4)
+        rgb, alpha = np.split(img_rgba, [3], axis=2)
+
+        try:
+            import numexpr as ne  # fuse them with numexpr
+
+            visualized_image = ne.evaluate("demo * (1 - alpha / 255.0) + rgb * (alpha / 255.0)")
+        except ImportError:
+            alpha = alpha.astype("float32") / 255.0
+            visualized_image = img * (1 - alpha) + rgb * alpha
+
+        visualized_image = visualized_image.astype("uint8")
+
+        return visualized_image
+
+
+class Visualizer:
+    def __init__(self, img_rgb, metadata, scale=1.0, instance_mode=ColorMode.IMAGE):
+        """
+        Args:
+            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
+                the height and width of the image respectively. C is the number of
+                color channels. The image is required to be in RGB format since that
+                is a requirement of the Matplotlib library. The image is also expected
+                to be in the range [0, 255].
+            metadata (MetadataCatalog): image metadata.
+        """
+        self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
+        self.metadata = metadata
+        self.output = VisImage(self.img, scale=scale)
+        self.cpu_device = torch.device("cpu")
+
+        # too small texts are useless, therefore clamp to 9
+        self._default_font_size = max(
+            np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
+        )
+        self._instance_mode = instance_mode
+
+    def draw_instance_predictions(self, predictions):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
+        scores = predictions.scores if predictions.has("scores") else None
+        classes = predictions.pred_classes if predictions.has("pred_classes") else None
+        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
+        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
+
+        if predictions.has("pred_masks"):
+            masks = np.asarray(predictions.pred_masks)
+            masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
+        else:
+            masks = None
+
+        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
+            ]
+            alpha = 0.8
+        else:
+            colors = None
+            alpha = 0.5
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.img = self._create_grayscale_image(
+                (predictions.pred_masks.any(dim=0) > 0).numpy()
+            )
+            alpha = 0.3
+
+        self.overlay_instances(
+            masks=masks,
+            boxes=boxes,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return self.output
+
+    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
+        """
+        Draw semantic segmentation predictions/labels.
+
+        Args:
+            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
+                Each value is the integer label of the pixel.
+            area_threshold (int): segments with less than `area_threshold` are not drawn.
+            alpha (float): the larger it is, the more opaque the segmentations are.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        if isinstance(sem_seg, torch.Tensor):
+            sem_seg = sem_seg.numpy()
+        labels, areas = np.unique(sem_seg, return_counts=True)
+        sorted_idxs = np.argsort(-areas).tolist()
+        labels = labels[sorted_idxs]
+        for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
+            except (AttributeError, IndexError):
+                mask_color = None
+
+            binary_mask = (sem_seg == label).astype(np.uint8)
+            text = self.metadata.stuff_classes[label]
+            self.draw_binary_mask(
+                binary_mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+        return self.output
+
+    def draw_panoptic_seg_predictions(
+        self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7
+    ):
+        """
+        Draw panoptic prediction results on an image.
+
+        Args:
+            panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
+                segment.
+            segments_info (list[dict]): Describe each segment in `panoptic_seg`.
+                Each dict contains keys "id", "category_id", "isthing".
+            area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        pred = _PanopticPrediction(panoptic_seg, segments_info)
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.img = self._create_grayscale_image(pred.non_empty_mask())
+
+        # draw mask for all semantic segments first i.e. "stuff"
+        for mask, sinfo in pred.semantic_masks():
+            category_idx = sinfo["category_id"]
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
+            except AttributeError:
+                mask_color = None
+
+            text = self.metadata.stuff_classes[category_idx]
+            self.draw_binary_mask(
+                mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+
+        # draw mask for all instances second
+        all_instances = list(pred.instance_masks())
+        if len(all_instances) == 0:
+            return self.output
+        masks, sinfo = list(zip(*all_instances))
+        category_ids = [x["category_id"] for x in sinfo]
+
+        try:
+            scores = [x["score"] for x in sinfo]
+        except KeyError:
+            scores = None
+        labels = _create_text_labels(category_ids, scores, self.metadata.thing_classes)
+
+        try:
+            colors = [random_color(rgb=True, maximum=1) for k in category_ids]
+        except AttributeError:
+            colors = None
+        self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
+
+        return self.output
+
+    def draw_dataset_dict(self, dic):
+        """
+        Draw annotations/segmentaions in Detectron2 Dataset format.
+
+        Args:
+            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        annos = dic.get("annotations", None)
+        if annos:
+            if "segmentation" in annos[0]:
+                masks = [x["segmentation"] for x in annos]
+            else:
+                masks = None
+            if "keypoints" in annos[0]:
+                keypts = [x["keypoints"] for x in annos]
+                keypts = np.array(keypts).reshape(len(annos), -1, 3)
+            else:
+                keypts = None
+
+            boxes = [BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) for x in annos]
+
+            labels = [x["category_id"] for x in annos]
+            colors = None
+            if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+                colors = [
+                    self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in labels
+                ]
+            names = self.metadata.get("thing_classes", None)
+            if names:
+                labels = [names[i] for i in labels]
+            labels = [
+                "{}".format(i) + ("|crowd" if a.get("iscrowd", 0) else "")
+                for i, a in zip(labels, annos)
+            ]
+            self.overlay_instances(
+                labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
+            )
+
+        sem_seg = dic.get("sem_seg", None)
+        if sem_seg is None and "sem_seg_file_name" in dic:
+            with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
+                sem_seg = Image.open(f)
+                sem_seg = np.asarray(sem_seg, dtype="uint8")
+        if sem_seg is not None:
+            self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
+        return self.output
+
+    def overlay_instances(
+        self,
+        *,
+        boxes=None,
+        labels=None,
+        masks=None,
+        keypoints=None,
+        assigned_colors=None,
+        alpha=0.5
+    ):
+        """
+        Args:
+            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
+                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
+                or a :class:`RotatedBoxes`,
+                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image,
+            labels (list[str]): the text to be displayed for each instance.
+            masks (masks-like object): Supported types are:
+
+                * :class:`detectron2.structures.PolygonMasks`,
+                  :class:`detectron2.structures.BitMasks`.
+                * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
+                  The first level of the list corresponds to individual instances. The second
+                  level to all the polygon that compose the instance, and the third level
+                  to the polygon coordinates. The third level should have the format of
+                  [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
+                * list[ndarray]: each ndarray is a binary mask of shape (H, W).
+                * list[dict]: each dict is a COCO-style RLE.
+            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
+                where the N is the number of instances and K is the number of keypoints.
+                The last dimension corresponds to (x, y, visibility or score).
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = None
+        if boxes is not None:
+            boxes = self._convert_boxes(boxes)
+            num_instances = len(boxes)
+        if masks is not None:
+            masks = self._convert_masks(masks)
+            if num_instances:
+                assert len(masks) == num_instances
+            else:
+                num_instances = len(masks)
+        if keypoints is not None:
+            if num_instances:
+                assert len(keypoints) == num_instances
+            else:
+                num_instances = len(keypoints)
+            keypoints = self._convert_keypoints(keypoints)
+        if labels is not None:
+            assert len(labels) == num_instances
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+        if boxes is not None and boxes.shape[1] == 5:
+            return self.overlay_rotated_instances(
+                boxes=boxes, labels=labels, assigned_colors=assigned_colors
+            )
+
+        # Display in largest to smallest order to reduce occlusion.
+        areas = None
+        if boxes is not None:
+            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+        elif masks is not None:
+            areas = np.asarray([x.area() for x in masks])
+
+        if areas is not None:
+            sorted_idxs = np.argsort(-areas).tolist()
+            # Re-order overlapped instances in descending order.
+            boxes = boxes[sorted_idxs] if boxes is not None else None
+            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
+            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+            keypoints = keypoints[sorted_idxs] if keypoints is not None else None
+
+        for i in range(num_instances):
+            color = assigned_colors[i]
+            if boxes is not None:
+                self.draw_box(boxes[i], edge_color=color)
+
+            if masks is not None:
+                for segment in masks[i].polygons:
+                    self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
+
+            if labels is not None:
+                # first get a box
+                if boxes is not None:
+                    x0, y0, x1, y1 = boxes[i]
+                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
+                    horiz_align = "left"
+                elif masks is not None:
+                    x0, y0, x1, y1 = masks[i].bbox()
+
+                    # draw text in the center (defined by median) when box is not drawn
+                    # median is less sensitive to outliers.
+                    text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
+                    horiz_align = "center"
+                else:
+                    continue  # drawing the box confidence for keypoints isn't very useful.
+                # for small objects, draw text at the side to avoid occlusion
+                instance_area = (y1 - y0) * (x1 - x0)
+                if (
+                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
+                    or y1 - y0 < 40 * self.output.scale
+                ):
+                    if y1 >= self.output.height - 5:
+                        text_pos = (x1, y0)
+                    else:
+                        text_pos = (x0, y1)
+
+                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                font_size = (
+                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
+                    * 0.5
+                    * self._default_font_size
+                )
+                self.draw_text(
+                    labels[i],
+                    text_pos,
+                    color=lighter_color,
+                    horizontal_alignment=horiz_align,
+                    font_size=font_size,
+                )
+
+        # draw keypoints
+        if keypoints is not None:
+            for keypoints_per_instance in keypoints:
+                self.draw_and_connect_keypoints(keypoints_per_instance)
+
+        return self.output
+
+    def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
+        """
+        Args:
+            boxes (ndarray): an Nx5 numpy array of
+                (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image.
+            labels (list[str]): the text to be displayed for each instance.
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+
+        num_instances = len(boxes)
+
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+
+        # Display in largest to smallest order to reduce occlusion.
+        if boxes is not None:
+            areas = boxes[:, 2] * boxes[:, 3]
+
+        sorted_idxs = np.argsort(-areas).tolist()
+        # Re-order overlapped instances in descending order.
+        boxes = boxes[sorted_idxs]
+        labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+        colors = [assigned_colors[idx] for idx in sorted_idxs]
+
+        for i in range(num_instances):
+            self.draw_rotated_box_with_label(
+                boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
+            )
+
+        return self.output
+
+    def draw_and_connect_keypoints(self, keypoints):
+        """
+        Draws keypoints of an instance and follows the rules for keypoint connections
+        to draw lines between appropriate keypoints. This follows color heuristics for
+        line color.
+
+        Args:
+            keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
+                and the last dimension corresponds to (x, y, probability).
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        visible = {}
+        keypoint_names = self.metadata.get("keypoint_names")
+        for idx, keypoint in enumerate(keypoints):
+            # draw keypoint
+            x, y, prob = keypoint
+            if prob > _KEYPOINT_THRESHOLD:
+                self.draw_circle((x, y), color=_RED)
+                if keypoint_names:
+                    keypoint_name = keypoint_names[idx]
+                    visible[keypoint_name] = (x, y)
+
+        if self.metadata.get("keypoint_connection_rules"):
+            for kp0, kp1, color in self.metadata.keypoint_connection_rules:
+                if kp0 in visible and kp1 in visible:
+                    x0, y0 = visible[kp0]
+                    x1, y1 = visible[kp1]
+                    color = tuple(x / 255.0 for x in color)
+                    self.draw_line([x0, x1], [y0, y1], color=color)
+
+        # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
+        # Note that this strategy is specific to person keypoints.
+        # For other keypoints, it should just do nothing
+        try:
+            ls_x, ls_y = visible["left_shoulder"]
+            rs_x, rs_y = visible["right_shoulder"]
+            mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
+        except KeyError:
+            pass
+        else:
+            # draw line from nose to mid-shoulder
+            nose_x, nose_y = visible.get("nose", (None, None))
+            if nose_x is not None:
+                self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
+
+            try:
+                # draw line from mid-shoulder to mid-hip
+                lh_x, lh_y = visible["left_hip"]
+                rh_x, rh_y = visible["right_hip"]
+            except KeyError:
+                pass
+            else:
+                mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
+                self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
+        return self.output
+
+    """
+    Primitive drawing functions:
+    """
+
+    def draw_text(
+        self,
+        text,
+        position,
+        *,
+        font_size=None,
+        color="g",
+        horizontal_alignment="center",
+        rotation=0
+    ):
+        """
+        Args:
+            text (str): class label
+            position (tuple): a tuple of the x and y coordinates to place text on image.
+            font_size (int, optional): font of the text. If not provided, a font size
+                proportional to the image width is calculated and used.
+            color: color of the text. Refer to `matplotlib.colors` for full list
+                of formats that are accepted.
+            horizontal_alignment (str): see `matplotlib.text.Text`
+            rotation: rotation angle in degrees CCW
+
+        Returns:
+            output (VisImage): image object with text drawn.
+        """
+        if not font_size:
+            font_size = self._default_font_size
+
+        # since the text background is dark, we don't want the text to be dark
+        color = np.maximum(list(mplc.to_rgb(color)), 0.2)
+        color[np.argmax(color)] = max(0.8, np.max(color))
+
+        x, y = position
+        self.output.ax.text(
+            x,
+            y,
+            text,
+            size=font_size * self.output.scale,
+            family="sans-serif",
+            bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
+            verticalalignment="top",
+            horizontalalignment=horizontal_alignment,
+            color=color,
+            zorder=10,
+            rotation=rotation,
+        )
+        return self.output
+
+    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
+        """
+        Args:
+            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
+                are the coordinates of the image's top left corner. x1 and y1 are the
+                coordinates of the image's bottom right corner.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x0, y0, x1, y1 = box_coord
+        width = x1 - x0
+        height = y1 - y0
+
+        linewidth = max(self._default_font_size / 4, 1)
+
+        self.output.ax.add_patch(
+            mpl.patches.Rectangle(
+                (x0, y0),
+                width,
+                height,
+                fill=False,
+                edgecolor=edge_color,
+                linewidth=linewidth * self.output.scale,
+                alpha=alpha,
+                linestyle=line_style,
+            )
+        )
+        return self.output
+
+    def draw_rotated_box_with_label(
+        self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
+    ):
+        """
+        Args:
+            rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
+                where cnt_x and cnt_y are the center coordinates of the box.
+                w and h are the width and height of the box. angle represents how
+                many degrees the box is rotated CCW with regard to the 0-degree box.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+            label (string): label for rotated box. It will not be rendered when set to None.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        cnt_x, cnt_y, w, h, angle = rotated_box
+        area = w * h
+        # use thinner lines when the box is small
+        linewidth = self._default_font_size / (
+            6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
+        )
+
+        theta = angle * math.pi / 180.0
+        c = math.cos(theta)
+        s = math.sin(theta)
+        rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
+        # x: left->right ; y: top->down
+        rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
+        for k in range(4):
+            j = (k + 1) % 4
+            self.draw_line(
+                [rotated_rect[k][0], rotated_rect[j][0]],
+                [rotated_rect[k][1], rotated_rect[j][1]],
+                color=edge_color,
+                linestyle="--" if k == 1 else line_style,
+                linewidth=linewidth,
+            )
+
+        if label is not None:
+            text_pos = rotated_rect[1]  # topleft corner
+
+            height_ratio = h / np.sqrt(self.output.height * self.output.width)
+            label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
+            font_size = (
+                np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
+            )
+            self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
+
+        return self.output
+
+    def draw_circle(self, circle_coord, color, radius=3):
+        """
+        Args:
+            circle_coord (list(int) or tuple(int)): contains the x and y coordinates
+                of the center of the circle.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            radius (int): radius of the circle.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x, y = circle_coord
+        self.output.ax.add_patch(
+            mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
+        )
+        return self.output
+
+    def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
+        """
+        Args:
+            x_data (list[int]): a list containing x values of all the points being drawn.
+                Length of list should match the length of y_data.
+            y_data (list[int]): a list containing y values of all the points being drawn.
+                Length of list should match the length of x_data.
+            color: color of the line. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
+                for a full list of formats that are accepted.
+            linewidth (float or None): width of the line. When it's None,
+                a default value will be computed and used.
+
+        Returns:
+            output (VisImage): image object with line drawn.
+        """
+        if linewidth is None:
+            linewidth = self._default_font_size / 3
+        linewidth = max(linewidth, 1)
+        self.output.ax.add_line(
+            mpl.lines.Line2D(
+                x_data,
+                y_data,
+                linewidth=linewidth * self.output.scale,
+                color=color,
+                linestyle=linestyle,
+            )
+        )
+        return self.output
+
+    def draw_binary_mask(
+        self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=4096
+    ):
+        """
+        Args:
+            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
+                W is the image width. Each value in the array is either a 0 or 1 value of uint8
+                type.
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, will pick a random color.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted.
+            text (str): if None, will be drawn in the object's center of mass.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            area_threshold (float): a connected component small than this will not be shown.
+
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        if color is None:
+            color = random_color(rgb=True, maximum=1)
+        if area_threshold is None:
+            area_threshold = 4096
+
+        has_valid_segment = False
+        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
+        mask = GenericMask(binary_mask, self.output.height, self.output.width)
+        shape2d = (binary_mask.shape[0], binary_mask.shape[1])
+
+        if not mask.has_holes:
+            # draw polygons for regular masks
+            for segment in mask.polygons:
+                area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
+                if area < area_threshold:
+                    continue
+                has_valid_segment = True
+                segment = segment.reshape(-1, 2)
+                self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
+        else:
+            rgba = np.zeros(shape2d + (4,), dtype="float32")
+            rgba[:, :, :3] = color
+            rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
+            has_valid_segment = True
+            self.output.ax.imshow(rgba)
+
+        if text is not None and has_valid_segment:
+            # TODO sometimes drawn on wrong objects. the heuristics here can improve.
+            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+            _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
+            largest_component_id = np.argmax(stats[1:, -1]) + 1
+
+            # draw text on the largest component, as well as other very large components.
+            for cid in range(1, _num_cc):
+                if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
+                    # median is more stable than centroid
+                    # center = centroids[largest_component_id]
+                    center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
+                    self.draw_text(text, center, color=lighter_color)
+        return self.output
+
+    def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
+        """
+        Args:
+            segment: numpy array of shape Nx2, containing all the points in the polygon.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted. If not provided, a darker shade
+                of the polygon color will be used instead.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+
+        Returns:
+            output (VisImage): image object with polygon drawn.
+        """
+        if edge_color is None:
+            # make edge color darker than the polygon color
+            if alpha > 0.8:
+                edge_color = self._change_color_brightness(color, brightness_factor=-0.7)
+            else:
+                edge_color = color
+        edge_color = mplc.to_rgb(edge_color) + (1,)
+
+        polygon = mpl.patches.Polygon(
+            segment,
+            fill=True,
+            facecolor=mplc.to_rgb(color) + (alpha,),
+            edgecolor=edge_color,
+            linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
+        )
+        self.output.ax.add_patch(polygon)
+        return self.output
+
+    """
+    Internal methods:
+    """
+
+    def _jitter(self, color):
+        """
+        Randomly modifies given color to produce a slightly different color than the color given.
+
+        Args:
+            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
+                picked. The values in the list are in the [0.0, 1.0] range.
+
+        Returns:
+            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
+                color after being jittered. The values in the list are in the [0.0, 1.0] range.
+        """
+        color = mplc.to_rgb(color)
+        vec = np.random.rand(3)
+        # better to do it in another color space
+        vec = vec / np.linalg.norm(vec) * 0.5
+        res = np.clip(vec + color, 0, 1)
+        return tuple(res)
+
+    def _create_grayscale_image(self, mask=None):
+        """
+        Create a grayscale version of the original image.
+        The colors in masked area, if given, will be kept.
+        """
+        img_bw = self.img.astype("f4").mean(axis=2)
+        img_bw = np.stack([img_bw] * 3, axis=2)
+        if mask is not None:
+            img_bw[mask] = self.img[mask]
+        return img_bw
+
+    def _change_color_brightness(self, color, brightness_factor):
+        """
+        Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
+        less or more saturation than the original color.
+
+        Args:
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
+                0 will correspond to no change, a factor in [-1.0, 0) range will result in
+                a darker color and a factor in (0, 1.0] range will result in a lighter color.
+
+        Returns:
+            modified_color (tuple[double]): a tuple containing the RGB values of the
+                modified color. Each value in the tuple is in the [0.0, 1.0] range.
+        """
+        assert brightness_factor >= -1.0 and brightness_factor <= 1.0
+        color = mplc.to_rgb(color)
+        polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
+        modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
+        modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
+        modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
+        modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
+        return modified_color
+
+    def _convert_boxes(self, boxes):
+        """
+        Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
+        """
+        if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
+            return boxes.tensor.numpy()
+        else:
+            return np.asarray(boxes)
+
+    def _convert_masks(self, masks_or_polygons):
+        """
+        Convert different format of masks or polygons to a tuple of masks and polygons.
+
+        Returns:
+            list[GenericMask]:
+        """
+
+        m = masks_or_polygons
+        if isinstance(m, PolygonMasks):
+            m = m.polygons
+        if isinstance(m, BitMasks):
+            m = m.tensor.numpy()
+        if isinstance(m, torch.Tensor):
+            m = m.numpy()
+        ret = []
+        for x in m:
+            if isinstance(x, GenericMask):
+                ret.append(x)
+            else:
+                ret.append(GenericMask(x, self.output.height, self.output.width))
+        return ret
+
+    def _convert_keypoints(self, keypoints):
+        if isinstance(keypoints, Keypoints):
+            keypoints = keypoints.tensor
+        keypoints = np.asarray(keypoints)
+        return keypoints
+
+    def get_output(self):
+        """
+        Returns:
+            output (VisImage): the image output containing the visualizations added
+            to the image.
+        """
+        return self.output
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/README.md b/preprocess/humanparsing/mhp_extension/detectron2/dev/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc0d3297b2d436f279c3546c16c86f296402f6c5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/README.md
@@ -0,0 +1,7 @@
+
+## Some scripts for developers to use, include:
+
+- `linter.sh`: lint the codebase before commit
+- `run_{inference,instant}_tests.sh`: run inference/training for a few iterations.
+   Note that these tests require 2 GPUs.
+- `parse_results.sh`: parse results from a log file.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/linter.sh b/preprocess/humanparsing/mhp_extension/detectron2/dev/linter.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fd7081dbc27b85e5323d25085fb79c7ee3b54e4a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/linter.sh
@@ -0,0 +1,46 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Run this script at project root by "./dev/linter.sh" before you commit
+
+vergte() {
+  [ "$2" = "$(echo -e "$1\\n$2" | sort -V | head -n1)" ]
+}
+
+{
+  black --version | grep -E "(19.3b0.*6733274)|(19.3b0\\+8)" > /dev/null
+} || {
+	echo "Linter requires 'black @ git+https://github.com/psf/black@673327449f86fce558adde153bb6cbe54bfebad2' !"
+	exit 1
+}
+
+ISORT_TARGET_VERSION="4.3.21"
+ISORT_VERSION=$(isort -v | grep VERSION | awk '{print $2}')
+vergte "$ISORT_VERSION" "$ISORT_TARGET_VERSION" || {
+  echo "Linter requires isort>=${ISORT_TARGET_VERSION} !"
+  exit 1
+}
+
+set -v
+
+echo "Running isort ..."
+isort -y -sp . --atomic
+
+echo "Running black ..."
+black -l 100 .
+
+echo "Running flake8 ..."
+if [ -x "$(command -v flake8-3)" ]; then
+  flake8-3 .
+else
+  python3 -m flake8 .
+fi
+
+# echo "Running mypy ..."
+# Pytorch does not have enough type annotations
+# mypy detectron2/solver detectron2/structures detectron2/config
+
+echo "Running clang-format ..."
+find . -regex ".*\.\(cpp\|c\|cc\|cu\|cxx\|h\|hh\|hpp\|hxx\|tcc\|mm\|m\)" -print0 | xargs -0 clang-format -i
+
+command -v arc > /dev/null && arc lint
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/README.md b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..095684fcc1c5593805158c81aa0168263eb57ced
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/README.md
@@ -0,0 +1,17 @@
+
+## To build a cu101 wheel for release:
+
+```
+$ nvidia-docker run -it --storage-opt "size=20GB" --name pt  pytorch/manylinux-cuda101
+# inside the container:
+# git clone https://github.com/facebookresearch/detectron2/
+# cd detectron2
+# export CU_VERSION=cu101 D2_VERSION_SUFFIX= PYTHON_VERSION=3.7 PYTORCH_VERSION=1.4
+# ./dev/packaging/build_wheel.sh
+```
+
+## To build all wheels for `CUDA {9.2,10.0,10.1}` x `Python {3.6,3.7,3.8}`:
+```
+./dev/packaging/build_all_wheels.sh
+./dev/packaging/gen_wheel_index.sh /path/to/wheels
+```
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/build_all_wheels.sh b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/build_all_wheels.sh
new file mode 100644
index 0000000000000000000000000000000000000000..eb64dea70cda26f5d101c414af43645ef7e3a349
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/build_all_wheels.sh
@@ -0,0 +1,57 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+PYTORCH_VERSION=1.5
+
+build_for_one_cuda() {
+  cu=$1
+
+  case "$cu" in
+    cu*)
+      container_name=manylinux-cuda${cu/cu/}
+      ;;
+    cpu)
+      container_name=manylinux-cuda101
+      ;;
+    *)
+      echo "Unrecognized cu=$cu"
+      exit 1
+      ;;
+  esac
+
+  echo "Launching container $container_name ..."
+
+  for py in 3.6 3.7 3.8; do
+    docker run -itd \
+      --name $container_name \
+      --mount type=bind,source="$(pwd)",target=/detectron2 \
+      pytorch/$container_name
+
+    cat <<EOF | docker exec -i $container_name sh
+      export CU_VERSION=$cu D2_VERSION_SUFFIX=+$cu PYTHON_VERSION=$py
+      export PYTORCH_VERSION=$PYTORCH_VERSION
+      cd /detectron2 && ./dev/packaging/build_wheel.sh
+EOF
+
+#     if [[ "$cu" == "cu101" ]]; then
+#       # build wheel without local version
+#       cat <<EOF | docker exec -i $container_name sh
+#         export CU_VERSION=$cu D2_VERSION_SUFFIX= PYTHON_VERSION=$py
+#         export PYTORCH_VERSION=$PYTORCH_VERSION
+#         cd /detectron2 && ./dev/packaging/build_wheel.sh
+# EOF
+#     fi
+
+    docker exec -i $container_name rm -rf /detectron2/build/$cu
+    docker container stop $container_name
+    docker container rm $container_name
+  done
+}
+
+if [[ -n "$1" ]]; then
+  build_for_one_cuda "$1"
+else
+  for cu in cu102 cu101 cu92 cpu; do
+    build_for_one_cuda "$cu"
+  done
+fi
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/build_wheel.sh b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/build_wheel.sh
new file mode 100644
index 0000000000000000000000000000000000000000..bc80b56c6717edca7f11fe51de261beed7902514
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/build_wheel.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+set -ex
+
+ldconfig  # https://github.com/NVIDIA/nvidia-docker/issues/854
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+. "$script_dir/pkg_helpers.bash"
+
+echo "Build Settings:"
+echo "CU_VERSION: $CU_VERSION"                 # e.g. cu101
+echo "D2_VERSION_SUFFIX: $D2_VERSION_SUFFIX"   # e.g. +cu101 or ""
+echo "PYTHON_VERSION: $PYTHON_VERSION"         # e.g. 3.6
+echo "PYTORCH_VERSION: $PYTORCH_VERSION"       # e.g. 1.4
+
+setup_cuda
+setup_wheel_python
+yum install ninja-build -y && ln -sv /usr/bin/ninja-build /usr/bin/ninja
+
+export TORCH_VERSION_SUFFIX="+$CU_VERSION"
+if [[ "$CU_VERSION" == "cu102" ]]; then
+	export TORCH_VERSION_SUFFIX=""
+fi
+pip_install pip numpy -U
+pip_install "torch==$PYTORCH_VERSION$TORCH_VERSION_SUFFIX" \
+	-f https://download.pytorch.org/whl/$CU_VERSION/torch_stable.html
+
+# use separate directories to allow parallel build
+BASE_BUILD_DIR=build/$CU_VERSION/$PYTHON_VERSION
+python setup.py \
+  build -b $BASE_BUILD_DIR \
+  bdist_wheel -b $BASE_BUILD_DIR/build_dist -d wheels/$CU_VERSION
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/gen_wheel_index.sh b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/gen_wheel_index.sh
new file mode 100644
index 0000000000000000000000000000000000000000..44d6041cdf45afdd39a85d413f08373e8516999b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/gen_wheel_index.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+root=$1
+if [[ -z "$root" ]]; then
+  echo "Usage: ./gen_wheel_index.sh /path/to/wheels"
+  exit
+fi
+
+index=$root/index.html
+
+cd "$root"
+for cu in cpu cu92 cu100 cu101 cu102; do
+  cd $cu
+  echo "Creating $PWD/index.html ..."
+  for whl in *.whl; do
+    echo "<a href=\"${whl/+/%2B}\">$whl</a><br>"
+  done > index.html
+  cd "$root"
+done
+
+echo "Creating $index ..."
+for whl in $(find . -type f -name '*.whl' -printf '%P\n' | sort); do
+  echo "<a href=\"${whl/+/%2B}\">$whl</a><br>"
+done > "$index"
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/pkg_helpers.bash b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/pkg_helpers.bash
new file mode 100644
index 0000000000000000000000000000000000000000..51e6185c7fba6ba0f7a325c467993196f1c9b4ef
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/packaging/pkg_helpers.bash
@@ -0,0 +1,57 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Function to retry functions that sometimes timeout or have flaky failures
+retry () {
+    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+}
+# Install with pip a bit more robustly than the default
+pip_install() {
+  retry pip install --progress-bar off "$@"
+}
+
+
+setup_cuda() {
+  # Now work out the CUDA settings
+  # Like other torch domain libraries, we choose common GPU architectures only.
+  export FORCE_CUDA=1
+  case "$CU_VERSION" in
+    cu102)
+      export CUDA_HOME=/usr/local/cuda-10.2/
+      export TORCH_CUDA_ARCH_LIST="3.5;3.7;5.0;5.2;6.0+PTX;6.1+PTX;7.0+PTX;7.5+PTX"
+      ;;
+    cu101)
+      export CUDA_HOME=/usr/local/cuda-10.1/
+      export TORCH_CUDA_ARCH_LIST="3.5;3.7;5.0;5.2;6.0+PTX;6.1+PTX;7.0+PTX;7.5+PTX"
+      ;;
+    cu100)
+      export CUDA_HOME=/usr/local/cuda-10.0/
+      export TORCH_CUDA_ARCH_LIST="3.5;3.7;5.0;5.2;6.0+PTX;6.1+PTX;7.0+PTX;7.5+PTX"
+      ;;
+    cu92)
+      export CUDA_HOME=/usr/local/cuda-9.2/
+      export TORCH_CUDA_ARCH_LIST="3.5;3.7;5.0;5.2;6.0+PTX;6.1+PTX;7.0+PTX"
+      ;;
+    cpu)
+      unset FORCE_CUDA
+      export CUDA_VISIBLE_DEVICES=
+      ;;
+    *)
+      echo "Unrecognized CU_VERSION=$CU_VERSION"
+      exit 1
+      ;;
+  esac
+}
+
+setup_wheel_python() {
+  case "$PYTHON_VERSION" in
+    3.6) python_abi=cp36-cp36m ;;
+    3.7) python_abi=cp37-cp37m ;;
+    3.8) python_abi=cp38-cp38 ;;
+    *)
+      echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION"
+      exit 1
+      ;;
+  esac
+  export PATH="/opt/python/$python_abi/bin:$PATH"
+}
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/parse_results.sh b/preprocess/humanparsing/mhp_extension/detectron2/dev/parse_results.sh
new file mode 100644
index 0000000000000000000000000000000000000000..874b688889049e869854273c83182e5b019315b3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/parse_results.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# A shell script that parses metrics from the log file.
+# Make it easier for developers to track performance of models.
+
+LOG="$1"
+
+if [[ -z "$LOG" ]]; then
+	echo "Usage: $0 /path/to/log/file"
+	exit 1
+fi
+
+# [12/15 11:47:32] trainer INFO: Total training time: 12:15:04.446477 (0.4900 s / it)
+# [12/15 11:49:03] inference INFO: Total inference time: 0:01:25.326167 (0.13652186737060548 s / demo per device, on 8 devices)
+# [12/15 11:49:03] inference INFO: Total inference pure compute time: .....
+
+# training time
+trainspeed=$(grep -o 'Overall training.*' "$LOG" | grep -Eo '\(.*\)' | grep -o '[0-9\.]*')
+echo "Training speed: $trainspeed s/it"
+
+# inference time: there could be multiple inference during training
+inferencespeed=$(grep -o 'Total inference pure.*' "$LOG" | tail -n1 | grep -Eo '\(.*\)' | grep -o '[0-9\.]*' | head -n1)
+echo "Inference speed: $inferencespeed s/it"
+
+# [12/15 11:47:18] trainer INFO: eta: 0:00:00  iter: 90000  loss: 0.5407 (0.7256)  loss_classifier: 0.1744 (0.2446)  loss_box_reg: 0.0838 (0.1160)  loss_mask: 0.2159 (0.2722)  loss_objectness: 0.0244 (0.0429)  loss_rpn_box_reg: 0.0279 (0.0500)  time: 0.4487 (0.4899)  data: 0.0076 (0.0975) lr: 0.000200  max mem: 4161
+memory=$(grep -o 'max[_ ]mem: [0-9]*' "$LOG" | tail -n1 | grep -o '[0-9]*')
+echo "Training memory: $memory MB"
+
+echo "Easy to copypaste:"
+echo "$trainspeed","$inferencespeed","$memory"
+
+echo "------------------------------"
+
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: Task: bbox
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: AP,AP50,AP75,APs,APm,APl
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: 0.0017,0.0024,0.0017,0.0005,0.0019,0.0011
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: Task: segm
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: AP,AP50,AP75,APs,APm,APl
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: 0.0014,0.0021,0.0016,0.0005,0.0016,0.0011
+
+echo "COCO Results:"
+num_tasks=$(grep -o 'copypaste:.*Task.*' "$LOG" | sort -u | wc -l)
+# each task has 3 lines
+grep -o 'copypaste:.*' "$LOG" | cut -d ' ' -f 2- | tail -n $((num_tasks * 3))
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/run_inference_tests.sh b/preprocess/humanparsing/mhp_extension/detectron2/dev/run_inference_tests.sh
new file mode 100644
index 0000000000000000000000000000000000000000..17e422d576e5fe9efcd85790954c569c962657d6
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/run_inference_tests.sh
@@ -0,0 +1,44 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+BIN="python tools/train_net.py"
+OUTPUT="inference_test_output"
+NUM_GPUS=2
+
+CFG_LIST=( "${@:1}" )
+
+if [ ${#CFG_LIST[@]} -eq 0 ]; then
+  CFG_LIST=( ./configs/quick_schedules/*inference_acc_test.yaml )
+fi
+
+echo "========================================================================"
+echo "Configs to run:"
+echo "${CFG_LIST[@]}"
+echo "========================================================================"
+
+
+for cfg in "${CFG_LIST[@]}"; do
+    echo "========================================================================"
+    echo "Running $cfg ..."
+    echo "========================================================================"
+    $BIN \
+      --eval-only \
+      --num-gpus $NUM_GPUS \
+      --config-file "$cfg" \
+      OUTPUT_DIR $OUTPUT
+      rm -rf $OUTPUT
+done
+
+
+echo "========================================================================"
+echo "Running demo.py ..."
+echo "========================================================================"
+DEMO_BIN="python demo/demo.py"
+COCO_DIR=datasets/coco/val2014
+mkdir -pv $OUTPUT
+
+set -v
+
+$DEMO_BIN --config-file ./configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml \
+  --input $COCO_DIR/COCO_val2014_0000001933* --output $OUTPUT
+rm -rf $OUTPUT
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/dev/run_instant_tests.sh b/preprocess/humanparsing/mhp_extension/detectron2/dev/run_instant_tests.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2c51de649262e7371fb173210c8edc377e8177e0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/dev/run_instant_tests.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+BIN="python tools/train_net.py"
+OUTPUT="instant_test_output"
+NUM_GPUS=2
+
+CFG_LIST=( "${@:1}" )
+if [ ${#CFG_LIST[@]} -eq 0 ]; then
+  CFG_LIST=( ./configs/quick_schedules/*instant_test.yaml )
+fi
+
+echo "========================================================================"
+echo "Configs to run:"
+echo "${CFG_LIST[@]}"
+echo "========================================================================"
+
+for cfg in "${CFG_LIST[@]}"; do
+    echo "========================================================================"
+    echo "Running $cfg ..."
+    echo "========================================================================"
+    $BIN --num-gpus $NUM_GPUS --config-file "$cfg" \
+      SOLVER.IMS_PER_BATCH $(($NUM_GPUS * 2)) \
+      OUTPUT_DIR "$OUTPUT"
+    rm -rf "$OUTPUT"
+done
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docker/Dockerfile b/preprocess/humanparsing/mhp_extension/detectron2/docker/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..2a8603903e36eafb3a61fac0a086a919cc67fe38
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docker/Dockerfile
@@ -0,0 +1,49 @@
+FROM nvidia/cuda:10.1-cudnn7-devel
+
+ENV DEBIAN_FRONTEND noninteractive
+RUN apt-get update && apt-get install -y \
+	python3-opencv ca-certificates python3-dev git wget sudo  \
+	cmake ninja-build protobuf-compiler libprotobuf-dev && \
+  rm -rf /var/lib/apt/lists/*
+RUN ln -sv /usr/bin/python3 /usr/bin/python
+
+# create a non-root user
+ARG USER_ID=1000
+RUN useradd -m --no-log-init --system  --uid ${USER_ID} appuser -g sudo
+RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+USER appuser
+WORKDIR /home/appuser
+
+ENV PATH="/home/appuser/.local/bin:${PATH}"
+RUN wget https://bootstrap.pypa.io/get-pip.py && \
+	python3 get-pip.py --user && \
+	rm get-pip.py
+
+# install dependencies
+# See https://pytorch.org/ for other options if you use a different version of CUDA
+RUN pip install --user tensorboard cython
+RUN pip install --user torch==1.5+cu101 torchvision==0.6+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+RUN pip install --user 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
+
+RUN pip install --user 'git+https://github.com/facebookresearch/fvcore'
+# install detectron2
+RUN git clone https://github.com/facebookresearch/detectron2 detectron2_repo
+# set FORCE_CUDA because during `docker build` cuda is not accessible
+ENV FORCE_CUDA="1"
+# This will by default build detectron2 for all common cuda architectures and take a lot more time,
+# because inside `docker build`, there is no way to tell which architecture will be used.
+ARG TORCH_CUDA_ARCH_LIST="Kepler;Kepler+Tesla;Maxwell;Maxwell+Tegra;Pascal;Volta;Turing"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"
+
+RUN pip install --user -e detectron2_repo
+
+# Set a fixed model cache directory.
+ENV FVCORE_CACHE="/tmp"
+WORKDIR /home/appuser/detectron2_repo
+
+# run detectron2 under user "appuser":
+# wget http://images.cocodataset.org/val2017/000000439715.jpg -O input.jpg
+# python3 demo/demo.py  \
+	#--config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
+	#--input input.jpg --output outputs/ \
+	#--opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docker/Dockerfile-circleci b/preprocess/humanparsing/mhp_extension/detectron2/docker/Dockerfile-circleci
new file mode 100644
index 0000000000000000000000000000000000000000..bc0be845adc247eb458d212ae5352c594cd80a72
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docker/Dockerfile-circleci
@@ -0,0 +1,17 @@
+FROM nvidia/cuda:10.1-cudnn7-devel
+# This dockerfile only aims to provide an environment for unittest on CircleCI
+
+ENV DEBIAN_FRONTEND noninteractive
+RUN apt-get update && apt-get install -y \
+	python3-opencv ca-certificates python3-dev git wget sudo ninja-build && \
+  rm -rf /var/lib/apt/lists/*
+
+RUN wget -q https://bootstrap.pypa.io/get-pip.py && \
+	python3 get-pip.py && \
+	rm get-pip.py
+
+# install dependencies
+# See https://pytorch.org/ for other options if you use a different version of CUDA
+RUN pip install tensorboard cython
+RUN pip install torch==1.5+cu101 torchvision==0.6+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+RUN pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docker/README.md b/preprocess/humanparsing/mhp_extension/detectron2/docker/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..760c4054d0e4fa56a67ab4b59c14979498e2f94a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docker/README.md
@@ -0,0 +1,36 @@
+
+## Use the container (with docker ≥ 19.03)
+
+```
+cd docker/
+# Build:
+docker build --build-arg USER_ID=$UID -t detectron2:v0 .
+# Run:
+docker run --gpus all -it \
+	--shm-size=8gb --env="DISPLAY" --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \
+	--name=detectron2 detectron2:v0
+
+# Grant docker access to host X server to show images
+xhost +local:`docker inspect --format='{{ .Config.Hostname }}' detectron2`
+```
+
+## Use the container (with docker < 19.03)
+
+Install docker-compose and nvidia-docker2, then run:
+```
+cd docker && USER_ID=$UID docker-compose run detectron2
+```
+
+#### Using a persistent cache directory
+
+You can prevent models from being re-downloaded on every run,
+by storing them in a cache directory.
+
+To do this, add `--volume=$HOME/.torch/fvcore_cache:/tmp:rw` in the run command.
+
+## Install new dependencies
+Add the following to `Dockerfile` to make persistent changes.
+```
+RUN sudo apt-get update && sudo apt-get install -y vim
+```
+Or run them in the container to make temporary changes.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docker/docker-compose.yml b/preprocess/humanparsing/mhp_extension/detectron2/docker/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e660f44645a5cc164cd5a59f2cdcf7e1ded60c2e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docker/docker-compose.yml
@@ -0,0 +1,18 @@
+version: "2.3"
+services:
+  detectron2:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        USER_ID: ${USER_ID:-1000}
+    runtime: nvidia  # TODO: Exchange with "gpu: all" in the future (see https://github.com/facebookresearch/detectron2/pull/197/commits/00545e1f376918db4a8ce264d427a07c1e896c5a).
+    shm_size: "8gb"
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    volumes:
+      - /tmp/.X11-unix:/tmp/.X11-unix:ro
+    environment:
+      - DISPLAY=$DISPLAY
+      - NVIDIA_VISIBLE_DEVICES=all
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/.gitignore b/preprocess/humanparsing/mhp_extension/detectron2/docs/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e35d8850c9688b1ce82711694692cc574a799396
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/.gitignore
@@ -0,0 +1 @@
+_build
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/Makefile b/preprocess/humanparsing/mhp_extension/detectron2/docs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..d537643dd411736a5f309383cfef52ea7d5e4599
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/README.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2c65c3676b488f3654b7e3231e1cfd06df48d4be
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/README.md
@@ -0,0 +1,16 @@
+# Read the docs:
+
+The latest documentation built from this directory is available at [detectron2.readthedocs.io](https://detectron2.readthedocs.io/).
+Documents in this directory are not meant to be read on github.
+
+# Build the docs:
+
+1. Install detectron2 according to [INSTALL.md](INSTALL.md).
+2. Install additional libraries required to build docs:
+  - docutils==0.16
+  - Sphinx==3.0.0
+  - recommonmark==0.6.0
+  - sphinx_rtd_theme
+  - mock
+
+3. Run `make html` from this directory.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/conf.py b/preprocess/humanparsing/mhp_extension/detectron2/docs/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..44e9f2b4db549a3a5ef1420b27d408915e86657c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/conf.py
@@ -0,0 +1,335 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# flake8: noqa
+
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+import mock
+from sphinx.domains import Domain
+from typing import Dict, List, Tuple
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+import sphinx_rtd_theme
+
+
+class GithubURLDomain(Domain):
+    """
+    Resolve certain links in markdown files to github source.
+    """
+
+    name = "githuburl"
+    ROOT = "https://github.com/facebookresearch/detectron2/blob/master/"
+    LINKED_DOC = ["tutorials/install", "tutorials/getting_started"]
+
+    def resolve_any_xref(self, env, fromdocname, builder, target, node, contnode):
+        github_url = None
+        if not target.endswith("html") and target.startswith("../../"):
+            url = target.replace("../", "")
+            github_url = url
+        if fromdocname in self.LINKED_DOC:
+            # unresolved links in these docs are all github links
+            github_url = target
+
+        if github_url is not None:
+            if github_url.endswith("MODEL_ZOO") or github_url.endswith("README"):
+                # bug of recommonmark.
+                # https://github.com/readthedocs/recommonmark/blob/ddd56e7717e9745f11300059e4268e204138a6b1/recommonmark/parser.py#L152-L155
+                github_url += ".md"
+            print("Ref {} resolved to github:{}".format(target, github_url))
+            contnode["refuri"] = self.ROOT + github_url
+            return [("githuburl:any", contnode)]
+        else:
+            return []
+
+
+# to support markdown
+from recommonmark.parser import CommonMarkParser
+
+sys.path.insert(0, os.path.abspath("../"))
+os.environ["DOC_BUILDING"] = "True"
+DEPLOY = os.environ.get("READTHEDOCS") == "True"
+
+
+# -- Project information -----------------------------------------------------
+
+# fmt: off
+try:
+    import torch  # noqa
+except ImportError:
+    for m in [
+        "torch", "torchvision", "torch.nn", "torch.nn.parallel", "torch.distributed", "torch.multiprocessing", "torch.autograd",
+        "torch.autograd.function", "torch.nn.modules", "torch.nn.modules.utils", "torch.utils", "torch.utils.data", "torch.onnx",
+        "torchvision", "torchvision.ops",
+    ]:
+        sys.modules[m] = mock.Mock(name=m)
+    sys.modules['torch'].__version__ = "1.5"  # fake version
+
+for m in [
+    "cv2", "scipy", "portalocker", "detectron2._C",
+    "pycocotools", "pycocotools.mask", "pycocotools.coco", "pycocotools.cocoeval",
+    "google", "google.protobuf", "google.protobuf.internal", "onnx",
+    "caffe2", "caffe2.proto", "caffe2.python", "caffe2.python.utils", "caffe2.python.onnx", "caffe2.python.onnx.backend",
+]:
+    sys.modules[m] = mock.Mock(name=m)
+# fmt: on
+sys.modules["cv2"].__version__ = "3.4"
+
+import detectron2  # isort: skip
+
+
+project = "detectron2"
+copyright = "2019-2020, detectron2 contributors"
+author = "detectron2 contributors"
+
+# The short X.Y version
+version = detectron2.__version__
+# The full version, including alpha/beta/rc tags
+release = version
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+needs_sphinx = "3.0"
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "recommonmark",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.todo",
+    "sphinx.ext.coverage",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.githubpages",
+]
+
+# -- Configurations for plugins ------------
+napoleon_google_docstring = True
+napoleon_include_init_with_doc = True
+napoleon_include_special_with_doc = True
+napoleon_numpy_docstring = False
+napoleon_use_rtype = False
+autodoc_inherit_docstrings = False
+autodoc_member_order = "bysource"
+
+if DEPLOY:
+    intersphinx_timeout = 10
+else:
+    # skip this when building locally
+    intersphinx_timeout = 0.1
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3.6", None),
+    "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+    "torch": ("https://pytorch.org/docs/master/", None),
+}
+# -------------------------
+
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+source_suffix = [".rst", ".md"]
+
+# The master toctree document.
+master_doc = "index"
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "build", "README.md", "tutorials/README.md"]
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+
+
+# -- Options for HTML output -------------------------------------------------
+
+html_theme = "sphinx_rtd_theme"
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself.  Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = "detectron2doc"
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, "detectron2.tex", "detectron2 Documentation", "detectron2 contributors", "manual")
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, "detectron2", "detectron2 Documentation", [author], 1)]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (
+        master_doc,
+        "detectron2",
+        "detectron2 Documentation",
+        author,
+        "detectron2",
+        "One line description of project.",
+        "Miscellaneous",
+    )
+]
+
+
+# -- Options for todo extension ----------------------------------------------
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
+
+
+_DEPRECATED_NAMES = set()
+
+
+def autodoc_skip_member(app, what, name, obj, skip, options):
+    # we hide something deliberately
+    if getattr(obj, "__HIDE_SPHINX_DOC__", False):
+        return True
+    # Hide some names that are deprecated or not intended to be used
+    if name in _DEPRECATED_NAMES:
+        return True
+    return None
+
+
+_PAPER_DATA = {
+    "resnet": ("1512.03385", "Deep Residual Learning for Image Recognition"),
+    "fpn": ("1612.03144", "Feature Pyramid Networks for Object Detection"),
+    "mask r-cnn": ("1703.06870", "Mask R-CNN"),
+    "faster r-cnn": (
+        "1506.01497",
+        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks",
+    ),
+    "deformconv": ("1703.06211", "Deformable Convolutional Networks"),
+    "deformconv2": ("1811.11168", "Deformable ConvNets v2: More Deformable, Better Results"),
+    "panopticfpn": ("1901.02446", "Panoptic Feature Pyramid Networks"),
+    "retinanet": ("1708.02002", "Focal Loss for Dense Object Detection"),
+    "cascade r-cnn": ("1712.00726", "Cascade R-CNN: Delving into High Quality Object Detection"),
+    "lvis": ("1908.03195", "LVIS: A Dataset for Large Vocabulary Instance Segmentation"),
+    "rrpn": ("1703.01086", "Arbitrary-Oriented Scene Text Detection via Rotation Proposals"),
+    "in1k1h": ("1706.02677", "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour"),
+}
+
+
+def paper_ref_role(
+    typ: str,
+    rawtext: str,
+    text: str,
+    lineno: int,
+    inliner,
+    options: Dict = {},
+    content: List[str] = [],
+):
+    """
+    Parse :paper:`xxx`. Similar to the "extlinks" sphinx extension.
+    """
+    from docutils import nodes, utils
+    from sphinx.util.nodes import split_explicit_title
+
+    text = utils.unescape(text)
+    has_explicit_title, title, link = split_explicit_title(text)
+    link = link.lower()
+    if link not in _PAPER_DATA:
+        inliner.reporter.warning("Cannot find paper " + link)
+        paper_url, paper_title = "#", link
+    else:
+        paper_url, paper_title = _PAPER_DATA[link]
+        if "/" not in paper_url:
+            paper_url = "https://arxiv.org/abs/" + paper_url
+    if not has_explicit_title:
+        title = paper_title
+    pnode = nodes.reference(title, title, internal=False, refuri=paper_url)
+    return [pnode], []
+
+
+def setup(app):
+    from recommonmark.transform import AutoStructify
+
+    app.add_domain(GithubURLDomain)
+    app.connect("autodoc-skip-member", autodoc_skip_member)
+    app.add_role("paper", paper_ref_role)
+    app.add_config_value(
+        "recommonmark_config",
+        {"enable_math": True, "enable_inline_math": True, "enable_eval_rst": True},
+        True,
+    )
+    app.add_transform(AutoStructify)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/index.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8634b7b12ab906c10a78d6053428029799282ffd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/index.rst
@@ -0,0 +1,14 @@
+.. detectron2 documentation master file, created by
+   sphinx-quickstart on Sat Sep 21 13:46:45 2019.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to detectron2's documentation!
+======================================
+
+.. toctree::
+   :maxdepth: 2
+
+   tutorials/index
+   notes/index
+   modules/index
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/checkpoint.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/checkpoint.rst
new file mode 100644
index 0000000000000000000000000000000000000000..616cb186c40212d7a0ca311d21691245b2fce996
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/checkpoint.rst
@@ -0,0 +1,7 @@
+detectron2.checkpoint package
+=============================
+
+.. automodule:: detectron2.checkpoint
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/config.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/config.rst
new file mode 100644
index 0000000000000000000000000000000000000000..034bd5f5e8a79d9eb2109f86b7aa12eea9c8b786
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/config.rst
@@ -0,0 +1,17 @@
+detectron2.config package
+=========================
+
+.. automodule:: detectron2.config
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :inherited-members:
+
+
+Config References
+-----------------
+
+.. literalinclude:: ../../detectron2/config/defaults.py
+  :language: python
+  :linenos:
+  :lines: 4-
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/data.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/data.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3697f0e22f3351a68ee40e4cadbd3ee6d978af8d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/data.rst
@@ -0,0 +1,40 @@
+detectron2.data package
+=======================
+
+.. automodule:: detectron2.data
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.data.detection\_utils module
+---------------------------------------
+
+.. automodule:: detectron2.data.detection_utils
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.data.datasets module
+---------------------------------------
+
+.. automodule:: detectron2.data.datasets
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.data.samplers module
+---------------------------------------
+
+.. automodule:: detectron2.data.samplers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.data.transforms module
+---------------------------------------
+
+.. automodule:: detectron2.data.transforms
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/engine.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/engine.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bb8b533aee225b1096fe4353b03533208f92732e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/engine.rst
@@ -0,0 +1,25 @@
+detectron2.engine package
+=========================
+
+
+.. automodule:: detectron2.engine
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.engine.defaults module
+---------------------------------
+
+.. automodule:: detectron2.engine.defaults
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.engine.hooks module
+---------------------------------
+
+.. automodule:: detectron2.engine.hooks
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/evaluation.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/evaluation.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d9d34ff1a21c42b33ce2ad8b4415052af194397f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/evaluation.rst
@@ -0,0 +1,7 @@
+detectron2.evaluation package
+=============================
+
+.. automodule:: detectron2.evaluation
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/export.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/export.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bb7c3c9173cae323e67cb9330b292fefc40ec760
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/export.rst
@@ -0,0 +1,7 @@
+detectron2.export package
+=========================
+
+.. automodule:: detectron2.export
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/index.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1b246f570070b4f8ef47d00968498d49f0310a6e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/index.rst
@@ -0,0 +1,17 @@
+API Documentation
+==================
+
+.. toctree::
+
+    checkpoint
+    config
+    data
+    engine
+    evaluation
+    layers
+    model_zoo
+    modeling
+    solver
+    structures
+    utils
+    export
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/layers.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/layers.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6aeb5213a4b27edeb7c0b2bdb816fd1af8d22ce4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/layers.rst
@@ -0,0 +1,7 @@
+detectron2.layers package
+=========================
+
+.. automodule:: detectron2.layers
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/model_zoo.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/model_zoo.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8b1c7d598f509db2361928aac1be4f25854d9f93
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/model_zoo.rst
@@ -0,0 +1,7 @@
+detectron2.model_zoo package
+============================
+
+.. automodule:: detectron2.model_zoo
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/modeling.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/modeling.rst
new file mode 100644
index 0000000000000000000000000000000000000000..58ccd2c591774f3766f71da00b6938a0f4f3f592
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/modeling.rst
@@ -0,0 +1,58 @@
+detectron2.modeling package
+===========================
+
+.. automodule:: detectron2.modeling
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.modeling.poolers module
+---------------------------------------
+
+.. automodule:: detectron2.modeling.poolers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.modeling.sampling module
+------------------------------------
+
+.. automodule:: detectron2.modeling.sampling
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.modeling.box_regression module
+------------------------------------------
+
+.. automodule:: detectron2.modeling.box_regression
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+Model Registries
+-----------------
+
+These are different registries provided in modeling.
+Each registry provide you the ability to replace it with your customized component,
+without having to modify detectron2's code.
+
+Note that it is impossible to allow users to customize any line of code directly.
+Even just to add one line at some place,
+you'll likely need to find out the smallest registry which contains that line,
+and register your component to that registry.
+
+
+.. autodata:: detectron2.modeling.META_ARCH_REGISTRY
+.. autodata:: detectron2.modeling.BACKBONE_REGISTRY
+.. autodata:: detectron2.modeling.PROPOSAL_GENERATOR_REGISTRY
+.. autodata:: detectron2.modeling.RPN_HEAD_REGISTRY
+.. autodata:: detectron2.modeling.ANCHOR_GENERATOR_REGISTRY
+.. autodata:: detectron2.modeling.ROI_HEADS_REGISTRY
+.. autodata:: detectron2.modeling.ROI_BOX_HEAD_REGISTRY
+.. autodata:: detectron2.modeling.ROI_MASK_HEAD_REGISTRY
+.. autodata:: detectron2.modeling.ROI_KEYPOINT_HEAD_REGISTRY
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/solver.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/solver.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7f4a49f2ebaef2760b91eb7cecd32dcbff038efb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/solver.rst
@@ -0,0 +1,7 @@
+detectron2.solver package
+=========================
+
+.. automodule:: detectron2.solver
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/structures.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/structures.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5701c61abf5f74f61807e131f708304a8c9bab82
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/structures.rst
@@ -0,0 +1,7 @@
+detectron2.structures package
+=============================
+
+.. automodule:: detectron2.structures
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/utils.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/utils.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8b57292ac0e655f40756b19c8eea259bddb62aab
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/modules/utils.rst
@@ -0,0 +1,80 @@
+detectron2.utils package
+========================
+
+detectron2.utils.colormap module
+--------------------------------
+
+.. automodule:: detectron2.utils.colormap
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.utils.comm module
+----------------------------
+
+.. automodule:: detectron2.utils.comm
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.utils.events module
+------------------------------
+
+.. automodule:: detectron2.utils.events
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.utils.logger module
+------------------------------
+
+.. automodule:: detectron2.utils.logger
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.utils.registry module
+--------------------------------
+
+.. automodule:: detectron2.utils.registry
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.utils.memory module
+----------------------------------
+
+.. automodule:: detectron2.utils.memory
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.utils.analysis module
+----------------------------------
+
+.. automodule:: detectron2.utils.analysis
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.utils.visualizer module
+----------------------------------
+
+.. automodule:: detectron2.utils.visualizer
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.utils.video\_visualizer module
+-----------------------------------------
+
+.. automodule:: detectron2.utils.video_visualizer
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/benchmarks.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/benchmarks.md
new file mode 100644
index 0000000000000000000000000000000000000000..963f9210b39ce3ae248541644362631cb325d2b2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/benchmarks.md
@@ -0,0 +1,196 @@
+
+# Benchmarks
+
+Here we benchmark the training speed of a Mask R-CNN in detectron2,
+with some other popular open source Mask R-CNN implementations.
+
+
+### Settings
+
+* Hardware: 8 NVIDIA V100s with NVLink.
+* Software: Python 3.7, CUDA 10.1, cuDNN 7.6.5, PyTorch 1.5,
+  TensorFlow 1.15.0rc2, Keras 2.2.5, MxNet 1.6.0b20190820.
+* Model: an end-to-end R-50-FPN Mask-RCNN model, using the same hyperparameter as the
+  [Detectron baseline config](https://github.com/facebookresearch/Detectron/blob/master/configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml)
+	(it does no have scale augmentation).
+* Metrics: We use the average throughput in iterations 100-500 to skip GPU warmup time.
+  Note that for R-CNN-style models, the throughput of a model typically changes during training, because
+  it depends on the predictions of the model. Therefore this metric is not directly comparable with
+  "train speed" in model zoo, which is the average speed of the entire training run.
+
+
+### Main Results
+
+```eval_rst
++-------------------------------+--------------------+
+| Implementation                | Throughput (img/s) |
++===============================+====================+
+| |D2| |PT|                     | 62                 |
++-------------------------------+--------------------+
+| mmdetection_  |PT|            | 53                 |
++-------------------------------+--------------------+
+| maskrcnn-benchmark_  |PT|     | 53                 |
++-------------------------------+--------------------+
+| tensorpack_ |TF|              | 50                 |
++-------------------------------+--------------------+
+| simpledet_ |mxnet|            | 39                 |
++-------------------------------+--------------------+
+| Detectron_  |C2|              | 19                 |
++-------------------------------+--------------------+
+| `matterport/Mask_RCNN`__ |TF| | 14                 |
++-------------------------------+--------------------+
+
+.. _maskrcnn-benchmark: https://github.com/facebookresearch/maskrcnn-benchmark/
+.. _tensorpack: https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN
+.. _mmdetection: https://github.com/open-mmlab/mmdetection/
+.. _simpledet: https://github.com/TuSimple/simpledet/
+.. _Detectron: https://github.com/facebookresearch/Detectron
+__ https://github.com/matterport/Mask_RCNN/
+
+.. |D2| image:: https://github.com/facebookresearch/detectron2/raw/master/.github/Detectron2-Logo-Horz.svg?sanitize=true
+   :height: 15pt
+   :target: https://github.com/facebookresearch/detectron2/
+.. |PT| image:: https://pytorch.org/assets/images/logo-icon.svg
+   :width: 15pt
+   :height: 15pt
+   :target: https://pytorch.org
+.. |TF| image:: https://static.nvidiagrid.net/ngc/containers/tensorflow.png
+   :width: 15pt
+   :height: 15pt
+   :target: https://tensorflow.org
+.. |mxnet| image:: https://github.com/dmlc/web-data/raw/master/mxnet/image/mxnet_favicon.png
+   :width: 15pt
+   :height: 15pt
+   :target: https://mxnet.apache.org/
+.. |C2| image:: https://caffe2.ai/static/logo.svg
+   :width: 15pt
+   :height: 15pt
+   :target: https://caffe2.ai
+```
+
+
+Details for each implementation:
+
+* __Detectron2__: with release v0.1.2, run:
+  ```
+  python tools/train_net.py  --config-file configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml --num-gpus 8
+  ```
+
+* __mmdetection__: at commit `b0d845f`, run
+  ```
+  ./tools/dist_train.sh configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py 8
+  ```
+
+* __maskrcnn-benchmark__: use commit `0ce8f6f` with `sed -i ‘s/torch.uint8/torch.bool/g’ **/*.py; sed -i 's/AT_CHECK/TORCH_CHECK/g' **/*.cu`
+	to make it compatible with PyTorch 1.5. Then, run training with
+  ```
+  python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file configs/e2e_mask_rcnn_R_50_FPN_1x.yaml
+  ```
+  The speed we observed is faster than its model zoo, likely due to different software versions.
+
+* __tensorpack__: at commit `caafda`, `export TF_CUDNN_USE_AUTOTUNE=0`, then run
+  ```
+  mpirun -np 8 ./train.py --config DATA.BASEDIR=/data/coco TRAINER=horovod BACKBONE.STRIDE_1X1=True TRAIN.STEPS_PER_EPOCH=50 --load ImageNet-R50-AlignPadding.npz
+  ```
+
+* __SimpleDet__: at commit `9187a1`, run
+  ```
+  python detection_train.py --config config/mask_r50v1_fpn_1x.py
+  ```
+
+* __Detectron__: run
+  ```
+  python tools/train_net.py --cfg configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml
+  ```
+  Note that many of its ops run on CPUs, therefore the performance is limited.
+
+* __matterport/Mask_RCNN__: at commit `3deaec`, apply the following diff, `export TF_CUDNN_USE_AUTOTUNE=0`, then run
+  ```
+  python coco.py train --dataset=/data/coco/ --model=imagenet
+  ```
+  Note that many small details in this implementation might be different
+  from Detectron's standards.
+
+  <details>
+  <summary>
+  (diff to make it use the same hyperparameters - click to expand)
+  </summary>
+
+  ```diff
+  diff --git i/mrcnn/model.py w/mrcnn/model.py
+  index 62cb2b0..61d7779 100644
+  --- i/mrcnn/model.py
+  +++ w/mrcnn/model.py
+  @@ -2367,8 +2367,8 @@ class MaskRCNN():
+        epochs=epochs,
+        steps_per_epoch=self.config.STEPS_PER_EPOCH,
+        callbacks=callbacks,
+  -            validation_data=val_generator,
+  -            validation_steps=self.config.VALIDATION_STEPS,
+  +            #validation_data=val_generator,
+  +            #validation_steps=self.config.VALIDATION_STEPS,
+        max_queue_size=100,
+        workers=workers,
+        use_multiprocessing=True,
+  diff --git i/mrcnn/parallel_model.py w/mrcnn/parallel_model.py
+  index d2bf53b..060172a 100644
+  --- i/mrcnn/parallel_model.py
+  +++ w/mrcnn/parallel_model.py
+  @@ -32,6 +32,7 @@ class ParallelModel(KM.Model):
+      keras_model: The Keras model to parallelize
+      gpu_count: Number of GPUs. Must be > 1
+      """
+  +        super().__init__()
+      self.inner_model = keras_model
+      self.gpu_count = gpu_count
+      merged_outputs = self.make_parallel()
+  diff --git i/samples/coco/coco.py w/samples/coco/coco.py
+  index 5d172b5..239ed75 100644
+  --- i/samples/coco/coco.py
+  +++ w/samples/coco/coco.py
+  @@ -81,7 +81,10 @@ class CocoConfig(Config):
+    IMAGES_PER_GPU = 2
+
+    # Uncomment to train on 8 GPUs (default is 1)
+  -    # GPU_COUNT = 8
+  +    GPU_COUNT = 8
+  +    BACKBONE = "resnet50"
+  +    STEPS_PER_EPOCH = 50
+  +    TRAIN_ROIS_PER_IMAGE = 512
+
+    # Number of classes (including background)
+    NUM_CLASSES = 1 + 80  # COCO has 80 classes
+  @@ -496,29 +499,10 @@ if __name__ == '__main__':
+      # *** This training schedule is an example. Update to your needs ***
+
+      # Training - Stage 1
+  -        print("Training network heads")
+      model.train(dataset_train, dataset_val,
+            learning_rate=config.LEARNING_RATE,
+            epochs=40,
+  -                    layers='heads',
+  -                    augmentation=augmentation)
+  -
+  -        # Training - Stage 2
+  -        # Finetune layers from ResNet stage 4 and up
+  -        print("Fine tune Resnet stage 4 and up")
+  -        model.train(dataset_train, dataset_val,
+  -                    learning_rate=config.LEARNING_RATE,
+  -                    epochs=120,
+  -                    layers='4+',
+  -                    augmentation=augmentation)
+  -
+  -        # Training - Stage 3
+  -        # Fine tune all layers
+  -        print("Fine tune all layers")
+  -        model.train(dataset_train, dataset_val,
+  -                    learning_rate=config.LEARNING_RATE / 10,
+  -                    epochs=160,
+  -                    layers='all',
+  +                    layers='3+',
+            augmentation=augmentation)
+
+    elif args.command == "evaluate":
+  ```
+
+  </details>
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/changelog.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/changelog.md
new file mode 100644
index 0000000000000000000000000000000000000000..c0d4f5900bc64dbc4d2ce2d9bd31d32b9ee39f8f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/changelog.md
@@ -0,0 +1,26 @@
+# Change Log
+
+### Releases
+See release log at
+[https://github.com/facebookresearch/detectron2/releases](https://github.com/facebookresearch/detectron2/releases).
+
+### Notable Backward Incompatible Changes:
+
+* 03/30/2020: Custom box head's `output_size` changed to `output_shape`.
+* 02/14/2020,02/18/2020: Mask head and keypoint head now include logic for losses & inference. Custom heads
+	should overwrite the feature computation by `layers()` method.
+* 11/11/2019: `detectron2.data.detection_utils.read_image` transposes images with exif information.
+
+### Config Version Change Log
+
+* v1: Rename `RPN_HEAD.NAME` to `RPN.HEAD_NAME`.
+* v2: A batch of rename of many configurations before release.
+
+### Silent Regression in Historical Versions:
+
+We list a few silent regressions since they may silently produce incorrect results and will be hard to debug.
+
+* 04/01/2020 - 05/11/2020: Bad accuracy if `TRAIN_ON_PRED_BOXES` is set to True.
+* 03/30/2020 - 04/01/2020: ResNets are not correctly built.
+* 12/19/2019 - 12/26/2019: Using aspect ratio grouping causes a drop in accuracy.
+* release - 11/9/2019: Test time augmentation does not predict the last category.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/compatibility.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/compatibility.md
new file mode 100644
index 0000000000000000000000000000000000000000..f7b66c2e384b162864fb96a2fed44ba3084b8226
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/compatibility.md
@@ -0,0 +1,83 @@
+# Compatibility with Other Libraries
+
+## Compatibility with Detectron (and maskrcnn-benchmark)
+
+Detectron2 addresses some legacy issues left in Detectron. As a result, their models
+are not compatible:
+running inference with the same model weights will produce different results in the two code bases.
+
+The major differences regarding inference are:
+
+- The height and width of a box with corners (x1, y1) and (x2, y2) is now computed more naturally as
+  width = x2 - x1 and height = y2 - y1;
+  In Detectron, a "+ 1" was added both height and width.
+
+  Note that the relevant ops in Caffe2 have [adopted this change of convention](https://github.com/pytorch/pytorch/pull/20550)
+  with an extra option.
+  So it is still possible to run inference with a Detectron2-trained model in Caffe2.
+
+  The change in height/width calculations most notably changes:
+  - encoding/decoding in bounding box regression.
+  - non-maximum suppression. The effect here is very negligible, though.
+
+- RPN now uses simpler anchors with fewer quantization artifacts.
+
+  In Detectron, the anchors were quantized and
+  [do not have accurate areas](https://github.com/facebookresearch/Detectron/issues/227).
+  In Detectron2, the anchors are center-aligned to feature grid points and not quantized.
+
+- Classification layers have a different ordering of class labels.
+
+  This involves any trainable parameter with shape (..., num_categories + 1, ...).
+  In Detectron2, integer labels [0, K-1] correspond to the K = num_categories object categories
+  and the label "K" corresponds to the special "background" category.
+  In Detectron, label "0" means background, and labels [1, K] correspond to the K categories.
+
+- ROIAlign is implemented differently. The new implementation is [available in Caffe2](https://github.com/pytorch/pytorch/pull/23706).
+
+  1. All the ROIs are shifted by half a pixel compared to Detectron in order to create better image-feature-map alignment.
+     See `layers/roi_align.py` for details.
+     To enable the old behavior, use `ROIAlign(aligned=False)`, or `POOLER_TYPE=ROIAlign` instead of
+     `ROIAlignV2` (the default).
+
+  1. The ROIs are not required to have a minimum size of 1.
+     This will lead to tiny differences in the output, but should be negligible.
+
+- Mask inference function is different.
+
+  In Detectron2, the "paste_mask" function is different and should be more accurate than in Detectron. This change
+  can improve mask AP on COCO by ~0.5% absolute.
+
+There are some other differences in training as well, but they won't affect
+model-level compatibility. The major ones are:
+
+- We fixed a [bug](https://github.com/facebookresearch/Detectron/issues/459) in
+  Detectron, by making `RPN.POST_NMS_TOPK_TRAIN` per-image, rather than per-batch.
+  The fix may lead to a small accuracy drop for a few models (e.g. keypoint
+  detection) and will require some parameter tuning to match the Detectron results.
+- For simplicity, we change the default loss in bounding box regression to L1 loss, instead of smooth L1 loss.
+  We have observed that this tends to slightly decrease box AP50 while improving box AP for higher
+  overlap thresholds (and leading to a slight overall improvement in box AP).
+- We interpret the coordinates in COCO bounding box and segmentation annotations
+  as coordinates in range `[0, width]` or `[0, height]`. The coordinates in
+  COCO keypoint annotations are interpreted as pixel indices in range `[0, width - 1]` or `[0, height - 1]`.
+  Note that this affects how flip augmentation is implemented.
+
+
+We will later share more details and rationale behind the above mentioned issues
+about pixels, coordinates, and "+1"s.
+
+
+## Compatibility with Caffe2
+
+As mentioned above, despite the incompatibilities with Detectron, the relevant
+ops have been implemented in Caffe2.
+Therefore, models trained with detectron2 can be converted in Caffe2.
+See [Deployment](../tutorials/deployment.md) for the tutorial.
+
+## Compatibility with TensorFlow
+
+Most ops are available in TensorFlow, although some tiny differences in
+the implementation of resize / ROIAlign / padding need to be addressed.
+A working conversion script is provided by [tensorpack FasterRCNN](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN/convert_d2)
+to run a standard detectron2 model in TensorFlow.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/contributing.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/contributing.md
new file mode 100644
index 0000000000000000000000000000000000000000..81936dfedb495dd5cd21da2bfcf9819b97ed1dff
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/contributing.md
@@ -0,0 +1,49 @@
+# Contributing to detectron2
+
+## Issues
+We use GitHub issues to track public bugs and questions.
+Please make sure to follow one of the
+[issue templates](https://github.com/facebookresearch/detectron2/issues/new/choose)
+when reporting any issues.
+
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## Pull Requests
+We actively welcome your pull requests.
+
+However, if you're adding any significant features (e.g. > 50 lines), please
+make sure to have a corresponding issue to discuss your motivation and proposals,
+before sending a PR. We do not always accept new features, and we take the following
+factors into consideration:
+
+1. Whether the same feature can be achieved without modifying detectron2.
+Detectron2 is designed so that you can implement many extensions from the outside, e.g.
+those in [projects](https://github.com/facebookresearch/detectron2/tree/master/projects).
+If some part is not as extensible, you can also bring up the issue to make it more extensible.
+2. Whether the feature is potentially useful to a large audience, or only to a small portion of users.
+3. Whether the proposed solution has a good design / interface.
+4. Whether the proposed solution adds extra mental/practical overhead to users who don't
+   need such feature.
+5. Whether the proposed solution breaks existing APIs.
+
+When sending a PR, please do:
+
+1. If a PR contains multiple orthogonal changes, split it to several PRs.
+2. If you've added code that should be tested, add tests.
+3. For PRs that need experiments (e.g. adding a new model or new methods),
+	 you don't need to update model zoo, but do provide experiment results in the description of the PR.
+4. If APIs are changed, update the documentation.
+5. Make sure your code lints with `./dev/linter.sh`.
+
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## License
+By contributing to detectron2, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/index.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..63cf907be7bb15f5316af6d44a46df601755a86b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/notes/index.rst
@@ -0,0 +1,10 @@
+Notes
+======================================
+
+.. toctree::
+   :maxdepth: 2
+
+   benchmarks
+   compatibility
+   contributing
+   changelog
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/README.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1ca9c94d042ef838143a45490fe6b4556c19f3c9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/README.md
@@ -0,0 +1,4 @@
+# Read the docs:
+
+The latest documentation built from this directory is available at [detectron2.readthedocs.io](https://detectron2.readthedocs.io/).
+Documents in this directory are not meant to be read on github.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/builtin_datasets.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/builtin_datasets.md
new file mode 100644
index 0000000000000000000000000000000000000000..1a2633f95e6f6a5e54c8beca102a490036478587
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/builtin_datasets.md
@@ -0,0 +1,99 @@
+# Setup Builtin Datasets
+
+Detectron2 has builtin support for a few datasets.
+The datasets are assumed to exist in a directory specified by the environment variable
+`DETECTRON2_DATASETS`.
+Under this directory, detectron2 expects to find datasets in the structure described below.
+
+You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
+If left unset, the default is `./datasets` relative to your current working directory.
+
+The [model zoo](https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md)
+contains configs and models that use these builtin datasets.
+
+## Expected dataset structure for COCO instance/keypoint detection:
+
+```
+coco/
+  annotations/
+    instances_{train,val}2017.json
+    person_keypoints_{train,val}2017.json
+  {train,val}2017/
+    # image files that are mentioned in the corresponding json
+```
+
+You can use the 2014 version of the dataset as well.
+
+Some of the builtin tests (`dev/run_*_tests.sh`) uses a tiny version of the COCO dataset,
+which you can download with `./prepare_for_tests.sh`.
+
+## Expected dataset structure for PanopticFPN:
+
+```
+coco/
+  annotations/
+    panoptic_{train,val}2017.json
+  panoptic_{train,val}2017/  # png annotations
+  panoptic_stuff_{train,val}2017/  # generated by the script mentioned below
+```
+
+Install panopticapi by:
+```
+pip install git+https://github.com/cocodataset/panopticapi.git
+```
+Then, run `python prepare_panoptic_fpn.py`, to extract semantic annotations from panoptic annotations.
+
+## Expected dataset structure for LVIS instance segmentation:
+```
+coco/
+  {train,val,test}2017/
+lvis/
+  lvis_v0.5_{train,val}.json
+  lvis_v0.5_image_info_test.json
+```
+
+Install lvis-api by:
+```
+pip install git+https://github.com/lvis-dataset/lvis-api.git
+```
+
+Run `python prepare_cocofied_lvis.py` to prepare "cocofied" LVIS annotations for evaluation of models trained on the COCO dataset.
+
+## Expected dataset structure for cityscapes:
+```
+cityscapes/
+  gtFine/
+    train/
+      aachen/
+        color.png, instanceIds.png, labelIds.png, polygons.json,
+        labelTrainIds.png
+      ...
+    val/
+    test/
+  leftImg8bit/
+    train/
+    val/
+    test/
+```
+Install cityscapes scripts by:
+```
+pip install git+https://github.com/mcordts/cityscapesScripts.git
+```
+
+Note: labelTrainIds.png are created using cityscapesescript with:
+```
+CITYSCAPES_DATASET=$DETECTRON2_DATASETS/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py
+```
+They are not needed for instance segmentation.
+
+## Expected dataset structure for Pascal VOC:
+```
+VOC20{07,12}/
+  Annotations/
+  ImageSets/
+    Main/
+      trainval.txt
+      test.txt
+      # train.txt or val.txt, if you use these splits
+  JPEGImages/
+```
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/configs.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/configs.md
new file mode 100644
index 0000000000000000000000000000000000000000..ea82583825b51955993ca87d14c17ffb3ab031f4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/configs.md
@@ -0,0 +1,58 @@
+# Configs
+
+Detectron2 provides a key-value based config system that can be
+used to obtain standard, common behaviors.
+
+Detectron2's config system uses YAML and [yacs](https://github.com/rbgirshick/yacs).
+In addition to the [basic operations](../modules/config.html#detectron2.config.CfgNode)
+that access and update a config, we provide the following extra functionalities:
+
+1. The config can have `_BASE_: base.yaml` field, which will load a base config first.
+   Values in the base config will be overwritten in sub-configs, if there are any conflicts.
+   We provided several base configs for standard model architectures.
+2. We provide config versioning, for backward compatibility.
+   If your config file is versioned with a config line like `VERSION: 2`,
+   detectron2 will still recognize it even if we change some keys in the future.
+
+"Config" is a very limited abstraction.
+We do not expect all features in detectron2 to be available through configs.
+If you need something that's not available in the config space,
+please write code using detectron2's API.
+
+### Basic Usage
+
+Some basic usage of the `CfgNode` object is shown here. See more in [documentation](../modules/config.html#detectron2.config.CfgNode).
+```python
+from detectron2.config import get_cfg
+cfg = get_cfg()    # obtain detectron2's default config
+cfg.xxx = yyy      # add new configs for your own custom components
+cfg.merge_from_file("my_cfg.yaml")   # load values from a file
+
+cfg.merge_from_list(["MODEL.WEIGHTS", "weights.pth"])   # can also load values from a list of str
+print(cfg.dump())  # print formatted configs
+```
+
+Many builtin tools in detectron2 accepts command line config overwrite:
+Key-value pairs provided in the command line will overwrite the existing values in the config file.
+For example, [demo.py](../../demo/demo.py) can be used with
+```
+./demo.py --config-file config.yaml [--other-options] \
+  --opts MODEL.WEIGHTS /path/to/weights INPUT.MIN_SIZE_TEST 1000
+```
+
+To see a list of available configs in detectron2 and what they mean,
+check [Config References](../modules/config.html#config-references)
+
+
+### Best Practice with Configs
+
+1. Treat the configs you write as "code": avoid copying them or duplicating them; use `_BASE_`
+   to share common parts between configs.
+
+2. Keep the configs you write simple: don't include keys that do not affect the experimental setting.
+
+3. Keep a version number in your configs (or the base config), e.g., `VERSION: 2`,
+   for backward compatibility.
+	 We print a warning when reading a config without version number.
+   The official configs do not include version number because they are meant to
+   be always up-to-date.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/data_loading.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/data_loading.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb037ca534ccbb0cf82c456d0cd54544520b3a3f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/data_loading.md
@@ -0,0 +1,77 @@
+
+# Use Custom Dataloaders
+
+## How the Existing Dataloader Works
+
+Detectron2 contains a builtin data loading pipeline.
+It's good to understand how it works, in case you need to write a custom one.
+
+Detectron2 provides two functions
+[build_detection_{train,test}_loader](../modules/data.html#detectron2.data.build_detection_train_loader)
+that create a default data loader from a given config.
+Here is how `build_detection_{train,test}_loader` work:
+
+1. It takes the name of a registered dataset (e.g., "coco_2017_train") and loads a `list[dict]` representing the dataset items
+   in a lightweight, canonical format. These dataset items are not yet ready to be used by the model (e.g., images are
+   not loaded into memory, random augmentations have not been applied, etc.).
+   Details about the dataset format and dataset registration can be found in
+   [datasets](./datasets.md).
+2. Each dict in this list is mapped by a function ("mapper"):
+   * Users can customize this mapping function by specifying the "mapper" argument in
+        `build_detection_{train,test}_loader`. The default mapper is [DatasetMapper](../modules/data.html#detectron2.data.DatasetMapper).
+   * The output format of such function can be arbitrary, as long as it is accepted by the consumer of this data loader (usually the model).
+     The outputs of the default mapper, after batching, follow the default model input format documented in
+     [Use Models](./models.html#model-input-format).
+   * The role of the mapper is to transform the lightweight, canonical representation of a dataset item into a format
+     that is ready for the model to consume (including, e.g., read images, perform random data augmentation and convert to torch Tensors).
+     If you would like to perform custom transformations to data, you often want a custom mapper.
+3. The outputs of the mapper are batched (simply into a list).
+4. This batched data is the output of the data loader. Typically, it's also the input of
+   `model.forward()`.
+
+
+## Write a Custom Dataloader
+
+Using a different "mapper" with `build_detection_{train,test}_loader(mapper=)` works for most use cases
+of custom data loading.
+For example, if you want to resize all images to a fixed size for Mask R-CNN training, write this:
+
+```python
+from detectron2.data import build_detection_train_loader
+from detectron2.data import transforms as T
+from detectron2.data import detection_utils as utils
+
+def mapper(dataset_dict):
+	# Implement a mapper, similar to the default DatasetMapper, but with your own customizations
+	dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+	image = utils.read_image(dataset_dict["file_name"], format="BGR")
+	image, transforms = T.apply_transform_gens([T.Resize((800, 800))], image)
+	dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
+
+	annos = [
+		utils.transform_instance_annotations(obj, transforms, image.shape[:2])
+		for obj in dataset_dict.pop("annotations")
+		if obj.get("iscrowd", 0) == 0
+	]
+	instances = utils.annotations_to_instances(annos, image.shape[:2])
+	dataset_dict["instances"] = utils.filter_empty_instances(instances)
+	return dataset_dict
+
+data_loader = build_detection_train_loader(cfg, mapper=mapper)
+# use this dataloader instead of the default
+```
+Refer to [API documentation of detectron2.data](../modules/data) for details.
+
+If you want to change not only the mapper (e.g., to write different sampling or batching logic),
+you can write your own data loader. The data loader is simply a
+python iterator that produces [the format](./models.md) your model accepts.
+You can implement it using any tools you like.
+
+## Use a Custom Dataloader
+
+If you use [DefaultTrainer](../modules/engine.html#detectron2.engine.defaults.DefaultTrainer),
+you can overwrite its `build_{train,test}_loader` method to use your own dataloader.
+See the [densepose dataloader](../../projects/DensePose/train_net.py)
+for an example.
+
+If you write your own training loop, you can plug in your data loader easily.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/datasets.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/datasets.md
new file mode 100644
index 0000000000000000000000000000000000000000..8dc1c0c55598887e4de73e988567753ebf4538e2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/datasets.md
@@ -0,0 +1,221 @@
+# Use Custom Datasets
+
+Datasets that have builtin support in detectron2 are listed in [datasets](../../datasets).
+If you want to use a custom dataset while also reusing detectron2's data loaders,
+you will need to
+
+1. __Register__ your dataset (i.e., tell detectron2 how to obtain your dataset).
+2. Optionally, __register metadata__ for your dataset.
+
+Next, we explain the above two concepts in detail.
+
+The [Colab tutorial](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+has a live example of how to register and train on a dataset of custom formats.
+
+### Register a Dataset
+
+To let detectron2 know how to obtain a dataset named "my_dataset", you will implement
+a function that returns the items in your dataset and then tell detectron2 about this
+function:
+```python
+def my_dataset_function():
+  ...
+  return list[dict] in the following format
+
+from detectron2.data import DatasetCatalog
+DatasetCatalog.register("my_dataset", my_dataset_function)
+```
+
+Here, the snippet associates a dataset "my_dataset" with a function that returns the data.
+The registration stays effective until the process exists.
+
+The function can processes data from its original format into either one of the following:
+1. Detectron2's standard dataset dict, described below. This will work with many other builtin
+	 features in detectron2, so it's recommended to use it when it's sufficient for your task.
+2. Your custom dataset dict. You can also return arbitrary dicts in your own format,
+	 such as adding extra keys for new tasks.
+	 Then you will need to handle them properly downstream as well.
+	 See below for more details.
+
+#### Standard Dataset Dicts
+
+For standard tasks
+(instance detection, instance/semantic/panoptic segmentation, keypoint detection),
+we load the original dataset into `list[dict]` with a specification similar to COCO's json annotations.
+This is our standard representation for a dataset.
+
+Each dict contains information about one image.
+The dict may have the following fields,
+and the required fields vary based on what the dataloader or the task needs (see more below).
+
++ `file_name`: the full path to the image file. Will apply rotation and flipping if the image has such exif information.
++ `height`, `width`: integer. The shape of image.
++ `image_id` (str or int): a unique id that identifies this image. Used
+	during evaluation to identify the images, but a dataset may use it for different purposes.
++ `annotations` (list[dict]): each dict corresponds to annotations of one instance
+  in this image. Required by instance detection/segmentation or keypoint detection tasks.
+
+	Images with empty `annotations` will by default be removed from training,
+	but can be included using `DATALOADER.FILTER_EMPTY_ANNOTATIONS`.
+
+	Each dict contains the following keys, of which `bbox`,`bbox_mode` and `category_id` are required:
+  + `bbox` (list[float]): list of 4 numbers representing the bounding box of the instance.
+  + `bbox_mode` (int): the format of bbox.
+    It must be a member of
+    [structures.BoxMode](../modules/structures.html#detectron2.structures.BoxMode).
+    Currently supports: `BoxMode.XYXY_ABS`, `BoxMode.XYWH_ABS`.
+  + `category_id` (int): an integer in the range [0, num_categories) representing the category label.
+    The value num_categories is reserved to represent the "background" category, if applicable.
+  + `segmentation` (list[list[float]] or dict): the segmentation mask of the instance.
+    + If `list[list[float]]`, it represents a list of polygons, one for each connected component
+      of the object. Each `list[float]` is one simple polygon in the format of `[x1, y1, ..., xn, yn]`.
+      The Xs and Ys are either relative coordinates in [0, 1], or absolute coordinates,
+      depend on whether "bbox_mode" is relative.
+    + If `dict`, it represents the per-pixel segmentation mask in COCO's RLE format. The dict should have
+			keys "size" and "counts". You can convert a uint8 segmentation mask of 0s and 1s into
+			RLE format by `pycocotools.mask.encode(np.asarray(mask, order="F"))`.
+  + `keypoints` (list[float]): in the format of [x1, y1, v1,..., xn, yn, vn].
+    v[i] means the [visibility](http://cocodataset.org/#format-data) of this keypoint.
+    `n` must be equal to the number of keypoint categories.
+    The Xs and Ys are either relative coordinates in [0, 1], or absolute coordinates,
+    depend on whether "bbox_mode" is relative.
+
+    Note that the coordinate annotations in COCO format are integers in range [0, H-1 or W-1].
+    By default, detectron2 adds 0.5 to absolute keypoint coordinates to convert them from discrete
+    pixel indices to floating point coordinates.
+  + `iscrowd`: 0 (default) or 1. Whether this instance is labeled as COCO's "crowd
+    region". Don't include this field if you don't know what it means.
++ `sem_seg_file_name`: the full path to the ground truth semantic segmentation file.
+	Required by semantic segmentation task.
+	It should be an image whose pixel values are integer labels.
+
+
+Fast R-CNN (with precomputed proposals) is rarely used today.
+To train a Fast R-CNN, the following extra keys are needed:
+
++ `proposal_boxes` (array): 2D numpy array with shape (K, 4) representing K precomputed proposal boxes for this image.
++ `proposal_objectness_logits` (array): numpy array with shape (K, ), which corresponds to the objectness
+  logits of proposals in 'proposal_boxes'.
++ `proposal_bbox_mode` (int): the format of the precomputed proposal bbox.
+  It must be a member of
+  [structures.BoxMode](../modules/structures.html#detectron2.structures.BoxMode).
+  Default is `BoxMode.XYXY_ABS`.
+
+#### Custom Dataset Dicts for New Tasks
+
+In the `list[dict]` that your dataset function returns, the dictionary can also have arbitrary custom data.
+This will be useful for a new task that needs extra information not supported
+by the standard dataset dicts. In this case, you need to make sure the downstream code can handle your data
+correctly. Usually this requires writing a new `mapper` for the dataloader (see [Use Custom Dataloaders](./data_loading.md)).
+
+When designing a custom format, note that all dicts are stored in memory
+(sometimes serialized and with multiple copies).
+To save memory, each dict is meant to contain small but sufficient information
+about each sample, such as file names and annotations.
+Loading full samples typically happens in the data loader.
+
+For attributes shared among the entire dataset, use `Metadata` (see below).
+To avoid extra memory, do not save such information repeatly for each sample.
+
+### "Metadata" for Datasets
+
+Each dataset is associated with some metadata, accessible through
+`MetadataCatalog.get(dataset_name).some_metadata`.
+Metadata is a key-value mapping that contains information that's shared among
+the entire dataset, and usually is used to interpret what's in the dataset, e.g.,
+names of classes, colors of classes, root of files, etc.
+This information will be useful for augmentation, evaluation, visualization, logging, etc.
+The structure of metadata depends on the what is needed from the corresponding downstream code.
+
+If you register a new dataset through `DatasetCatalog.register`,
+you may also want to add its corresponding metadata through
+`MetadataCatalog.get(dataset_name).some_key = some_value`, to enable any features that need the metadata.
+You can do it like this (using the metadata key "thing_classes" as an example):
+
+```python
+from detectron2.data import MetadataCatalog
+MetadataCatalog.get("my_dataset").thing_classes = ["person", "dog"]
+```
+
+Here is a list of metadata keys that are used by builtin features in detectron2.
+If you add your own dataset without these metadata, some features may be
+unavailable to you:
+
+* `thing_classes` (list[str]): Used by all instance detection/segmentation tasks.
+  A list of names for each instance/thing category.
+  If you load a COCO format dataset, it will be automatically set by the function `load_coco_json`.
+
+* `thing_colors` (list[tuple(r, g, b)]): Pre-defined color (in [0, 255]) for each thing category.
+  Used for visualization. If not given, random colors are used.
+
+* `stuff_classes` (list[str]): Used by semantic and panoptic segmentation tasks.
+  A list of names for each stuff category.
+
+* `stuff_colors` (list[tuple(r, g, b)]): Pre-defined color (in [0, 255]) for each stuff category.
+  Used for visualization. If not given, random colors are used.
+
+* `keypoint_names` (list[str]): Used by keypoint localization. A list of names for each keypoint.
+
+* `keypoint_flip_map` (list[tuple[str]]): Used by the keypoint localization task. A list of pairs of names,
+  where each pair are the two keypoints that should be flipped if the image is
+  flipped horizontally during augmentation.
+* `keypoint_connection_rules`: list[tuple(str, str, (r, g, b))]. Each tuple specifies a pair of keypoints
+  that are connected and the color to use for the line between them when visualized.
+
+Some additional metadata that are specific to the evaluation of certain datasets (e.g. COCO):
+
+* `thing_dataset_id_to_contiguous_id` (dict[int->int]): Used by all instance detection/segmentation tasks in the COCO format.
+  A mapping from instance class ids in the dataset to contiguous ids in range [0, #class).
+  Will be automatically set by the function `load_coco_json`.
+
+* `stuff_dataset_id_to_contiguous_id` (dict[int->int]): Used when generating prediction json files for
+  semantic/panoptic segmentation.
+  A mapping from semantic segmentation class ids in the dataset
+  to contiguous ids in [0, num_categories). It is useful for evaluation only.
+
+* `json_file`: The COCO annotation json file. Used by COCO evaluation for COCO-format datasets.
+* `panoptic_root`, `panoptic_json`: Used by panoptic evaluation.
+* `evaluator_type`: Used by the builtin main training script to select
+   evaluator. Don't use it in a new training script.
+   You can just provide the [DatasetEvaluator](../modules/evaluation.html#detectron2.evaluation.DatasetEvaluator)
+   for your dataset directly in your main script.
+
+NOTE: For background on the concept of "thing" and "stuff", see
+[On Seeing Stuff: The Perception of Materials by Humans and Machines](http://persci.mit.edu/pub_pdfs/adelson_spie_01.pdf).
+In detectron2, the term "thing" is used for instance-level tasks,
+and "stuff" is used for semantic segmentation tasks.
+Both are used in panoptic segmentation.
+
+### Register a COCO Format Dataset
+
+If your dataset is already a json file in the COCO format,
+the dataset and its associated metadata can be registered easily with:
+```python
+from detectron2.data.datasets import register_coco_instances
+register_coco_instances("my_dataset", {}, "json_annotation.json", "path/to/image/dir")
+```
+
+If your dataset is in COCO format but with extra custom per-instance annotations,
+the [load_coco_json](../modules/data.html#detectron2.data.datasets.load_coco_json)
+function might be useful.
+
+### Update the Config for New Datasets
+
+Once you've registered the dataset, you can use the name of the dataset (e.g., "my_dataset" in
+example above) in `cfg.DATASETS.{TRAIN,TEST}`.
+There are other configs you might want to change to train or evaluate on new datasets:
+
+* `MODEL.ROI_HEADS.NUM_CLASSES` and `MODEL.RETINANET.NUM_CLASSES` are the number of thing classes
+	for R-CNN and RetinaNet models, respectively.
+* `MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS` sets the number of keypoints for Keypoint R-CNN.
+  You'll also need to set [Keypoint OKS](http://cocodataset.org/#keypoints-eval)
+	with `TEST.KEYPOINT_OKS_SIGMAS` for evaluation.
+* `MODEL.SEM_SEG_HEAD.NUM_CLASSES` sets the number of stuff classes for Semantic FPN & Panoptic FPN.
+* If you're training Fast R-CNN (with precomputed proposals), `DATASETS.PROPOSAL_FILES_{TRAIN,TEST}`
+	need to match the datasets. The format of proposal files are documented
+	[here](../modules/data.html#detectron2.data.load_proposals_into_dataset).
+
+New models
+(e.g. [TensorMask](../../projects/TensorMask),
+[PointRend](../../projects/PointRend))
+often have similar configs of their own that need to be changed as well.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/deployment.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/deployment.md
new file mode 100644
index 0000000000000000000000000000000000000000..a473247abf7df74e35b6de71c018f1aa34eaf435
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/deployment.md
@@ -0,0 +1,92 @@
+# Deployment
+
+## Caffe2 Deployment
+We currently support converting a detectron2 model to Caffe2 format through ONNX.
+The converted Caffe2 model is able to run without detectron2 dependency in either Python or C++.
+It has a runtime optimized for CPU & mobile inference, but not for GPU inference.
+
+Caffe2 conversion requires PyTorch ≥ 1.4 and ONNX ≥ 1.6.
+
+### Coverage
+
+It supports 3 most common meta architectures: `GeneralizedRCNN`, `RetinaNet`, `PanopticFPN`,
+and most official models under these 3 meta architectures.
+
+Users' custom extensions under these architectures (added through registration) are supported
+as long as they do not contain control flow or operators not available in Caffe2 (e.g. deformable convolution).
+For example, custom backbones and heads are often supported out of the box.
+
+### Usage
+
+The conversion APIs are documented at [the API documentation](../modules/export).
+We provide a tool, `caffe2_converter.py` as an example that uses
+these APIs to convert a standard model.
+
+To convert an official Mask R-CNN trained on COCO, first
+[prepare the COCO dataset](../../datasets/), then pick the model from [Model Zoo](../../MODEL_ZOO.md), and run:
+```
+cd tools/deploy/ && ./caffe2_converter.py --config-file ../../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
+	--output ./caffe2_model --run-eval \
+	MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl \
+	MODEL.DEVICE cpu
+```
+
+Note that:
+1. The conversion needs valid sample inputs & weights to trace the model. That's why the script requires the dataset.
+	 You can modify the script to obtain sample inputs in other ways.
+2. With the `--run-eval` flag, it will evaluate the converted models to verify its accuracy.
+   The accuracy is typically slightly different (within 0.1 AP) from PyTorch due to
+	 numerical precisions between different implementations.
+	 It's recommended to always verify the accuracy in case your custom model is not supported by the
+	 conversion.
+
+The converted model is available at the specified `caffe2_model/` directory. Two files `model.pb`
+and `model_init.pb` that contain network structure and network parameters are necessary for deployment.
+These files can then be loaded in C++ or Python using Caffe2's APIs.
+
+The script generates `model.svg` file which contains a visualization of the network.
+You can also load `model.pb` to tools such as [netron](https://github.com/lutzroeder/netron) to visualize it.
+
+### Use the model in C++/Python
+
+The model can be loaded in C++. An example [caffe2_mask_rcnn.cpp](../../tools/deploy/) is given,
+which performs CPU/GPU inference using `COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x`.
+
+The C++ example needs to be built with:
+* PyTorch with caffe2 inside
+* gflags, glog, opencv
+* protobuf headers that match the version of your caffe2
+* MKL headers if caffe2 is built with MKL
+
+The following can compile the example inside [official detectron2 docker](../../docker/):
+```
+sudo apt update && sudo apt install libgflags-dev libgoogle-glog-dev libopencv-dev
+pip install mkl-include
+wget https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz
+tar xf protobuf-cpp-3.6.1.tar.gz
+export CPATH=$(readlink -f ./protobuf-3.6.1/src/):$HOME/.local/include
+export CMAKE_PREFIX_PATH=$HOME/.local/lib/python3.6/site-packages/torch/
+mkdir build && cd build
+cmake -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST .. && make
+
+# To run:
+./caffe2_mask_rcnn --predict_net=./model.pb --init_net=./model_init.pb --input=input.jpg
+```
+
+Note that:
+
+* All converted models (the .pb files) take two input tensors:
+  "data" is an NCHW image, and "im_info" is an Nx3 tensor consisting of (height, width, 1.0) for
+  each image (the shape of "data" might be larger than that in "im_info" due to padding).
+
+* The converted models do not contain post-processing operations that
+  transform raw layer outputs into formatted predictions.
+  The example only produces raw outputs (28x28 masks) from the final
+  layers that are not post-processed, because in actual deployment, an application often needs
+  its custom lightweight post-processing (e.g. full-image masks for every detected object is often not necessary).
+
+We also provide a python wrapper around the converted model, in the
+[Caffe2Model.\_\_call\_\_](../modules/export.html#detectron2.export.Caffe2Model.__call__) method.
+This method has an interface that's identical to the [pytorch versions of models](./models.md),
+and it internally applies pre/post-processing code to match the formats.
+They can serve as a reference for pre/post-processing in actual deployment.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/evaluation.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/evaluation.md
new file mode 100644
index 0000000000000000000000000000000000000000..c71adb7eb2e554e5ea848f1feb44bbee01a13f8e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/evaluation.md
@@ -0,0 +1,43 @@
+
+# Evaluation
+
+Evaluation is a process that takes a number of inputs/outputs pairs and aggregate them.
+You can always [use the model](./models.md) directly and just parse its inputs/outputs manually to perform
+evaluation.
+Alternatively, evaluation is implemented in detectron2 using the [DatasetEvaluator](../modules/evaluation.html#detectron2.evaluation.DatasetEvaluator)
+interface.
+
+Detectron2 includes a few `DatasetEvaluator` that computes metrics using standard dataset-specific
+APIs (e.g., COCO, LVIS).
+You can also implement your own `DatasetEvaluator` that performs some other jobs
+using the inputs/outputs pairs.
+For example, to count how many instances are detected on the validation set:
+
+```
+class Counter(DatasetEvaluator):
+  def reset(self):
+    self.count = 0
+  def process(self, inputs, outputs):
+    for output in outputs:
+      self.count += len(output["instances"])
+  def evaluate(self):
+    # save self.count somewhere, or print it, or return it.
+    return {"count": self.count}
+```
+
+Once you have some `DatasetEvaluator`, you can run it with
+[inference_on_dataset](../modules/evaluation.html#detectron2.evaluation.inference_on_dataset).
+For example,
+
+```python
+val_results = inference_on_dataset(
+    model,
+    val_data_loader,
+    DatasetEvaluators([COCOEvaluator(...), Counter()]))
+```
+Compared to running the evaluation manually using the model, the benefit of this function is that
+you can merge evaluators together using [DatasetEvaluators](../modules/evaluation.html#detectron2.evaluation.DatasetEvaluators).
+In this way you can run all evaluations without having to go through the dataset multiple times.
+
+The `inference_on_dataset` function also provides accurate speed benchmarks for the
+given model and dataset.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/extend.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/extend.md
new file mode 100644
index 0000000000000000000000000000000000000000..4232185757139e45078bf58c4f0fffb5fa0e4c04
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/extend.md
@@ -0,0 +1,53 @@
+# Extend Detectron2's Defaults
+
+__Research is about doing things in new ways__.
+This brings a tension in how to create abstractions in code,
+which is a challenge for any research engineering project of a significant size:
+
+1. On one hand, it needs to have very thin abstractions to allow for the possibility of doing
+   everything in new ways. It should be reasonably easy to break existing
+   abstractions and replace them with new ones.
+
+2. On the other hand, such a project also needs reasonably high-level
+   abstractions, so that users can easily do things in standard ways,
+   without worrying too much about the details that only certain researchers care about.
+
+In detectron2, there are two types of interfaces that address this tension together:
+
+1. Functions and classes that take a config (`cfg`) argument
+   (sometimes with only a few extra arguments).
+
+   Such functions and classes implement
+   the "standard default" behavior: it will read what it needs from the
+   config and do the "standard" thing.
+   Users only need to load a given config and pass it around, without having to worry about
+   which arguments are used and what they all mean.
+
+2. Functions and classes that have well-defined explicit arguments.
+
+   Each of these is a small building block of the entire system.
+   They require users' expertise to understand what each argument should be,
+   and require more effort to stitch together to a larger system.
+   But they can be stitched together in more flexible ways.
+
+   When you need to implement something not supported by the "standard defaults"
+   included in detectron2, these well-defined components can be reused.
+
+3. (experimental) A few classes are implemented with the
+   [@configurable](../../modules/config.html#detectron2.config.configurable)
+   decorator - they can be called with either a config, or with explicit arguments.
+   Their explicit argument interfaces are currently __experimental__ and subject to change.
+
+
+If you only need the standard behavior, the [Beginner's Tutorial](./getting_started.md)
+should suffice. If you need to extend detectron2 to your own needs,
+see the following tutorials for more details:
+
+* Detectron2 includes a few standard datasets. To use custom ones, see
+  [Use Custom Datasets](./datasets.md).
+* Detectron2 contains the standard logic that creates a data loader for training/testing from a
+  dataset, but you can write your own as well. See [Use Custom Data Loaders](./data_loading.md).
+* Detectron2 implements many standard detection models, and provide ways for you
+  to overwrite their behaviors. See [Use Models](./models.md) and [Write Models](./write-models.md).
+* Detectron2 provides a default training loop that is good for common training tasks.
+  You can customize it with hooks, or write your own loop instead. See [training](./training.md).
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/getting_started.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/getting_started.md
new file mode 100644
index 0000000000000000000000000000000000000000..acaf13f02c906b45ffc2f49ee5a0ce01d82b4786
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/getting_started.md
@@ -0,0 +1,79 @@
+## Getting Started with Detectron2
+
+This document provides a brief intro of the usage of builtin command-line tools in detectron2.
+
+For a tutorial that involves actual coding with the API,
+see our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+which covers how to run inference with an
+existing model, and how to train a builtin model on a custom dataset.
+
+For more advanced tutorials, refer to our [documentation](https://detectron2.readthedocs.io/tutorials/extend.html).
+
+
+### Inference Demo with Pre-trained Models
+
+1. Pick a model and its config file from
+	[model zoo](MODEL_ZOO.md),
+	for example, `mask_rcnn_R_50_FPN_3x.yaml`.
+2. We provide `demo.py` that is able to run builtin standard models. Run it with:
+```
+cd demo/
+python demo.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
+  --input input1.jpg input2.jpg \
+  [--other-options]
+  --opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
+```
+The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation.
+This command will run the inference and show visualizations in an OpenCV window.
+
+For details of the command line arguments, see `demo.py -h` or look at its source code
+to understand its behavior. Some common arguments are:
+* To run __on your webcam__, replace `--input files` with `--webcam`.
+* To run __on a video__, replace `--input files` with `--video-input video.mp4`.
+* To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`.
+* To save outputs to a directory (for images) or a file (for webcam or video), use `--output`.
+
+
+### Training & Evaluation in Command Line
+
+We provide a script in "tools/{,plain_}train_net.py", that is made to train
+all the configs provided in detectron2.
+You may want to use it as a reference to write your own training script.
+
+To train a model with "train_net.py", first
+setup the corresponding datasets following
+[datasets/README.md](./datasets/README.md),
+then run:
+```
+cd tools/
+./train_net.py --num-gpus 8 \
+	--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
+```
+
+The configs are made for 8-GPU training.
+To train on 1 GPU, you may need to [change some parameters](https://arxiv.org/abs/1706.02677), e.g.:
+```
+./train_net.py \
+	--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
+	--num-gpus 1 SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
+```
+
+For most models, CPU training is not supported.
+
+To evaluate a model's performance, use
+```
+./train_net.py \
+	--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
+	--eval-only MODEL.WEIGHTS /path/to/checkpoint_file
+```
+For more options, see `./train_net.py -h`.
+
+### Use Detectron2 APIs in Your Code
+
+See our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+to learn how to use detectron2 APIs to:
+1. run inference with an existing model
+2. train a builtin model on a custom dataset
+
+See [detectron2/projects](https://github.com/facebookresearch/detectron2/tree/master/projects)
+for more ways to build your project on detectron2.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/index.rst b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..896e71e64139a35a566bbdd76e4b57006af35e2d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/index.rst
@@ -0,0 +1,18 @@
+Tutorials
+======================================
+
+.. toctree::
+   :maxdepth: 2
+
+   install
+   getting_started
+   builtin_datasets
+   extend
+   datasets
+   data_loading
+   models
+   write-models
+   training
+   evaluation
+   configs
+   deployment
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/install.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/install.md
new file mode 100644
index 0000000000000000000000000000000000000000..3985f8ae4f5ecde26b310b4ab01c49b922f742e9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/install.md
@@ -0,0 +1,184 @@
+## Installation
+
+Our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+has step-by-step instructions that install detectron2.
+The [Dockerfile](docker)
+also installs detectron2 with a few simple commands.
+
+### Requirements
+- Linux or macOS with Python ≥ 3.6
+- PyTorch ≥ 1.4
+- [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
+	You can install them together at [pytorch.org](https://pytorch.org) to make sure of this.
+- OpenCV, optional, needed by demo and visualization
+- pycocotools: `pip install cython; pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'`
+
+
+### Build Detectron2 from Source
+
+gcc & g++ ≥ 5 are required. [ninja](https://ninja-build.org/) is recommended for faster build.
+After having them, run:
+```
+python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
+# (add --user if you don't have permission)
+
+# Or, to install it from a local clone:
+git clone https://github.com/facebookresearch/detectron2.git
+python -m pip install -e detectron2
+
+# Or if you are on macOS
+# CC=clang CXX=clang++ python -m pip install -e .
+```
+
+To __rebuild__ detectron2 that's built from a local clone, use `rm -rf build/ **/*.so` to clean the
+old build first. You often need to rebuild detectron2 after reinstalling PyTorch.
+
+### Install Pre-Built Detectron2 (Linux only)
+```
+# for CUDA 10.1:
+python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/index.html
+```
+You can replace cu101 with "cu{100,92}" or "cpu".
+
+Note that:
+1. Such installation has to be used with certain version of official PyTorch release.
+   See [releases](https://github.com/facebookresearch/detectron2/releases) for requirements.
+   It will not work with a different version of PyTorch or a non-official build of PyTorch.
+2. Such installation is out-of-date w.r.t. master branch of detectron2. It may not be
+	 compatible with the master branch of a research project that uses detectron2 (e.g. those in
+	 [projects](projects) or [meshrcnn](https://github.com/facebookresearch/meshrcnn/)).
+
+### Common Installation Issues
+
+If you met issues using the pre-built detectron2, please uninstall it and try building it from source.
+
+Click each issue for its solutions:
+
+<details>
+<summary>
+Undefined torch/aten/caffe2 symbols, or segmentation fault immediately when running the library.
+</summary>
+<br/>
+
+This usually happens when detectron2 or torchvision is not
+compiled with the version of PyTorch you're running.
+
+Pre-built torchvision or detectron2 has to work with the corresponding official release of pytorch.
+If the error comes from a pre-built torchvision, uninstall torchvision and pytorch and reinstall them
+following [pytorch.org](http://pytorch.org). So the versions will match.
+
+If the error comes from a pre-built detectron2, check [release notes](https://github.com/facebookresearch/detectron2/releases)
+to see the corresponding pytorch version required for each pre-built detectron2.
+
+If the error comes from detectron2 or torchvision that you built manually from source,
+remove files you built (`build/`, `**/*.so`) and rebuild it so it can pick up the version of pytorch currently in your environment.
+
+If you cannot resolve this problem, please include the output of `gdb -ex "r" -ex "bt" -ex "quit" --args python -m detectron2.utils.collect_env`
+in your issue.
+</details>
+
+<details>
+<summary>
+Undefined C++ symbols (e.g. `GLIBCXX`) or C++ symbols not found.
+</summary>
+<br/>
+Usually it's because the library is compiled with a newer C++ compiler but run with an old C++ runtime.
+
+This often happens with old anaconda.
+Try `conda update libgcc`. Then rebuild detectron2.
+
+The fundamental solution is to run the code with proper C++ runtime.
+One way is to use `LD_PRELOAD=/path/to/libstdc++.so`.
+
+</details>
+
+<details>
+<summary>
+"Not compiled with GPU support" or "Detectron2 CUDA Compiler: not available".
+</summary>
+<br/>
+CUDA is not found when building detectron2.
+You should make sure
+
+```
+python -c 'import torch; from torch.utils.cpp_extension import CUDA_HOME; print(torch.cuda.is_available(), CUDA_HOME)'
+```
+
+print valid outputs at the time you build detectron2.
+
+Most models can run inference (but not training) without GPU support. To use CPUs, set `MODEL.DEVICE='cpu'` in the config.
+</details>
+
+<details>
+<summary>
+"invalid device function" or "no kernel image is available for execution".
+</summary>
+<br/>
+Two possibilities:
+
+* You build detectron2 with one version of CUDA but run it with a different version.
+
+  To check whether it is the case,
+  use `python -m detectron2.utils.collect_env` to find out inconsistent CUDA versions.
+	In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA"
+	to contain cuda libraries of the same version.
+
+	When they are inconsistent,
+	you need to either install a different build of PyTorch (or build by yourself)
+	to match your local CUDA installation, or install a different version of CUDA to match PyTorch.
+
+* Detectron2 or PyTorch/torchvision is not built for the correct GPU architecture (compute compatibility).
+
+	The GPU architecture for PyTorch/detectron2/torchvision is available in the "architecture flags" in
+	`python -m detectron2.utils.collect_env`.
+
+	The GPU architecture flags of detectron2/torchvision by default matches the GPU model detected
+	during compilation. This means the compiled code may not work on a different GPU model.
+	To overwrite the GPU architecture for detectron2/torchvision, use `TORCH_CUDA_ARCH_LIST` environment variable during compilation.
+
+	For example, `export TORCH_CUDA_ARCH_LIST=6.0,7.0` makes it compile for both P100s and V100s.
+	Visit [developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus) to find out
+	the correct compute compatibility number for your device.
+
+</details>
+
+<details>
+<summary>
+Undefined CUDA symbols; cannot open libcudart.so; other nvcc failures.
+</summary>
+<br/>
+The version of NVCC you use to build detectron2 or torchvision does
+not match the version of CUDA you are running with.
+This often happens when using anaconda's CUDA runtime.
+
+Use `python -m detectron2.utils.collect_env` to find out inconsistent CUDA versions.
+In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA"
+to contain cuda libraries of the same version.
+
+When they are inconsistent,
+you need to either install a different build of PyTorch (or build by yourself)
+to match your local CUDA installation, or install a different version of CUDA to match PyTorch.
+</details>
+
+
+<details>
+<summary>
+"ImportError: cannot import name '_C'".
+</summary>
+<br/>
+Please build and install detectron2 following the instructions above.
+
+If you are running code from detectron2's root directory, `cd` to a different one.
+Otherwise you may not import the code that you installed.
+</details>
+
+<details>
+<summary>
+ONNX conversion segfault after some "TraceWarning".
+</summary>
+<br/>
+The ONNX package is compiled with too old compiler.
+
+Please build and install ONNX from its source code using a compiler
+whose version is closer to what's used by PyTorch (available in `torch.__config__.show()`).
+</details>
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/models.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/models.md
new file mode 100644
index 0000000000000000000000000000000000000000..456f36d1c03f657ba0b63eb6f26506c4b1b0d60f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/models.md
@@ -0,0 +1,151 @@
+# Use Models
+
+Models (and their sub-models) in detectron2 are built by
+functions such as `build_model`, `build_backbone`, `build_roi_heads`:
+```python
+from detectron2.modeling import build_model
+model = build_model(cfg)  # returns a torch.nn.Module
+```
+
+`build_model` only builds the model structure, and fill it with random parameters.
+See below for how to load an existing checkpoint to the model,
+and how to use the `model` object.
+
+### Load/Save a Checkpoint
+```python
+from detectron2.checkpoint import DetectionCheckpointer
+DetectionCheckpointer(model).load(file_path)   # load a file to model
+
+checkpointer = DetectionCheckpointer(model, save_dir="output")
+checkpointer.save("model_999")  # save to output/model_999.pth
+```
+
+Detectron2's checkpointer recognizes models in pytorch's `.pth` format, as well as the `.pkl` files
+in our model zoo.
+See [API doc](../modules/checkpoint.html#detectron2.checkpoint.DetectionCheckpointer)
+for more details about its usage.
+
+The model files can be arbitrarily manipulated using `torch.{load,save}` for `.pth` files or
+`pickle.{dump,load}` for `.pkl` files.
+
+### Use a Model
+
+A model can be called by `outputs = model(inputs)`, where `inputs` is a `list[dict]`.
+Each dict corresponds to one image and the required keys
+depend on the type of model, and whether the model is in training or evaluation mode.
+For example, in order to do inference,
+all existing models expect the "image" key, and optionally "height" and "width".
+The detailed format of inputs and outputs of existing models are explained below.
+
+When in training mode, all models are required to be used under an `EventStorage`.
+The training statistics will be put into the storage:
+```python
+from detectron2.utils.events import EventStorage
+with EventStorage() as storage:
+  losses = model(inputs)
+```
+
+If you only want to do simple inference using an existing model,
+[DefaultPredictor](../modules/engine.html#detectron2.engine.defaults.DefaultPredictor)
+is a wrapper around model that provides such basic functionality.
+It includes default behavior including model loading, preprocessing,
+and operates on single image rather than batches.
+
+### Model Input Format
+
+Users can implement custom models that support any arbitrary input format.
+Here we describe the standard input format that all builtin models support in detectron2.
+They all take a `list[dict]` as the inputs. Each dict
+corresponds to information about one image.
+
+The dict may contain the following keys:
+
+* "image": `Tensor` in (C, H, W) format. The meaning of channels are defined by `cfg.INPUT.FORMAT`.
+  Image normalization, if any, will be performed inside the model using
+	`cfg.MODEL.PIXEL_{MEAN,STD}`.
+* "instances": an [Instances](../modules/structures.html#detectron2.structures.Instances)
+  object, with the following fields:
+  + "gt_boxes": a [Boxes](../modules/structures.html#detectron2.structures.Boxes) object storing N boxes, one for each instance.
+  + "gt_classes": `Tensor` of long type, a vector of N labels, in range [0, num_categories).
+  + "gt_masks": a [PolygonMasks](../modules/structures.html#detectron2.structures.PolygonMasks)
+    or [BitMasks](../modules/structures.html#detectron2.structures.BitMasks) object storing N masks, one for each instance.
+  + "gt_keypoints": a [Keypoints](../modules/structures.html#detectron2.structures.Keypoints)
+    object storing N keypoint sets, one for each instance.
+* "proposals": an [Instances](../modules/structures.html#detectron2.structures.Instances)
+  object used only in Fast R-CNN style models, with the following fields:
+  + "proposal_boxes": a [Boxes](../modules/structures.html#detectron2.structures.Boxes) object storing P proposal boxes.
+  + "objectness_logits": `Tensor`, a vector of P scores, one for each proposal.
+* "height", "width": the **desired** output height and width, which is not necessarily the same
+  as the height or width of the `image` input field.
+  For example, the `image` input field might be a resized image,
+  but you may want the outputs to be in **original** resolution.
+
+  If provided, the model will produce output in this resolution,
+  rather than in the resolution of the `image` as input into the model. This is more efficient and accurate.
+* "sem_seg": `Tensor[int]` in (H, W) format. The semantic segmentation ground truth.
+  Values represent category labels starting from 0.
+
+
+#### How it connects to data loader:
+
+The output of the default [DatasetMapper]( ../modules/data.html#detectron2.data.DatasetMapper) is a dict
+that follows the above format.
+After the data loader performs batching, it becomes `list[dict]` which the builtin models support.
+
+
+### Model Output Format
+
+When in training mode, the builtin models output a `dict[str->ScalarTensor]` with all the losses.
+
+When in inference mode, the builtin models output a `list[dict]`, one dict for each image.
+Based on the tasks the model is doing, each dict may contain the following fields:
+
+* "instances": [Instances](../modules/structures.html#detectron2.structures.Instances)
+  object with the following fields:
+  * "pred_boxes": [Boxes](../modules/structures.html#detectron2.structures.Boxes) object storing N boxes, one for each detected instance.
+  * "scores": `Tensor`, a vector of N scores.
+  * "pred_classes": `Tensor`, a vector of N labels in range [0, num_categories).
+  + "pred_masks": a `Tensor` of shape (N, H, W), masks for each detected instance.
+  + "pred_keypoints": a `Tensor` of shape (N, num_keypoint, 3).
+    Each row in the last dimension is (x, y, score). Scores are larger than 0.
+* "sem_seg": `Tensor` of (num_categories, H, W), the semantic segmentation prediction.
+* "proposals": [Instances](../modules/structures.html#detectron2.structures.Instances)
+  object with the following fields:
+  * "proposal_boxes": [Boxes](../modules/structures.html#detectron2.structures.Boxes)
+    object storing N boxes.
+  * "objectness_logits": a torch vector of N scores.
+* "panoptic_seg": A tuple of `(Tensor, list[dict])`. The tensor has shape (H, W), where each element
+  represent the segment id of the pixel. Each dict describes one segment id and has the following fields:
+  * "id": the segment id
+  * "isthing": whether the segment is a thing or stuff
+  * "category_id": the category id of this segment. It represents the thing
+       class id when `isthing==True`, and the stuff class id otherwise.
+
+
+### Partially execute a model:
+
+Sometimes you may want to obtain an intermediate tensor inside a model.
+Since there are typically hundreds of intermediate tensors, there isn't an API that provides you
+the intermediate result you need.
+You have the following options:
+
+1. Write a (sub)model. Following the [tutorial](./write-models.md), you can
+   rewrite a model component (e.g. a head of a model), such that it
+   does the same thing as the existing component, but returns the output
+   you need.
+2. Partially execute a model. You can create the model as usual,
+   but use custom code to execute it instead of its `forward()`. For example,
+   the following code obtains mask features before mask head.
+
+```python
+images = ImageList.from_tensors(...)  # preprocessed input tensor
+model = build_model(cfg)
+features = model.backbone(images.tensor)
+proposals, _ = model.proposal_generator(images, features)
+instances = model.roi_heads._forward_box(features, proposals)
+mask_features = [features[f] for f in model.roi_heads.in_features]
+mask_features = model.roi_heads.mask_pooler(mask_features, [x.pred_boxes for x in instances])
+```
+
+Note that both options require you to read the existing forward code to understand
+how to write code to obtain the outputs you need.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/training.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/training.md
new file mode 100644
index 0000000000000000000000000000000000000000..dc7d537254c398252e3b91c25e33489aa91709c4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/training.md
@@ -0,0 +1,50 @@
+# Training
+
+From the previous tutorials, you may now have a custom model and data loader.
+
+You are free to create your own optimizer, and write the training logic: it's
+usually easy with PyTorch, and allow researchers to see the entire training
+logic more clearly and have full control.
+One such example is provided in [tools/plain_train_net.py](../../tools/plain_train_net.py).
+
+We also provide a standarized "trainer" abstraction with a
+[minimal hook system](../modules/engine.html#detectron2.engine.HookBase)
+that helps simplify the standard types of training.
+
+You can use
+[SimpleTrainer().train()](../modules/engine.html#detectron2.engine.SimpleTrainer)
+which provides minimal abstraction for single-cost single-optimizer single-data-source training.
+The builtin `train_net.py` script uses
+[DefaultTrainer().train()](../modules/engine.html#detectron2.engine.defaults.DefaultTrainer),
+which includes more standard default behavior that one might want to opt in,
+including default configurations for learning rate schedule,
+logging, evaluation, checkpointing etc.
+This also means that it's less likely to support some non-standard behavior
+you might want during research.
+
+To customize the training loops, you can:
+
+1. If your customization is similar to what `DefaultTrainer` is already doing,
+you can change behavior of `DefaultTrainer` by overwriting [its methods](../modules/engine.html#detectron2.engine.defaults.DefaultTrainer)
+in a subclass, like what [tools/train_net.py](../../tools/train_net.py) does.
+2. If you need something very novel, you can start from [tools/plain_train_net.py](../../tools/plain_train_net.py) to implement them yourself.
+
+### Logging of Metrics
+
+During training, metrics are saved to a centralized [EventStorage](../modules/utils.html#detectron2.utils.events.EventStorage).
+You can use the following code to access it and log metrics to it:
+```
+from detectron2.utils.events import get_event_storage
+
+# inside the model:
+if self.training:
+  value = # compute the value from inputs
+  storage = get_event_storage()
+  storage.put_scalar("some_accuracy", value)
+```
+
+Refer to its documentation for more details.
+
+Metrics are then saved to various destinations with [EventWriter](../modules/utils.html#module-detectron2.utils.events).
+DefaultTrainer enables a few `EventWriter` with default configurations.
+See above for how to customize them.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/write-models.md b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/write-models.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb87d586d609ca94240f32f2eaab7eadb0d07b93
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/docs/tutorials/write-models.md
@@ -0,0 +1,39 @@
+# Write Models
+
+If you are trying to do something completely new, you may wish to implement
+a model entirely from scratch within detectron2. However, in many situations you may
+be interested in modifying or extending some components of an existing model.
+Therefore, we also provide a registration mechanism that lets you override the
+behavior of certain internal components of standard models.
+
+For example, to add a new backbone, import this code in your code:
+```python
+from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
+
+@BACKBONE_REGISTRY.register()
+class ToyBackBone(Backbone):
+  def __init__(self, cfg, input_shape):
+    # create your own backbone
+    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=16, padding=3)
+
+  def forward(self, image):
+    return {"conv1": self.conv1(image)}
+
+  def output_shape(self):
+    return {"conv1": ShapeSpec(channels=64, stride=16)}
+```
+Then, you can use `cfg.MODEL.BACKBONE.NAME = 'ToyBackBone'` in your config object.
+`build_model(cfg)` will then call your `ToyBackBone` instead.
+
+As another example, to add new abilities to the ROI heads in the Generalized R-CNN meta-architecture,
+you can implement a new
+[ROIHeads](../modules/modeling.html#detectron2.modeling.ROIHeads) subclass and put it in the `ROI_HEADS_REGISTRY`.
+See [densepose in detectron2](../../projects/DensePose)
+and [meshrcnn](https://github.com/facebookresearch/meshrcnn)
+for examples that implement new ROIHeads to perform new tasks.
+And [projects/](../../projects/)
+contains more examples that implement different architectures.
+
+A complete list of registries can be found in [API documentation](../modules/modeling.html#model-registries).
+You can register components in these registries to customize different parts of a model, or the
+entire model.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/README.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fd2f1ee3382365ab53ae44471c90266dff42d883
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/README.md
@@ -0,0 +1,54 @@
+# DensePose in Detectron2
+**Dense Human Pose Estimation In The Wild**
+
+_Rıza Alp Güler, Natalia Neverova, Iasonas Kokkinos_
+
+[[`densepose.org`](https://densepose.org)] [[`arXiv`](https://arxiv.org/abs/1802.00434)] [[`BibTeX`](#CitingDensePose)]
+
+Dense human pose estimation aims at mapping all human pixels of an RGB image to the 3D surface of the human body.
+
+<div align="center">
+  <img src="https://drive.google.com/uc?export=view&id=1qfSOkpueo1kVZbXOuQJJhyagKjMgepsz" width="700px" />
+</div>
+
+In this repository, we provide the code to train and evaluate DensePose-RCNN. We also provide tools to visualize
+DensePose annotation and results.
+
+# Quick Start
+
+See [ Getting Started ](doc/GETTING_STARTED.md)
+
+# Model Zoo and Baselines
+
+We provide a number of baseline results and trained models available for download. See [Model Zoo](doc/MODEL_ZOO.md) for details.
+
+# License
+
+Detectron2 is released under the [Apache 2.0 license](../../LICENSE)
+
+## <a name="CitingDensePose"></a>Citing DensePose
+
+If you use DensePose, please take the references from the following BibTeX entries:
+
+For DensePose with estimated confidences:
+
+```
+@InProceedings{Neverova2019DensePoseConfidences,
+    title = {Correlated Uncertainty for Learning Dense Correspondences from Noisy Labels},
+    author = {Neverova, Natalia and Novotny, David and Vedaldi, Andrea},
+    journal = {Advances in Neural Information Processing Systems},
+    year = {2019},
+}
+```
+
+For the original DensePose:
+
+```
+@InProceedings{Guler2018DensePose,
+  title={DensePose: Dense Human Pose Estimation In The Wild},
+  author={R\{i}za Alp G\"uler, Natalia Neverova, Iasonas Kokkinos},
+  journal={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year={2018}
+}
+```
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/apply_net.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/apply_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..7262f7c059b42225b809429654d34f29dbd2801f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/apply_net.py
@@ -0,0 +1,318 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import argparse
+import glob
+import logging
+import os
+import pickle
+import sys
+from typing import Any, ClassVar, Dict, List
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.engine.defaults import DefaultPredictor
+from detectron2.structures.boxes import BoxMode
+from detectron2.structures.instances import Instances
+from detectron2.utils.logger import setup_logger
+
+from densepose import add_densepose_config
+from densepose.utils.logger import verbosity_to_level
+from densepose.vis.base import CompoundVisualizer
+from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer
+from densepose.vis.densepose import (
+    DensePoseResultsContourVisualizer,
+    DensePoseResultsFineSegmentationVisualizer,
+    DensePoseResultsUVisualizer,
+    DensePoseResultsVVisualizer,
+)
+from densepose.vis.extractor import CompoundExtractor, create_extractor
+
+DOC = """Apply Net - a tool to print / visualize DensePose results
+"""
+
+LOGGER_NAME = "apply_net"
+logger = logging.getLogger(LOGGER_NAME)
+
+_ACTION_REGISTRY: Dict[str, "Action"] = {}
+
+
+class Action(object):
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        parser.add_argument(
+            "-v",
+            "--verbosity",
+            action="count",
+            help="Verbose mode. Multiple -v options increase the verbosity.",
+        )
+
+
+def register_action(cls: type):
+    """
+    Decorator for action classes to automate action registration
+    """
+    global _ACTION_REGISTRY
+    _ACTION_REGISTRY[cls.COMMAND] = cls
+    return cls
+
+
+class InferenceAction(Action):
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(InferenceAction, cls).add_arguments(parser)
+        parser.add_argument("cfg", metavar="<config>", help="Config file")
+        parser.add_argument("model", metavar="<model>", help="Model file")
+        parser.add_argument("input", metavar="<input>", help="Input data")
+        parser.add_argument(
+            "--opts",
+            help="Modify config options using the command-line 'KEY VALUE' pairs",
+            default=[],
+            nargs=argparse.REMAINDER,
+        )
+
+    @classmethod
+    def execute(cls: type, args: argparse.Namespace):
+        logger.info(f"Loading config from {args.cfg}")
+        opts = []
+        cfg = cls.setup_config(args.cfg, args.model, args, opts)
+        logger.info(f"Loading model from {args.model}")
+        predictor = DefaultPredictor(cfg)
+        logger.info(f"Loading data from {args.input}")
+        file_list = cls._get_input_file_list(args.input)
+        if len(file_list) == 0:
+            logger.warning(f"No input images for {args.input}")
+            return
+        context = cls.create_context(args)
+        for file_name in file_list:
+            img = read_image(file_name, format="BGR")  # predictor expects BGR image.
+            with torch.no_grad():
+                outputs = predictor(img)["instances"]
+                cls.execute_on_outputs(context, {"file_name": file_name, "image": img}, outputs)
+        cls.postexecute(context)
+
+    @classmethod
+    def setup_config(
+        cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
+    ):
+        cfg = get_cfg()
+        add_densepose_config(cfg)
+        cfg.merge_from_file(config_fpath)
+        cfg.merge_from_list(args.opts)
+        if opts:
+            cfg.merge_from_list(opts)
+        cfg.MODEL.WEIGHTS = model_fpath
+        cfg.freeze()
+        return cfg
+
+    @classmethod
+    def _get_input_file_list(cls: type, input_spec: str):
+        if os.path.isdir(input_spec):
+            file_list = [
+                os.path.join(input_spec, fname)
+                for fname in os.listdir(input_spec)
+                if os.path.isfile(os.path.join(input_spec, fname))
+            ]
+        elif os.path.isfile(input_spec):
+            file_list = [input_spec]
+        else:
+            file_list = glob.glob(input_spec)
+        return file_list
+
+
+@register_action
+class DumpAction(InferenceAction):
+    """
+    Dump action that outputs results to a pickle file
+    """
+
+    COMMAND: ClassVar[str] = "dump"
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Dump model outputs to a file.")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(DumpAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "--output",
+            metavar="<dump_file>",
+            default="results.pkl",
+            help="File name to save dump to",
+        )
+
+    @classmethod
+    def execute_on_outputs(
+        cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
+    ):
+        image_fpath = entry["file_name"]
+        logger.info(f"Processing {image_fpath}")
+        result = {"file_name": image_fpath}
+        if outputs.has("scores"):
+            result["scores"] = outputs.get("scores").cpu()
+        if outputs.has("pred_boxes"):
+            result["pred_boxes_XYXY"] = outputs.get("pred_boxes").tensor.cpu()
+            if outputs.has("pred_densepose"):
+                boxes_XYWH = BoxMode.convert(
+                    result["pred_boxes_XYXY"], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+                )
+                result["pred_densepose"] = outputs.get("pred_densepose").to_result(boxes_XYWH)
+        context["results"].append(result)
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace):
+        context = {"results": [], "out_fname": args.output}
+        return context
+
+    @classmethod
+    def postexecute(cls: type, context: Dict[str, Any]):
+        out_fname = context["out_fname"]
+        out_dir = os.path.dirname(out_fname)
+        if len(out_dir) > 0 and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+        with open(out_fname, "wb") as hFile:
+            pickle.dump(context["results"], hFile)
+            logger.info(f"Output saved to {out_fname}")
+
+
+@register_action
+class ShowAction(InferenceAction):
+    """
+    Show action that visualizes selected entries on an image
+    """
+
+    COMMAND: ClassVar[str] = "show"
+    VISUALIZERS: ClassVar[Dict[str, object]] = {
+        "dp_contour": DensePoseResultsContourVisualizer,
+        "dp_segm": DensePoseResultsFineSegmentationVisualizer,
+        "dp_u": DensePoseResultsUVisualizer,
+        "dp_v": DensePoseResultsVVisualizer,
+        "bbox": ScoredBoundingBoxVisualizer,
+    }
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(ShowAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "visualizations",
+            metavar="<visualizations>",
+            help="Comma separated list of visualizations, possible values: "
+            "[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
+        )
+        parser.add_argument(
+            "--min_score",
+            metavar="<score>",
+            default=0.8,
+            type=float,
+            help="Minimum detection score to visualize",
+        )
+        parser.add_argument(
+            "--nms_thresh", metavar="<threshold>", default=None, type=float, help="NMS threshold"
+        )
+        parser.add_argument(
+            "--output",
+            metavar="<image_file>",
+            default="outputres.png",
+            help="File name to save output to",
+        )
+
+    @classmethod
+    def setup_config(
+        cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
+    ):
+        opts.append("MODEL.ROI_HEADS.SCORE_THRESH_TEST")
+        opts.append(str(args.min_score))
+        if args.nms_thresh is not None:
+            opts.append("MODEL.ROI_HEADS.NMS_THRESH_TEST")
+            opts.append(str(args.nms_thresh))
+        cfg = super(ShowAction, cls).setup_config(config_fpath, model_fpath, args, opts)
+        return cfg
+
+    @classmethod
+    def execute_on_outputs(
+        cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
+    ):
+        import cv2
+        import numpy as np
+
+        visualizer = context["visualizer"]
+        extractor = context["extractor"]
+        image_fpath = entry["file_name"]
+        logger.info(f"Processing {image_fpath}")
+        image = cv2.cvtColor(entry["image"], cv2.COLOR_BGR2GRAY)
+        image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
+        data = extractor(outputs)
+        image_vis = visualizer.visualize(image, data)
+        entry_idx = context["entry_idx"] + 1
+        out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
+        out_dir = os.path.dirname(out_fname)
+        if len(out_dir) > 0 and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+        cv2.imwrite(out_fname, image_vis)
+        logger.info(f"Output saved to {out_fname}")
+        context["entry_idx"] += 1
+
+    @classmethod
+    def postexecute(cls: type, context: Dict[str, Any]):
+        pass
+
+    @classmethod
+    def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
+        base, ext = os.path.splitext(fname_base)
+        return base + ".{0:04d}".format(entry_idx) + ext
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
+        vis_specs = args.visualizations.split(",")
+        visualizers = []
+        extractors = []
+        for vis_spec in vis_specs:
+            vis = cls.VISUALIZERS[vis_spec]()
+            visualizers.append(vis)
+            extractor = create_extractor(vis)
+            extractors.append(extractor)
+        visualizer = CompoundVisualizer(visualizers)
+        extractor = CompoundExtractor(extractors)
+        context = {
+            "extractor": extractor,
+            "visualizer": visualizer,
+            "out_fname": args.output,
+            "entry_idx": 0,
+        }
+        return context
+
+
+def create_argument_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description=DOC,
+        formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
+    )
+    parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
+    subparsers = parser.add_subparsers(title="Actions")
+    for _, action in _ACTION_REGISTRY.items():
+        action.add_parser(subparsers)
+    return parser
+
+
+def main():
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    verbosity = args.verbosity if hasattr(args, "verbosity") else None
+    global logger
+    logger = setup_logger(name=LOGGER_NAME)
+    logger.setLevel(verbosity_to_level(verbosity))
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3ed1bcd68744a22472cc8b391993e4175013dc42
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml
@@ -0,0 +1,47 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+
+  DENSEPOSE_ON: True
+  ROI_HEADS:
+    NAME: "DensePoseROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    POOLER_TYPE: "ROIAlign"
+    NUM_COARSE_SEGM_CHANNELS: 2
+DATASETS:
+  TRAIN: ("densepose_coco_2014_train", "densepose_coco_2014_valminusminival")
+  TEST: ("densepose_coco_2014_minival",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  WARMUP_FACTOR: 0.1
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..15475b1ac3bb7272a7ebc0061a55119ffd2591b9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7546b967ab89129c9a276f19b1cf2d6b59f1a462
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..045f7f02f1b4eb0c0ef1733c3ac65e3aa70168de
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml
@@ -0,0 +1,10 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ace62094fbc4ce2024810333c11c7a955d8eeb22
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..766c098f6dcdd1fb3f67957d7d1d982b37747b96
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..af44fb767edf9bf093463e62f93e070d0d019c5a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8e79a1b9549cf19ed4a43cf9caf3dc88f6133310
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml
@@ -0,0 +1,17 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+  ROI_DENSEPOSE_HEAD:
+    NUM_COARSE_SEGM_CHANNELS: 15
+    POOLER_RESOLUTION: 14
+    HEATMAP_SIZE: 56
+    INDEX_WEIGHTS: 2.0
+    PART_WEIGHTS: 0.3
+    POINT_REGRESSION_WEIGHTS: 0.1
+    DECODER_ON: False
+SOLVER:
+  BASE_LR: 0.002
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f3720eff56ce042a68da6c99f484b963cae2c7d9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5a47cc05e6e9dc882778c6b502d93cbcec88fb88
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..52a170b4a28289ad943314f77256e34800d23121
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml
@@ -0,0 +1,10 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d36e54256ac22f1b01604e54430da24972f06eeb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e880d469564a3757ba3f4d708054074cefda49b6
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+  WARMUP_FACTOR: 0.025
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d2dd14c6f92f3850b99e6f1c828c0fcee52120e1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6c5391f3b3c3d437312a290d29b0656cb3804b25
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml
@@ -0,0 +1,17 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    NUM_COARSE_SEGM_CHANNELS: 15
+    POOLER_RESOLUTION: 14
+    HEATMAP_SIZE: 56
+    INDEX_WEIGHTS: 2.0
+    PART_WEIGHTS: 0.3
+    POINT_REGRESSION_WEIGHTS: 0.1
+    DECODER_ON: False
+SOLVER:
+  BASE_LR: 0.002
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/evolution/Base-RCNN-FPN-MC.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/evolution/Base-RCNN-FPN-MC.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5a20882a9fd275bac3e3cf49c128684c73085ca1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/evolution/Base-RCNN-FPN-MC.yaml
@@ -0,0 +1,91 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "StandardROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+  ROI_MASK_HEAD:
+    NAME: "MaskRCNNConvUpsampleHead"
+    NUM_CONV: 4
+    POOLER_RESOLUTION: 14
+DATASETS:
+  TRAIN: ("base_coco_2017_train",)
+  TEST: ("base_coco_2017_val", "densepose_chimps")
+  CATEGORY_MAPS:
+    "base_coco_2017_train":
+      "16": 1 # bird -> person
+      "17": 1 # cat -> person
+      "18": 1 # dog -> person
+      "19": 1 # horse -> person
+      "20": 1 # sheep -> person
+      "21": 1 # cow -> person
+      "22": 1 # elephant -> person
+      "23": 1 # bear -> person
+      "24": 1 # zebra -> person
+      "25": 1 # girafe -> person
+    "base_coco_2017_val":
+      "16": 1 # bird -> person
+      "17": 1 # cat -> person
+      "18": 1 # dog -> person
+      "19": 1 # horse -> person
+      "20": 1 # sheep -> person
+      "21": 1 # cow -> person
+      "22": 1 # elephant -> person
+      "23": 1 # bear -> person
+      "24": 1 # zebra -> person
+      "25": 1 # girafe -> person
+  WHITELISTED_CATEGORIES:
+    "base_coco_2017_train":
+      - 1  # person
+      - 16 # bird
+      - 17 # cat
+      - 18 # dog
+      - 19 # horse
+      - 20 # sheep
+      - 21 # cow
+      - 22 # elephant
+      - 23 # bear
+      - 24 # zebra
+      - 25 # girafe
+    "base_coco_2017_val":
+      - 1  # person
+      - 16 # bird
+      - 17 # cat
+      - 18 # dog
+      - 19 # horse
+      - 20 # sheep
+      - 21 # cow
+      - 22 # elephant
+      - 23 # bear
+      - 24 # zebra
+      - 25 # girafe
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/evolution/faster_rcnn_R_50_FPN_1x_MC.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/evolution/faster_rcnn_R_50_FPN_1x_MC.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..80139ad9e40c09fdd862cdac80aa18c5cabc0a1e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/evolution/faster_rcnn_R_50_FPN_1x_MC.yaml
@@ -0,0 +1,7 @@
+_BASE_: "Base-RCNN-FPN-MC.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  DENSEPOSE_ON: False
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b90989eef81e27d23119d2cd4627e8cea211ac51
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml
@@ -0,0 +1,11 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseDeepLabHead"
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  MAX_ITER: 40
+  STEPS: (30,)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7d412740340d924bacc3baa57f32bfea0b871511
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
+DATASETS:
+  TRAIN: ()
+  TEST: ("densepose_coco_2014_minival_100",)
+TEST:
+  AUG:
+    ENABLED: True
+    MIN_SIZES: (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
+    MAX_SIZE: 4000
+    FLIP: True
+  EXPECTED_RESULTS: [["bbox_TTA", "AP", 61.74, 0.03], ["densepose_gps_TTA", "AP",  60.22, 0.03], ["densepose_gpsm_TTA", "AP", 63.85, 0.03]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f0fe61151adf255baba717f3e65ff6fab52829a6
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "iid_iso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 40
+  STEPS: (30,)
+  WARMUP_FACTOR: 0.025
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f0d9358c8846452314697a19b5e2ea9e075ddaeb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+  ROI_DENSEPOSE_HEAD:
+    UV_CONFIDENCE:
+      ENABLED: True
+      TYPE: "indep_aniso"
+    POINT_REGRESSION_WEIGHTS: 0.0005
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  CLIP_GRADIENTS:
+    ENABLED: True
+  MAX_ITER: 40 
+  STEPS: (30,)
+  WARMUP_FACTOR: 0.025
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3c5a7d20989e774cbba2b443e3026a2361201d0f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
+MODEL:
+  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
+DATASETS:
+  TRAIN: ()
+  TEST: ("densepose_coco_2014_minival_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 59.27, 0.025], ["densepose_gps", "AP",  60.11, 0.02], ["densepose_gpsm", "AP", 64.20, 0.02]]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..057c8768186e8a818228aa2f028ba3007374c571
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  MAX_ITER: 40
+  STEPS: (30,)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b991160c79e5a95feac22be30deea10d200178d4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml
@@ -0,0 +1,14 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_HEADS:
+    NUM_CLASSES: 1
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival",)
+  TEST: ("densepose_coco_2014_minival",)
+SOLVER:
+  MAX_ITER: 6000
+  STEPS: (5500, 5800)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 58.27, 1.0], ["densepose_gps", "AP", 42.47, 1.5], ["densepose_gpsm", "AP", 49.20, 1.5]]
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aea5a1a9c3e63ce168a41545322599ccc4adbbb8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .data.datasets import builtin  # just to register data
+from .config import add_densepose_config, add_dataset_category_config
+from .densepose_head import ROI_DENSEPOSE_HEAD_REGISTRY
+from .evaluator import DensePoseCOCOEvaluator
+from .roi_head import DensePoseROIHeads
+from .data.structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
+from .modeling.test_time_augmentation import DensePoseGeneralizedRCNNWithTTA
+from .utils.transform import load_from_cfg
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/config.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d76056b362beb7c0832e775b9e3415dd42767a5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/config.py
@@ -0,0 +1,68 @@
+# -*- coding = utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from detectron2.config import CfgNode as CN
+
+
+def add_dataset_category_config(cfg: CN):
+    """
+    Add config for additional category-related dataset options
+     - category whitelisting
+     - category mapping
+    """
+    _C = cfg
+    _C.DATASETS.CATEGORY_MAPS = CN(new_allowed=True)
+    _C.DATASETS.WHITELISTED_CATEGORIES = CN(new_allowed=True)
+
+
+def add_densepose_config(cfg: CN):
+    """
+    Add config for densepose head.
+    """
+    _C = cfg
+
+    _C.MODEL.DENSEPOSE_ON = True
+
+    _C.MODEL.ROI_DENSEPOSE_HEAD = CN()
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NAME = ""
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8
+    # Number of parts used for point labels
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3
+    _C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2
+    _C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 112
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2"
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 28
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2  # 15 or 2
+    # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7
+    # Loss weights for annotation masks.(14 Parts)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 5.0
+    # Loss weights for surface parts. (24 Parts)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 1.0
+    # Loss weights for UV regression.
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.01
+    # For Decoder
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = ""
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4
+    # For DeepLab head
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB = CN()
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM = "GN"
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON = 0
+    # Confidences
+    # Enable learning confidences (variances) along with the actual values
+    _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE = CN({"ENABLED": False})
+    # UV confidence lower bound
+    _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON = 0.01
+    # Statistical model type for confidence learning, possible values:
+    # - "iid_iso": statistically independent identically distributed residuals
+    #    with isotropic covariance
+    # - "indep_aniso": statistically independent residuals with anisotropic
+    #    covariances
+    _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE = "iid_iso"
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5484f59dc6aa8b1d54dd6771c1e4c490fad7e20e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from .build import build_detection_test_loader, build_detection_train_loader
+from .dataset_mapper import DatasetMapper
+
+# ensure the builtin data are registered
+from . import datasets
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/build.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..c722ec12ffacf26ee0babe766b023566b2e79543
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/build.py
@@ -0,0 +1,405 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import itertools
+import logging
+import numpy as np
+import operator
+from typing import Any, Callable, Collection, Dict, Iterable, List, Optional
+import torch
+
+from detectron2.config import CfgNode
+from detectron2.data import samplers
+from detectron2.data.build import (
+    load_proposals_into_dataset,
+    print_instances_class_histogram,
+    trivial_batch_collator,
+    worker_init_reset_seed,
+)
+from detectron2.data.catalog import DatasetCatalog, MetadataCatalog
+from detectron2.data.common import AspectRatioGroupedDataset, DatasetFromList, MapDataset
+from detectron2.utils.comm import get_world_size
+
+from .dataset_mapper import DatasetMapper
+from .datasets.coco import DENSEPOSE_KEYS_WITHOUT_MASK as DENSEPOSE_COCO_KEYS_WITHOUT_MASK
+from .datasets.coco import DENSEPOSE_MASK_KEY as DENSEPOSE_COCO_MASK_KEY
+
+__all__ = ["build_detection_train_loader", "build_detection_test_loader"]
+
+
+Instance = Dict[str, Any]
+InstancePredicate = Callable[[Instance], bool]
+
+
+def _compute_num_images_per_worker(cfg: CfgNode):
+    num_workers = get_world_size()
+    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
+    assert (
+        images_per_batch % num_workers == 0
+    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
+        images_per_batch, num_workers
+    )
+    assert (
+        images_per_batch >= num_workers
+    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
+        images_per_batch, num_workers
+    )
+    images_per_worker = images_per_batch // num_workers
+    return images_per_worker
+
+
+def _map_category_id_to_contiguous_id(dataset_name: str, dataset_dicts: Iterable[Instance]):
+    meta = MetadataCatalog.get(dataset_name)
+    for dataset_dict in dataset_dicts:
+        for ann in dataset_dict["annotations"]:
+            ann["category_id"] = meta.thing_dataset_id_to_contiguous_id[ann["category_id"]]
+
+
+def _add_category_id_to_contiguous_id_maps_to_metadata(dataset_names: Iterable[str]):
+    # merge categories for all data
+    merged_categories = {}
+    for dataset_name in dataset_names:
+        meta = MetadataCatalog.get(dataset_name)
+        for cat_id, cat_name in meta.categories.items():
+            if cat_id not in merged_categories:
+                merged_categories[cat_id] = (cat_name, dataset_name)
+                continue
+            cat_name_other, dataset_name_other = merged_categories[cat_id]
+            if cat_name_other != cat_name:
+                raise ValueError(
+                    f"Incompatible categories for category ID {cat_id}: "
+                    f'dataset {dataset_name} value "{cat_name}", '
+                    f'dataset {dataset_name_other} value "{cat_name_other}"'
+                )
+
+    merged_cat_id_to_cont_id = {}
+    for i, cat_id in enumerate(sorted(merged_categories.keys())):
+        merged_cat_id_to_cont_id[cat_id] = i
+
+    # add category maps to metadata
+    for dataset_name in dataset_names:
+        meta = MetadataCatalog.get(dataset_name)
+        categories = meta.get("categories")
+        meta.thing_classes = [categories[cat_id] for cat_id in sorted(categories.keys())]
+        meta.thing_dataset_id_to_contiguous_id = {
+            cat_id: merged_cat_id_to_cont_id[cat_id] for cat_id in sorted(categories.keys())
+        }
+        meta.thing_contiguous_id_to_dataset_id = {
+            merged_cat_id_to_cont_id[cat_id]: cat_id for cat_id in sorted(categories.keys())
+        }
+
+
+def _maybe_create_general_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+    def has_annotations(instance: Instance) -> bool:
+        return "annotations" in instance
+
+    def has_only_crowd_anotations(instance: Instance) -> bool:
+        for ann in instance["annotations"]:
+            if ann.get("is_crowd", 0) == 0:
+                return False
+        return True
+
+    def general_keep_instance_predicate(instance: Instance) -> bool:
+        return has_annotations(instance) and not has_only_crowd_anotations(instance)
+
+    if not cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS:
+        return None
+    return general_keep_instance_predicate
+
+
+def _maybe_create_keypoints_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+
+    min_num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
+
+    def has_sufficient_num_keypoints(instance: Instance) -> bool:
+        num_kpts = sum(
+            (np.array(ann["keypoints"][2::3]) > 0).sum()
+            for ann in instance["annotations"]
+            if "keypoints" in ann
+        )
+        return num_kpts >= min_num_keypoints
+
+    if cfg.MODEL.KEYPOINT_ON and (min_num_keypoints > 0):
+        return has_sufficient_num_keypoints
+    return None
+
+
+def _maybe_create_mask_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+    if not cfg.MODEL.MASK_ON:
+        return None
+
+    def has_mask_annotations(instance: Instance) -> bool:
+        return any("segmentation" in ann for ann in instance["annotations"])
+
+    return has_mask_annotations
+
+
+def _maybe_create_densepose_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+    if not cfg.MODEL.DENSEPOSE_ON:
+        return None
+
+    def has_densepose_annotations(instance: Instance) -> bool:
+        for ann in instance["annotations"]:
+            if all(key in ann for key in DENSEPOSE_COCO_KEYS_WITHOUT_MASK) and (
+                (DENSEPOSE_COCO_MASK_KEY in ann) or ("segmentation" in ann)
+            ):
+                return True
+        return False
+
+    return has_densepose_annotations
+
+
+def _maybe_create_specific_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+    specific_predicate_creators = [
+        _maybe_create_keypoints_keep_instance_predicate,
+        _maybe_create_mask_keep_instance_predicate,
+        _maybe_create_densepose_keep_instance_predicate,
+    ]
+    predicates = [creator(cfg) for creator in specific_predicate_creators]
+    predicates = [p for p in predicates if p is not None]
+    if not predicates:
+        return None
+
+    def combined_predicate(instance: Instance) -> bool:
+        return any(p(instance) for p in predicates)
+
+    return combined_predicate
+
+
+def _get_train_keep_instance_predicate(cfg: CfgNode):
+    general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
+    combined_specific_keep_predicate = _maybe_create_specific_keep_instance_predicate(cfg)
+
+    def combined_general_specific_keep_predicate(instance: Instance) -> bool:
+        return general_keep_predicate(instance) and combined_specific_keep_predicate(instance)
+
+    if (general_keep_predicate is None) and (combined_specific_keep_predicate is None):
+        return None
+    if general_keep_predicate is None:
+        return combined_specific_keep_predicate
+    if combined_specific_keep_predicate is None:
+        return general_keep_predicate
+    return combined_general_specific_keep_predicate
+
+
+def _get_test_keep_instance_predicate(cfg: CfgNode):
+    general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
+    return general_keep_predicate
+
+
+def _maybe_filter_and_map_categories(
+    dataset_name: str, dataset_dicts: List[Instance]
+) -> List[Instance]:
+    meta = MetadataCatalog.get(dataset_name)
+    whitelisted_categories = meta.get("whitelisted_categories")
+    category_map = meta.get("category_map", {})
+    if whitelisted_categories is None and not category_map:
+        return dataset_dicts
+    filtered_dataset_dicts = []
+    for dataset_dict in dataset_dicts:
+        anns = []
+        for ann in dataset_dict["annotations"]:
+            cat_id = ann["category_id"]
+            if whitelisted_categories is not None and cat_id not in whitelisted_categories:
+                continue
+            ann["category_id"] = category_map.get(cat_id, cat_id)
+            anns.append(ann)
+        dataset_dict["annotations"] = anns
+        filtered_dataset_dicts.append(dataset_dict)
+    return filtered_dataset_dicts
+
+
+def _add_category_whitelists_to_metadata(cfg: CfgNode):
+    for dataset_name, whitelisted_cat_ids in cfg.DATASETS.WHITELISTED_CATEGORIES.items():
+        meta = MetadataCatalog.get(dataset_name)
+        meta.whitelisted_categories = whitelisted_cat_ids
+        logger = logging.getLogger(__name__)
+        logger.info(
+            "Whitelisted categories for dataset {}: {}".format(
+                dataset_name, meta.whitelisted_categories
+            )
+        )
+
+
+def _add_category_maps_to_metadata(cfg: CfgNode):
+    for dataset_name, category_map in cfg.DATASETS.CATEGORY_MAPS.items():
+        category_map = {
+            int(cat_id_src): int(cat_id_dst) for cat_id_src, cat_id_dst in category_map.items()
+        }
+        meta = MetadataCatalog.get(dataset_name)
+        meta.category_map = category_map
+        logger = logging.getLogger(__name__)
+        logger.info("Category maps for dataset {}: {}".format(dataset_name, meta.category_map))
+
+
+def combine_detection_dataset_dicts(
+    dataset_names: Collection[str],
+    keep_instance_predicate: Optional[InstancePredicate] = None,
+    proposal_files: Optional[Collection[str]] = None,
+) -> List[Instance]:
+    """
+    Load and prepare dataset dicts for training / testing
+
+    Args:
+        dataset_names (Collection[str]): a list of dataset names
+        keep_instance_predicate (Callable: Dict[str, Any] -> bool): predicate
+            applied to instance dicts which defines whether to keep the instance
+        proposal_files (Collection[str]): if given, a list of object proposal files
+            that match each dataset in `dataset_names`.
+    """
+    assert len(dataset_names)
+    if proposal_files is None:
+        proposal_files = [None] * len(dataset_names)
+    assert len(dataset_names) == len(proposal_files)
+    # load annotations and dataset metadata
+    dataset_map = {}
+    for dataset_name in dataset_names:
+        dataset_dicts = DatasetCatalog.get(dataset_name)
+        dataset_map[dataset_name] = dataset_dicts
+    # initialize category maps
+    _add_category_id_to_contiguous_id_maps_to_metadata(dataset_names)
+    # apply category maps
+    all_datasets_dicts = []
+    for dataset_name, proposal_file in zip(dataset_names, proposal_files):
+        dataset_dicts = dataset_map[dataset_name]
+        assert len(dataset_dicts), f"Dataset '{dataset_name}' is empty!"
+        if proposal_file is not None:
+            dataset_dicts = load_proposals_into_dataset(dataset_dicts, proposal_file)
+        dataset_dicts = _maybe_filter_and_map_categories(dataset_name, dataset_dicts)
+        _map_category_id_to_contiguous_id(dataset_name, dataset_dicts)
+        print_instances_class_histogram(
+            dataset_dicts, MetadataCatalog.get(dataset_name).thing_classes
+        )
+        all_datasets_dicts.append(dataset_dicts)
+
+    if keep_instance_predicate is not None:
+        all_datasets_dicts_plain = [
+            d
+            for d in itertools.chain.from_iterable(all_datasets_dicts)
+            if keep_instance_predicate(d)
+        ]
+    else:
+        all_datasets_dicts_plain = list(itertools.chain.from_iterable(all_datasets_dicts))
+    return all_datasets_dicts_plain
+
+
+def build_detection_train_loader(cfg: CfgNode, mapper=None):
+    """
+    A data loader is created in a way similar to that of Detectron2.
+    The main differences are:
+     - it allows to combine data with different but compatible object category sets
+
+    The data loader is created by the following steps:
+    1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
+    2. Start workers to work on the dicts. Each worker will:
+        * Map each metadata dict into another format to be consumed by the model.
+        * Batch them by simply putting dicts into a list.
+    The batched ``list[mapped_dict]`` is what this dataloader will return.
+
+    Args:
+        cfg (CfgNode): the config
+        mapper (callable): a callable which takes a sample (dict) from dataset and
+            returns the format to be consumed by the model.
+            By default it will be `DatasetMapper(cfg, True)`.
+
+    Returns:
+        an infinite iterator of training data
+    """
+    images_per_worker = _compute_num_images_per_worker(cfg)
+
+    _add_category_whitelists_to_metadata(cfg)
+    _add_category_maps_to_metadata(cfg)
+    dataset_dicts = combine_detection_dataset_dicts(
+        cfg.DATASETS.TRAIN,
+        keep_instance_predicate=_get_train_keep_instance_predicate(cfg),
+        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
+    )
+    dataset = DatasetFromList(dataset_dicts, copy=False)
+
+    if mapper is None:
+        mapper = DatasetMapper(cfg, True)
+    dataset = MapDataset(dataset, mapper)
+
+    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
+    logger = logging.getLogger(__name__)
+    logger.info("Using training sampler {}".format(sampler_name))
+    if sampler_name == "TrainingSampler":
+        sampler = samplers.TrainingSampler(len(dataset))
+    elif sampler_name == "RepeatFactorTrainingSampler":
+        sampler = samplers.RepeatFactorTrainingSampler(
+            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
+        )
+    else:
+        raise ValueError("Unknown training sampler: {}".format(sampler_name))
+
+    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
+        data_loader = torch.utils.data.DataLoader(
+            dataset,
+            sampler=sampler,
+            num_workers=cfg.DATALOADER.NUM_WORKERS,
+            batch_sampler=None,
+            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
+            worker_init_fn=worker_init_reset_seed,
+        )  # yield individual mapped dict
+        data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
+    else:
+        batch_sampler = torch.utils.data.sampler.BatchSampler(
+            sampler, images_per_worker, drop_last=True
+        )
+        # drop_last so the batch always have the same size
+        data_loader = torch.utils.data.DataLoader(
+            dataset,
+            num_workers=cfg.DATALOADER.NUM_WORKERS,
+            batch_sampler=batch_sampler,
+            collate_fn=trivial_batch_collator,
+            worker_init_fn=worker_init_reset_seed,
+        )
+
+    return data_loader
+
+
+def build_detection_test_loader(cfg, dataset_name, mapper=None):
+    """
+    Similar to `build_detection_train_loader`.
+    But this function uses the given `dataset_name` argument (instead of the names in cfg),
+    and uses batch size 1.
+
+    Args:
+        cfg: a detectron2 CfgNode
+        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
+        mapper (callable): a callable which takes a sample (dict) from dataset
+            and returns the format to be consumed by the model.
+            By default it will be `DatasetMapper(cfg, False)`.
+
+    Returns:
+        DataLoader: a torch DataLoader, that loads the given detection
+            dataset, with test-time transformation and batching.
+    """
+    _add_category_whitelists_to_metadata(cfg)
+    _add_category_maps_to_metadata(cfg)
+    dataset_dicts = combine_detection_dataset_dicts(
+        [dataset_name],
+        keep_instance_predicate=_get_test_keep_instance_predicate(cfg),
+        proposal_files=[
+            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)]
+        ]
+        if cfg.MODEL.LOAD_PROPOSALS
+        else None,
+    )
+
+    dataset = DatasetFromList(dataset_dicts)
+    if mapper is None:
+        mapper = DatasetMapper(cfg, False)
+    dataset = MapDataset(dataset, mapper)
+
+    sampler = samplers.InferenceSampler(len(dataset))
+    # Always use 1 image per worker during inference since this is the
+    # standard when reporting inference time in papers.
+    batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False)
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        num_workers=cfg.DATALOADER.NUM_WORKERS,
+        batch_sampler=batch_sampler,
+        collate_fn=trivial_batch_collator,
+    )
+    return data_loader
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/dataset_mapper.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/dataset_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..f74976745151952ece06c7b7ba542e0b63f53899
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/dataset_mapper.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import copy
+import torch
+from fvcore.common.file_io import PathManager
+
+from detectron2.data import MetadataCatalog
+from detectron2.data import detection_utils as utils
+from detectron2.data import transforms as T
+
+from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
+
+
+class DatasetMapper:
+    """
+    A customized version of `detectron2.data.DatasetMapper`
+    """
+
+    def __init__(self, cfg, is_train=True):
+        self.tfm_gens = utils.build_transform_gen(cfg, is_train)
+
+        # fmt: off
+        self.img_format     = cfg.INPUT.FORMAT
+        self.mask_on        = cfg.MODEL.MASK_ON
+        self.keypoint_on    = cfg.MODEL.KEYPOINT_ON
+        self.densepose_on   = cfg.MODEL.DENSEPOSE_ON
+        assert not cfg.MODEL.LOAD_PROPOSALS, "not supported yet"
+        # fmt: on
+        if self.keypoint_on and is_train:
+            # Flip only makes sense in training
+            self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
+        else:
+            self.keypoint_hflip_indices = None
+
+        if self.densepose_on:
+            densepose_transform_srcs = [
+                MetadataCatalog.get(ds).densepose_transform_src
+                for ds in cfg.DATASETS.TRAIN + cfg.DATASETS.TEST
+            ]
+            assert len(densepose_transform_srcs) > 0
+            # TODO: check that DensePose transformation data is the same for
+            # all the data. Otherwise one would have to pass DB ID with
+            # each entry to select proper transformation data. For now, since
+            # all DensePose annotated data uses the same data semantics, we
+            # omit this check.
+            densepose_transform_data_fpath = PathManager.get_local_path(densepose_transform_srcs[0])
+            self.densepose_transform_data = DensePoseTransformData.load(
+                densepose_transform_data_fpath
+            )
+
+        self.is_train = is_train
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
+        utils.check_image_size(dataset_dict, image)
+
+        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
+        image_shape = image.shape[:2]  # h, w
+        dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
+
+        if not self.is_train:
+            dataset_dict.pop("annotations", None)
+            return dataset_dict
+
+        for anno in dataset_dict["annotations"]:
+            if not self.mask_on:
+                anno.pop("segmentation", None)
+            if not self.keypoint_on:
+                anno.pop("keypoints", None)
+
+        # USER: Implement additional transformations if you have other types of data
+        # USER: Don't call transpose_densepose if you don't need
+        annos = [
+            self._transform_densepose(
+                utils.transform_instance_annotations(
+                    obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+                ),
+                transforms,
+            )
+            for obj in dataset_dict.pop("annotations")
+            if obj.get("iscrowd", 0) == 0
+        ]
+        instances = utils.annotations_to_instances(annos, image_shape)
+
+        if len(annos) and "densepose" in annos[0]:
+            gt_densepose = [obj["densepose"] for obj in annos]
+            instances.gt_densepose = DensePoseList(gt_densepose, instances.gt_boxes, image_shape)
+
+        dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()]
+        return dataset_dict
+
+    def _transform_densepose(self, annotation, transforms):
+        if not self.densepose_on:
+            return annotation
+
+        # Handle densepose annotations
+        is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
+        if is_valid:
+            densepose_data = DensePoseDataRelative(annotation, cleanup=True)
+            densepose_data.apply_transform(transforms, self.densepose_transform_data)
+            annotation["densepose"] = densepose_data
+        else:
+            # logger = logging.getLogger(__name__)
+            # logger.debug("Could not load DensePose annotation: {}".format(reason_not_valid))
+            DensePoseDataRelative.cleanup_annotation(annotation)
+            # NOTE: annotations for certain instances may be unavailable.
+            # 'None' is accepted by the DensePostList data structure.
+            annotation["densepose"] = None
+        return annotation
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a59d9332034e9dc3a09f0ba7aa63f0c61b25e87
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from . import builtin  # ensure the builtin data are registered
+
+__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/builtin.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..e70f3d3e006d1801dcfb743c9c21b46ca54a3053
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/builtin.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .coco import BASE_DATASETS as BASE_COCO_DATASETS
+from .coco import DATASETS as COCO_DATASETS
+from .coco import register_datasets as register_coco_datasets
+
+DEFAULT_DATASETS_ROOT = "data"
+
+
+register_coco_datasets(COCO_DATASETS, DEFAULT_DATASETS_ROOT)
+register_coco_datasets(BASE_COCO_DATASETS, DEFAULT_DATASETS_ROOT)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/coco.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a96474fc990129d5c92786f62720621de97b230
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/coco.py
@@ -0,0 +1,314 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import contextlib
+import io
+import logging
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional
+from fvcore.common.file_io import PathManager
+from fvcore.common.timer import Timer
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+
+DENSEPOSE_MASK_KEY = "dp_masks"
+DENSEPOSE_KEYS_WITHOUT_MASK = ["dp_x", "dp_y", "dp_I", "dp_U", "dp_V"]
+DENSEPOSE_KEYS = DENSEPOSE_KEYS_WITHOUT_MASK + [DENSEPOSE_MASK_KEY]
+DENSEPOSE_METADATA_URL_PREFIX = "https://dl.fbaipublicfiles.com/densepose/data/"
+
+
+@dataclass
+class CocoDatasetInfo:
+    name: str
+    images_root: str
+    annotations_fpath: str
+
+
+DATASETS = [
+    CocoDatasetInfo(
+        name="densepose_coco_2014_train",
+        images_root="coco/train2014",
+        annotations_fpath="coco/annotations/densepose_train2014.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_minival",
+        images_root="coco/val2014",
+        annotations_fpath="coco/annotations/densepose_minival2014.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_minival_100",
+        images_root="coco/val2014",
+        annotations_fpath="coco/annotations/densepose_minival2014_100.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_coco_2014_valminusminival",
+        images_root="coco/val2014",
+        annotations_fpath="coco/annotations/densepose_valminusminival2014.json",
+    ),
+    CocoDatasetInfo(
+        name="densepose_chimps",
+        images_root="densepose_evolution/densepose_chimps",
+        annotations_fpath="densepose_evolution/annotations/densepose_chimps_densepose.json",
+    ),
+]
+
+
+BASE_DATASETS = [
+    CocoDatasetInfo(
+        name="base_coco_2017_train",
+        images_root="coco/train2017",
+        annotations_fpath="coco/annotations/instances_train2017.json",
+    ),
+    CocoDatasetInfo(
+        name="base_coco_2017_val",
+        images_root="coco/val2017",
+        annotations_fpath="coco/annotations/instances_val2017.json",
+    ),
+    CocoDatasetInfo(
+        name="base_coco_2017_val_100",
+        images_root="coco/val2017",
+        annotations_fpath="coco/annotations/instances_val2017_100.json",
+    ),
+]
+
+
+def _is_relative_local_path(path: os.PathLike):
+    path_str = os.fsdecode(path)
+    return ("://" not in path_str) and not os.path.isabs(path)
+
+
+def _maybe_prepend_base_path(base_path: Optional[os.PathLike], path: os.PathLike):
+    """
+    Prepends the provided path with a base path prefix if:
+    1) base path is not None;
+    2) path is a local path
+    """
+    if base_path is None:
+        return path
+    if _is_relative_local_path(path):
+        return os.path.join(base_path, path)
+    return path
+
+
+def get_metadata(base_path: Optional[os.PathLike]) -> Dict[str, Any]:
+    """
+    Returns metadata associated with COCO DensePose data
+
+    Args:
+    base_path: Optional[os.PathLike]
+        Base path used to load metadata from
+
+    Returns:
+    Dict[str, Any]
+        Metadata in the form of a dictionary
+    """
+    meta = {
+        "densepose_transform_src": _maybe_prepend_base_path(
+            base_path, "UV_symmetry_transforms.mat"
+        ),
+        "densepose_smpl_subdiv": _maybe_prepend_base_path(base_path, "SMPL_subdiv.mat"),
+        "densepose_smpl_subdiv_transform": _maybe_prepend_base_path(
+            base_path, "SMPL_SUBDIV_TRANSFORM.mat"
+        ),
+    }
+    return meta
+
+
+def _load_coco_annotations(json_file: str):
+    """
+    Load COCO annotations from a JSON file
+
+    Args:
+        json_file: str
+            Path to the file to load annotations from
+    Returns:
+        Instance of `pycocotools.coco.COCO` that provides access to annotations
+        data
+    """
+    from pycocotools.coco import COCO
+
+    logger = logging.getLogger(__name__)
+    timer = Timer()
+    with contextlib.redirect_stdout(io.StringIO()):
+        coco_api = COCO(json_file)
+    if timer.seconds() > 1:
+        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+    return coco_api
+
+
+def _add_categories_metadata(dataset_name: str, categories: Dict[str, Any]):
+    meta = MetadataCatalog.get(dataset_name)
+    meta.categories = {c["id"]: c["name"] for c in categories}
+    logger = logging.getLogger(__name__)
+    logger.info("Dataset {} categories: {}".format(dataset_name, categories))
+
+
+def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]):
+    if "minival" in json_file:
+        # Skip validation on COCO2014 valminusminival and minival annotations
+        # The ratio of buggy annotations there is tiny and does not affect accuracy
+        # Therefore we explicitly white-list them
+        return
+    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+    assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
+        json_file
+    )
+
+
+def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+    if "bbox" not in ann_dict:
+        return
+    obj["bbox"] = ann_dict["bbox"]
+    obj["bbox_mode"] = BoxMode.XYWH_ABS
+
+
+def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+    if "segmentation" not in ann_dict:
+        return
+    segm = ann_dict["segmentation"]
+    if not isinstance(segm, dict):
+        # filter out invalid polygons (< 3 points)
+        segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+        if len(segm) == 0:
+            return
+    obj["segmentation"] = segm
+
+
+def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+    if "keypoints" not in ann_dict:
+        return
+    keypts = ann_dict["keypoints"]  # list[int]
+    for idx, v in enumerate(keypts):
+        if idx % 3 != 2:
+            # COCO's segmentation coordinates are floating points in [0, H or W],
+            # but keypoint coordinates are integers in [0, H-1 or W-1]
+            # Therefore we assume the coordinates are "pixel indices" and
+            # add 0.5 to convert to floating point coordinates.
+            keypts[idx] = v + 0.5
+    obj["keypoints"] = keypts
+
+
+def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+    for key in DENSEPOSE_KEYS:
+        if key in ann_dict:
+            obj[key] = ann_dict[key]
+
+
+def _combine_images_with_annotations(
+    dataset_name: str,
+    image_root: str,
+    img_datas: Iterable[Dict[str, Any]],
+    ann_datas: Iterable[Iterable[Dict[str, Any]]],
+):
+
+    ann_keys = ["iscrowd", "category_id"]
+    dataset_dicts = []
+
+    for img_dict, ann_dicts in zip(img_datas, ann_datas):
+        record = {}
+        record["file_name"] = os.path.join(image_root, img_dict["file_name"])
+        record["height"] = img_dict["height"]
+        record["width"] = img_dict["width"]
+        record["image_id"] = img_dict["id"]
+        record["dataset"] = dataset_name
+        objs = []
+        for ann_dict in ann_dicts:
+            assert ann_dict["image_id"] == record["image_id"]
+            assert ann_dict.get("ignore", 0) == 0
+            obj = {key: ann_dict[key] for key in ann_keys if key in ann_dict}
+            _maybe_add_bbox(obj, ann_dict)
+            _maybe_add_segm(obj, ann_dict)
+            _maybe_add_keypoints(obj, ann_dict)
+            _maybe_add_densepose(obj, ann_dict)
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+    return dataset_dicts
+
+
+def load_coco_json(annotations_json_file: str, image_root: str, dataset_name: str):
+    """
+    Loads a JSON file with annotations in COCO instances format.
+    Replaces `detectron2.data.data.coco.load_coco_json` to handle metadata
+    in a more flexible way. Postpones category mapping to a later stage to be
+    able to combine several data with different (but coherent) sets of
+    categories.
+
+    Args:
+
+    annotations_json_file: str
+        Path to the JSON file with annotations in COCO instances format.
+    image_root: str
+        directory that contains all the images
+    dataset_name: str
+        the name that identifies a dataset, e.g. "densepose_coco_2014_train"
+    extra_annotation_keys: Optional[List[str]]
+        If provided, these keys are used to extract additional data from
+        the annotations.
+    """
+    coco_api = _load_coco_annotations(PathManager.get_local_path(annotations_json_file))
+    _add_categories_metadata(dataset_name, coco_api.loadCats(coco_api.getCatIds()))
+    # sort indices for reproducible results
+    img_ids = sorted(coco_api.imgs.keys())
+    # imgs is a list of dicts, each looks something like:
+    # {'license': 4,
+    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+    #  'file_name': 'COCO_val2014_000000001268.jpg',
+    #  'height': 427,
+    #  'width': 640,
+    #  'date_captured': '2013-11-17 05:57:24',
+    #  'id': 1268}
+    imgs = coco_api.loadImgs(img_ids)
+    logger = logging.getLogger(__name__)
+    logger.info("Loaded {} images in COCO format from {}".format(len(imgs), annotations_json_file))
+    # anns is a list[list[dict]], where each dict is an annotation
+    # record for an object. The inner list enumerates the objects in an image
+    # and the outer list enumerates over images.
+    anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
+    _verify_annotations_have_unique_ids(annotations_json_file, anns)
+    dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns)
+    return dataset_records
+
+
+def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[os.PathLike] = None):
+    """
+    Registers provided COCO DensePose dataset
+
+    Args:
+    dataset_data: CocoDatasetInfo
+        Dataset data
+    datasets_root: Optional[os.PathLike]
+        Datasets root folder (default: None)
+    """
+    annotations_fpath = _maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath)
+    images_root = _maybe_prepend_base_path(datasets_root, dataset_data.images_root)
+
+    def load_annotations():
+        return load_coco_json(
+            annotations_json_file=annotations_fpath,
+            image_root=images_root,
+            dataset_name=dataset_data.name,
+        )
+
+    DatasetCatalog.register(dataset_data.name, load_annotations)
+    MetadataCatalog.get(dataset_data.name).set(
+        json_file=annotations_fpath,
+        image_root=images_root,
+        **get_metadata(DENSEPOSE_METADATA_URL_PREFIX)
+    )
+
+
+def register_datasets(
+    datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[os.PathLike] = None
+):
+    """
+    Registers provided COCO DensePose data
+
+    Args:
+    datasets_data: Iterable[CocoDatasetInfo]
+        An iterable of dataset datas
+    datasets_root: Optional[os.PathLike]
+        Datasets root folder (default: None)
+    """
+    for dataset_data in datasets_data:
+        register_dataset(dataset_data, datasets_root)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/structures.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/structures.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbb950ba09b1302b72f36d143e092d2ade6dc11e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/data/structures.py
@@ -0,0 +1,579 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import base64
+import numpy as np
+from io import BytesIO
+import torch
+from PIL import Image
+from torch.nn import functional as F
+
+
+class DensePoseTransformData(object):
+
+    # Horizontal symmetry label transforms used for horizontal flip
+    MASK_LABEL_SYMMETRIES = [0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14]
+    # fmt: off
+    POINT_LABEL_SYMMETRIES = [ 0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23]  # noqa
+    # fmt: on
+
+    def __init__(self, uv_symmetries):
+        self.mask_label_symmetries = DensePoseTransformData.MASK_LABEL_SYMMETRIES
+        self.point_label_symmetries = DensePoseTransformData.POINT_LABEL_SYMMETRIES
+        self.uv_symmetries = uv_symmetries
+
+    @staticmethod
+    def load(fpath):
+        import scipy.io
+
+        uv_symmetry_map = scipy.io.loadmat(fpath)
+        uv_symmetry_map_torch = {}
+        for key in ["U_transforms", "V_transforms"]:
+            uv_symmetry_map_torch[key] = []
+            map_src = uv_symmetry_map[key]
+            map_dst = uv_symmetry_map_torch[key]
+            for i in range(map_src.shape[1]):
+                map_dst.append(torch.from_numpy(map_src[0, i]).to(dtype=torch.float))
+            uv_symmetry_map_torch[key] = torch.stack(map_dst, dim=0).to(
+                device=torch.cuda.current_device()
+            )
+        transform_data = DensePoseTransformData(uv_symmetry_map_torch)
+        return transform_data
+
+
+class DensePoseDataRelative(object):
+    """
+    Dense pose relative annotations that can be applied to any bounding box:
+        x - normalized X coordinates [0, 255] of annotated points
+        y - normalized Y coordinates [0, 255] of annotated points
+        i - body part labels 0,...,24 for annotated points
+        u - body part U coordinates [0, 1] for annotated points
+        v - body part V coordinates [0, 1] for annotated points
+        segm - 256x256 segmentation mask with values 0,...,14
+    To obtain absolute x and y data wrt some bounding box one needs to first
+    divide the data by 256, multiply by the respective bounding box size
+    and add bounding box offset:
+        x_img = x0 + x_norm * w / 256.0
+        y_img = y0 + y_norm * h / 256.0
+    Segmentation masks are typically sampled to get image-based masks.
+    """
+
+    # Key for normalized X coordinates in annotation dict
+    X_KEY = "dp_x"
+    # Key for normalized Y coordinates in annotation dict
+    Y_KEY = "dp_y"
+    # Key for U part coordinates in annotation dict
+    U_KEY = "dp_U"
+    # Key for V part coordinates in annotation dict
+    V_KEY = "dp_V"
+    # Key for I point labels in annotation dict
+    I_KEY = "dp_I"
+    # Key for segmentation mask in annotation dict
+    S_KEY = "dp_masks"
+    # Number of body parts in segmentation masks
+    N_BODY_PARTS = 14
+    # Number of parts in point labels
+    N_PART_LABELS = 24
+    MASK_SIZE = 256
+
+    def __init__(self, annotation, cleanup=False):
+        is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
+        assert is_valid, "Invalid DensePose annotations: {}".format(reason_not_valid)
+        self.x = torch.as_tensor(annotation[DensePoseDataRelative.X_KEY])
+        self.y = torch.as_tensor(annotation[DensePoseDataRelative.Y_KEY])
+        self.i = torch.as_tensor(annotation[DensePoseDataRelative.I_KEY])
+        self.u = torch.as_tensor(annotation[DensePoseDataRelative.U_KEY])
+        self.v = torch.as_tensor(annotation[DensePoseDataRelative.V_KEY])
+        self.segm = DensePoseDataRelative.extract_segmentation_mask(annotation)
+        self.device = torch.device("cpu")
+        if cleanup:
+            DensePoseDataRelative.cleanup_annotation(annotation)
+
+    def to(self, device):
+        if self.device == device:
+            return self
+        new_data = DensePoseDataRelative.__new__(DensePoseDataRelative)
+        new_data.x = self.x
+        new_data.x = self.x.to(device)
+        new_data.y = self.y.to(device)
+        new_data.i = self.i.to(device)
+        new_data.u = self.u.to(device)
+        new_data.v = self.v.to(device)
+        new_data.segm = self.segm.to(device)
+        new_data.device = device
+        return new_data
+
+    @staticmethod
+    def extract_segmentation_mask(annotation):
+        import pycocotools.mask as mask_utils
+
+        poly_specs = annotation[DensePoseDataRelative.S_KEY]
+        segm = torch.zeros((DensePoseDataRelative.MASK_SIZE,) * 2, dtype=torch.float32)
+        for i in range(DensePoseDataRelative.N_BODY_PARTS):
+            poly_i = poly_specs[i]
+            if poly_i:
+                mask_i = mask_utils.decode(poly_i)
+                segm[mask_i > 0] = i + 1
+        return segm
+
+    @staticmethod
+    def validate_annotation(annotation):
+        for key in [
+            DensePoseDataRelative.X_KEY,
+            DensePoseDataRelative.Y_KEY,
+            DensePoseDataRelative.I_KEY,
+            DensePoseDataRelative.U_KEY,
+            DensePoseDataRelative.V_KEY,
+            DensePoseDataRelative.S_KEY,
+        ]:
+            if key not in annotation:
+                return False, "no {key} data in the annotation".format(key=key)
+        return True, None
+
+    @staticmethod
+    def cleanup_annotation(annotation):
+        for key in [
+            DensePoseDataRelative.X_KEY,
+            DensePoseDataRelative.Y_KEY,
+            DensePoseDataRelative.I_KEY,
+            DensePoseDataRelative.U_KEY,
+            DensePoseDataRelative.V_KEY,
+            DensePoseDataRelative.S_KEY,
+        ]:
+            if key in annotation:
+                del annotation[key]
+
+    def apply_transform(self, transforms, densepose_transform_data):
+        self._transform_pts(transforms, densepose_transform_data)
+        self._transform_segm(transforms, densepose_transform_data)
+
+    def _transform_pts(self, transforms, dp_transform_data):
+        import detectron2.data.transforms as T
+
+        # NOTE: This assumes that HorizFlipTransform is the only one that does flip
+        do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+        if do_hflip:
+            self.x = self.segm.size(1) - self.x
+            self._flip_iuv_semantics(dp_transform_data)
+
+    def _flip_iuv_semantics(self, dp_transform_data: DensePoseTransformData) -> None:
+        i_old = self.i.clone()
+        uv_symmetries = dp_transform_data.uv_symmetries
+        pt_label_symmetries = dp_transform_data.point_label_symmetries
+        for i in range(self.N_PART_LABELS):
+            if i + 1 in i_old:
+                annot_indices_i = i_old == i + 1
+                if pt_label_symmetries[i + 1] != i + 1:
+                    self.i[annot_indices_i] = pt_label_symmetries[i + 1]
+                u_loc = (self.u[annot_indices_i] * 255).long()
+                v_loc = (self.v[annot_indices_i] * 255).long()
+                self.u[annot_indices_i] = uv_symmetries["U_transforms"][i][v_loc, u_loc].to(
+                    device=self.u.device
+                )
+                self.v[annot_indices_i] = uv_symmetries["V_transforms"][i][v_loc, u_loc].to(
+                    device=self.v.device
+                )
+
+    def _transform_segm(self, transforms, dp_transform_data):
+        import detectron2.data.transforms as T
+
+        # NOTE: This assumes that HorizFlipTransform is the only one that does flip
+        do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+        if do_hflip:
+            self.segm = torch.flip(self.segm, [1])
+            self._flip_segm_semantics(dp_transform_data)
+
+    def _flip_segm_semantics(self, dp_transform_data):
+        old_segm = self.segm.clone()
+        mask_label_symmetries = dp_transform_data.mask_label_symmetries
+        for i in range(self.N_BODY_PARTS):
+            if mask_label_symmetries[i + 1] != i + 1:
+                self.segm[old_segm == i + 1] = mask_label_symmetries[i + 1]
+
+
+def normalized_coords_transform(x0, y0, w, h):
+    """
+    Coordinates transform that maps top left corner to (-1, -1) and bottom
+    right corner to (1, 1). Used for torch.grid_sample to initialize the
+    grid
+    """
+
+    def f(p):
+        return (2 * (p[0] - x0) / w - 1, 2 * (p[1] - y0) / h - 1)
+
+    return f
+
+
+class DensePoseOutput(object):
+    def __init__(self, S, I, U, V, confidences):
+        """
+        Args:
+            S (`torch.Tensor`): coarse segmentation tensor of size (N, A, H, W)
+            I (`torch.Tensor`): fine segmentation tensor of size (N, C, H, W)
+            U (`torch.Tensor`): U coordinates for each fine segmentation label of size (N, C, H, W)
+            V (`torch.Tensor`): V coordinates for each fine segmentation label of size (N, C, H, W)
+            confidences (dict of str -> `torch.Tensor`) estimated confidence model parameters
+        """
+        self.S = S
+        self.I = I  # noqa: E741
+        self.U = U
+        self.V = V
+        self.confidences = confidences
+        self._check_output_dims(S, I, U, V)
+
+    def _check_output_dims(self, S, I, U, V):
+        assert (
+            len(S.size()) == 4
+        ), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format(
+            S.size()
+        )
+        assert (
+            len(I.size()) == 4
+        ), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format(
+            S.size()
+        )
+        assert (
+            len(U.size()) == 4
+        ), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format(
+            S.size()
+        )
+        assert (
+            len(V.size()) == 4
+        ), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format(
+            S.size()
+        )
+        assert len(S) == len(I), (
+            "Number of output segmentation planes {} "
+            "should be equal to the number of output part index "
+            "planes {}".format(len(S), len(I))
+        )
+        assert S.size()[2:] == I.size()[2:], (
+            "Output segmentation plane size {} "
+            "should be equal to the output part index "
+            "plane size {}".format(S.size()[2:], I.size()[2:])
+        )
+        assert I.size() == U.size(), (
+            "Part index output shape {} "
+            "should be the same as U coordinates output shape {}".format(I.size(), U.size())
+        )
+        assert I.size() == V.size(), (
+            "Part index output shape {} "
+            "should be the same as V coordinates output shape {}".format(I.size(), V.size())
+        )
+
+    def resize(self, image_size_hw):
+        # do nothing - outputs are invariant to resize
+        pass
+
+    def _crop(self, S, I, U, V, bbox_old_xywh, bbox_new_xywh):
+        """
+        Resample S, I, U, V from bbox_old to the cropped bbox_new
+        """
+        x0old, y0old, wold, hold = bbox_old_xywh
+        x0new, y0new, wnew, hnew = bbox_new_xywh
+        tr_coords = normalized_coords_transform(x0old, y0old, wold, hold)
+        topleft = (x0new, y0new)
+        bottomright = (x0new + wnew, y0new + hnew)
+        topleft_norm = tr_coords(topleft)
+        bottomright_norm = tr_coords(bottomright)
+        hsize = S.size(1)
+        wsize = S.size(2)
+        grid = torch.meshgrid(
+            torch.arange(
+                topleft_norm[1],
+                bottomright_norm[1],
+                (bottomright_norm[1] - topleft_norm[1]) / hsize,
+            )[:hsize],
+            torch.arange(
+                topleft_norm[0],
+                bottomright_norm[0],
+                (bottomright_norm[0] - topleft_norm[0]) / wsize,
+            )[:wsize],
+        )
+        grid = torch.stack(grid, dim=2).to(S.device)
+        assert (
+            grid.size(0) == hsize
+        ), "Resampled grid expected " "height={}, actual height={}".format(hsize, grid.size(0))
+        assert grid.size(1) == wsize, "Resampled grid expected " "width={}, actual width={}".format(
+            wsize, grid.size(1)
+        )
+        S_new = F.grid_sample(
+            S.unsqueeze(0),
+            torch.unsqueeze(grid, 0),
+            mode="bilinear",
+            padding_mode="border",
+            align_corners=True,
+        ).squeeze(0)
+        I_new = F.grid_sample(
+            I.unsqueeze(0),
+            torch.unsqueeze(grid, 0),
+            mode="bilinear",
+            padding_mode="border",
+            align_corners=True,
+        ).squeeze(0)
+        U_new = F.grid_sample(
+            U.unsqueeze(0),
+            torch.unsqueeze(grid, 0),
+            mode="bilinear",
+            padding_mode="border",
+            align_corners=True,
+        ).squeeze(0)
+        V_new = F.grid_sample(
+            V.unsqueeze(0),
+            torch.unsqueeze(grid, 0),
+            mode="bilinear",
+            padding_mode="border",
+            align_corners=True,
+        ).squeeze(0)
+        return S_new, I_new, U_new, V_new
+
+    def crop(self, indices_cropped, bboxes_old, bboxes_new):
+        """
+        Crop outputs for selected bounding boxes to the new bounding boxes.
+        """
+        # VK: cropping is ignored for now
+        # for i, ic in enumerate(indices_cropped):
+        #    self.S[ic], self.I[ic], self.U[ic], self.V[ic] = \
+        #        self._crop(self.S[ic], self.I[ic], self.U[ic], self.V[ic],
+        #        bboxes_old[i], bboxes_new[i])
+        pass
+
+    def hflip(self, transform_data: DensePoseTransformData) -> None:
+        """
+        Change S, I, U and V to take into account a Horizontal flip.
+        """
+        if self.I.shape[0] > 0:
+            for el in "SIUV":
+                self.__dict__[el] = torch.flip(self.__dict__[el], [3])
+            self._flip_iuv_semantics_tensor(transform_data)
+            self._flip_segm_semantics_tensor(transform_data)
+
+    def _flip_iuv_semantics_tensor(self, dp_transform_data: DensePoseTransformData) -> None:
+        point_label_symmetries = dp_transform_data.point_label_symmetries
+        uv_symmetries = dp_transform_data.uv_symmetries
+
+        N, C, H, W = self.U.shape
+        u_loc = (self.U[:, 1:, :, :].clamp(0, 1) * 255).long()
+        v_loc = (self.V[:, 1:, :, :].clamp(0, 1) * 255).long()
+        Iindex = torch.arange(C - 1, device=self.U.device)[None, :, None, None].expand(
+            N, C - 1, H, W
+        )
+        self.U[:, 1:, :, :] = uv_symmetries["U_transforms"][Iindex, v_loc, u_loc].to(
+            device=self.U.device
+        )
+        self.V[:, 1:, :, :] = uv_symmetries["V_transforms"][Iindex, v_loc, u_loc].to(
+            device=self.V.device
+        )
+
+        for el in "IUV":
+            self.__dict__[el] = self.__dict__[el][:, point_label_symmetries, :, :]
+
+    def _flip_segm_semantics_tensor(self, dp_transform_data):
+        if self.S.shape[1] == DensePoseDataRelative.N_BODY_PARTS + 1:
+            self.S = self.S[:, dp_transform_data.mask_label_symmetries, :, :]
+
+    def to_result(self, boxes_xywh):
+        """
+        Convert DensePose outputs to results format. Results are more compact,
+        but cannot be resampled any more
+        """
+        result = DensePoseResult(boxes_xywh, self.S, self.I, self.U, self.V)
+        return result
+
+    def __getitem__(self, item):
+        if isinstance(item, int):
+            S_selected = self.S[item].unsqueeze(0)
+            I_selected = self.I[item].unsqueeze(0)
+            U_selected = self.U[item].unsqueeze(0)
+            V_selected = self.V[item].unsqueeze(0)
+            conf_selected = {}
+            for key in self.confidences:
+                conf_selected[key] = self.confidences[key][item].unsqueeze(0)
+        else:
+            S_selected = self.S[item]
+            I_selected = self.I[item]
+            U_selected = self.U[item]
+            V_selected = self.V[item]
+            conf_selected = {}
+            for key in self.confidences:
+                conf_selected[key] = self.confidences[key][item]
+        return DensePoseOutput(S_selected, I_selected, U_selected, V_selected, conf_selected)
+
+    def __str__(self):
+        s = "DensePoseOutput S {}, I {}, U {}, V {}".format(
+            list(self.S.size()), list(self.I.size()), list(self.U.size()), list(self.V.size())
+        )
+        s_conf = "confidences: [{}]".format(
+            ", ".join([f"{key} {list(self.confidences[key].size())}" for key in self.confidences])
+        )
+        return ", ".join([s, s_conf])
+
+    def __len__(self):
+        return self.S.size(0)
+
+
+class DensePoseResult(object):
+    def __init__(self, boxes_xywh, S, I, U, V):
+        self.results = []
+        self.boxes_xywh = boxes_xywh.cpu().tolist()
+        assert len(boxes_xywh.size()) == 2
+        assert boxes_xywh.size(1) == 4
+        for i, box_xywh in enumerate(boxes_xywh):
+            result_i = self._output_to_result(box_xywh, S[[i]], I[[i]], U[[i]], V[[i]])
+            result_numpy_i = result_i.cpu().numpy()
+            result_encoded_i = DensePoseResult.encode_png_data(result_numpy_i)
+            result_encoded_with_shape_i = (result_numpy_i.shape, result_encoded_i)
+            self.results.append(result_encoded_with_shape_i)
+
+    def __str__(self):
+        s = "DensePoseResult: N={} [{}]".format(
+            len(self.results), ", ".join([str(list(r[0])) for r in self.results])
+        )
+        return s
+
+    def _output_to_result(self, box_xywh, S, I, U, V):
+        x, y, w, h = box_xywh
+        w = max(int(w), 1)
+        h = max(int(h), 1)
+        result = torch.zeros([3, h, w], dtype=torch.uint8, device=U.device)
+        assert (
+            len(S.size()) == 4
+        ), "AnnIndex tensor size should have {} " "dimensions but has {}".format(4, len(S.size()))
+        s_bbox = F.interpolate(S, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
+        assert (
+            len(I.size()) == 4
+        ), "IndexUV tensor size should have {} " "dimensions but has {}".format(4, len(S.size()))
+        i_bbox = (
+            F.interpolate(I, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
+            * (s_bbox > 0).long()
+        ).squeeze(0)
+        assert len(U.size()) == 4, "U tensor size should have {} " "dimensions but has {}".format(
+            4, len(U.size())
+        )
+        u_bbox = F.interpolate(U, (h, w), mode="bilinear", align_corners=False)
+        assert len(V.size()) == 4, "V tensor size should have {} " "dimensions but has {}".format(
+            4, len(V.size())
+        )
+        v_bbox = F.interpolate(V, (h, w), mode="bilinear", align_corners=False)
+        result[0] = i_bbox
+        for part_id in range(1, u_bbox.size(1)):
+            result[1][i_bbox == part_id] = (
+                (u_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8)
+            )
+            result[2][i_bbox == part_id] = (
+                (v_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8)
+            )
+        assert (
+            result.size(1) == h
+        ), "Results height {} should be equal" "to bounding box height {}".format(result.size(1), h)
+        assert (
+            result.size(2) == w
+        ), "Results width {} should be equal" "to bounding box width {}".format(result.size(2), w)
+        return result
+
+    @staticmethod
+    def encode_png_data(arr):
+        """
+        Encode array data as a PNG image using the highest compression rate
+        @param arr [in] Data stored in an array of size (3, M, N) of type uint8
+        @return Base64-encoded string containing PNG-compressed data
+        """
+        assert len(arr.shape) == 3, "Expected a 3D array as an input," " got a {0}D array".format(
+            len(arr.shape)
+        )
+        assert arr.shape[0] == 3, "Expected first array dimension of size 3," " got {0}".format(
+            arr.shape[0]
+        )
+        assert arr.dtype == np.uint8, "Expected an array of type np.uint8, " " got {0}".format(
+            arr.dtype
+        )
+        data = np.moveaxis(arr, 0, -1)
+        im = Image.fromarray(data)
+        fstream = BytesIO()
+        im.save(fstream, format="png", optimize=True)
+        s = base64.encodebytes(fstream.getvalue()).decode()
+        return s
+
+    @staticmethod
+    def decode_png_data(shape, s):
+        """
+        Decode array data from a string that contains PNG-compressed data
+        @param Base64-encoded string containing PNG-compressed data
+        @return Data stored in an array of size (3, M, N) of type uint8
+        """
+        fstream = BytesIO(base64.decodebytes(s.encode()))
+        im = Image.open(fstream)
+        data = np.moveaxis(np.array(im.getdata(), dtype=np.uint8), -1, 0)
+        return data.reshape(shape)
+
+    def __len__(self):
+        return len(self.results)
+
+    def __getitem__(self, item):
+        result_encoded = self.results[item]
+        bbox_xywh = self.boxes_xywh[item]
+        return result_encoded, bbox_xywh
+
+
+class DensePoseList(object):
+
+    _TORCH_DEVICE_CPU = torch.device("cpu")
+
+    def __init__(self, densepose_datas, boxes_xyxy_abs, image_size_hw, device=_TORCH_DEVICE_CPU):
+        assert len(densepose_datas) == len(
+            boxes_xyxy_abs
+        ), "Attempt to initialize DensePoseList with {} DensePose datas " "and {} boxes".format(
+            len(densepose_datas), len(boxes_xyxy_abs)
+        )
+        self.densepose_datas = []
+        for densepose_data in densepose_datas:
+            assert isinstance(densepose_data, DensePoseDataRelative) or densepose_data is None, (
+                "Attempt to initialize DensePoseList with DensePose datas "
+                "of type {}, expected DensePoseDataRelative".format(type(densepose_data))
+            )
+            densepose_data_ondevice = (
+                densepose_data.to(device) if densepose_data is not None else None
+            )
+            self.densepose_datas.append(densepose_data_ondevice)
+        self.boxes_xyxy_abs = boxes_xyxy_abs.to(device)
+        self.image_size_hw = image_size_hw
+        self.device = device
+
+    def to(self, device):
+        if self.device == device:
+            return self
+        return DensePoseList(self.densepose_datas, self.boxes_xyxy_abs, self.image_size_hw, device)
+
+    def __iter__(self):
+        return iter(self.densepose_datas)
+
+    def __len__(self):
+        return len(self.densepose_datas)
+
+    def __repr__(self):
+        s = self.__class__.__name__ + "("
+        s += "num_instances={}, ".format(len(self.densepose_datas))
+        s += "image_width={}, ".format(self.image_size_hw[1])
+        s += "image_height={})".format(self.image_size_hw[0])
+        return s
+
+    def __getitem__(self, item):
+        if isinstance(item, int):
+            densepose_data_rel = self.densepose_datas[item]
+            return densepose_data_rel
+        elif isinstance(item, slice):
+            densepose_datas_rel = self.densepose_datas[item]
+            boxes_xyxy_abs = self.boxes_xyxy_abs[item]
+            return DensePoseList(
+                densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
+            )
+        elif isinstance(item, torch.Tensor) and (item.dtype == torch.bool):
+            densepose_datas_rel = [self.densepose_datas[i] for i, x in enumerate(item) if x > 0]
+            boxes_xyxy_abs = self.boxes_xyxy_abs[item]
+            return DensePoseList(
+                densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
+            )
+        else:
+            densepose_datas_rel = [self.densepose_datas[i] for i in item]
+            boxes_xyxy_abs = self.boxes_xyxy_abs[item]
+            return DensePoseList(
+                densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
+            )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/densepose_coco_evaluation.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/densepose_coco_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..489e7b006da436531e37ebeb1f01f13bad60874d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/densepose_coco_evaluation.py
@@ -0,0 +1,1138 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# This is a modified version of cocoeval.py where we also have the densepose evaluation.
+
+__author__ = "tsungyi"
+
+import copy
+import datetime
+import itertools
+import logging
+import numpy as np
+import pickle
+import time
+from collections import defaultdict
+from enum import Enum
+from typing import Any, Dict, Tuple
+import scipy.spatial.distance as ssd
+from fvcore.common.file_io import PathManager
+from pycocotools import mask as maskUtils
+from scipy.io import loadmat
+from scipy.ndimage import zoom as spzoom
+
+from .data.structures import DensePoseDataRelative, DensePoseResult
+
+logger = logging.getLogger(__name__)
+
+
+class DensePoseEvalMode(str, Enum):
+    # use both masks and geodesic distances (GPS * IOU) to compute scores
+    GPSM = "gpsm"
+    # use only geodesic distances (GPS)  to compute scores
+    GPS = "gps"
+    # use only masks (IOU) to compute scores
+    IOU = "iou"
+
+
+class DensePoseDataMode(str, Enum):
+    # use estimated IUV data (default mode)
+    IUV_DT = "iuvdt"
+    # use ground truth IUV data
+    IUV_GT = "iuvgt"
+    # use ground truth labels I and set UV to 0
+    I_GT_UV_0 = "igtuv0"
+    # use ground truth labels I and estimated UV coordinates
+    I_GT_UV_DT = "igtuvdt"
+    # use estimated labels I and set UV to 0
+    I_DT_UV_0 = "idtuv0"
+
+
+class DensePoseCocoEval(object):
+    # Interface for evaluating detection on the Microsoft COCO dataset.
+    #
+    # The usage for CocoEval is as follows:
+    #  cocoGt=..., cocoDt=...       # load dataset and results
+    #  E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object
+    #  E.params.recThrs = ...;      # set parameters as desired
+    #  E.evaluate();                # run per image evaluation
+    #  E.accumulate();              # accumulate per image results
+    #  E.summarize();               # display summary metrics of results
+    # For example usage see evalDemo.m and http://mscoco.org/.
+    #
+    # The evaluation parameters are as follows (defaults in brackets):
+    #  imgIds     - [all] N demo ids to use for evaluation
+    #  catIds     - [all] K cat ids to use for evaluation
+    #  iouThrs    - [.5:.05:.95] T=10 IoU thresholds for evaluation
+    #  recThrs    - [0:.01:1] R=101 recall thresholds for evaluation
+    #  areaRng    - [...] A=4 object area ranges for evaluation
+    #  maxDets    - [1 10 100] M=3 thresholds on max detections per image
+    #  iouType    - ['segm'] set iouType to 'segm', 'bbox', 'keypoints' or 'densepose'
+    #  iouType replaced the now DEPRECATED useSegm parameter.
+    #  useCats    - [1] if true use category labels for evaluation
+    # Note: if useCats=0 category labels are ignored as in proposal scoring.
+    # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
+    #
+    # evaluate(): evaluates detections on every image and every category and
+    # concats the results into the "evalImgs" with fields:
+    #  dtIds      - [1xD] id for each of the D detections (dt)
+    #  gtIds      - [1xG] id for each of the G ground truths (gt)
+    #  dtMatches  - [TxD] matching gt id at each IoU or 0
+    #  gtMatches  - [TxG] matching dt id at each IoU or 0
+    #  dtScores   - [1xD] confidence of each dt
+    #  gtIgnore   - [1xG] ignore flag for each gt
+    #  dtIgnore   - [TxD] ignore flag for each dt at each IoU
+    #
+    # accumulate(): accumulates the per-image, per-category evaluation
+    # results in "evalImgs" into the dictionary "eval" with fields:
+    #  params     - parameters used for evaluation
+    #  date       - date evaluation was performed
+    #  counts     - [T,R,K,A,M] parameter dimensions (see above)
+    #  precision  - [TxRxKxAxM] precision for every evaluation setting
+    #  recall     - [TxKxAxM] max recall for every evaluation setting
+    # Note: precision and recall==-1 for settings with no gt objects.
+    #
+    # See also coco, mask, pycocoDemo, pycocoEvalDemo
+    #
+    # Microsoft COCO Toolbox.      version 2.0
+    # Data, paper, and tutorials available at:  http://mscoco.org/
+    # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
+    # Licensed under the Simplified BSD License [see coco/license.txt]
+    def __init__(
+        self,
+        cocoGt=None,
+        cocoDt=None,
+        iouType: str = "densepose",
+        dpEvalMode: DensePoseEvalMode = DensePoseEvalMode.GPS,
+        dpDataMode: DensePoseDataMode = DensePoseDataMode.IUV_DT,
+    ):
+        """
+        Initialize CocoEval using coco APIs for gt and dt
+        :param cocoGt: coco object with ground truth annotations
+        :param cocoDt: coco object with detection results
+        :return: None
+        """
+        self.cocoGt = cocoGt  # ground truth COCO API
+        self.cocoDt = cocoDt  # detections COCO API
+        self._dpEvalMode = dpEvalMode
+        self._dpDataMode = dpDataMode
+        self.params = {}  # evaluation parameters
+        self.evalImgs = defaultdict(list)  # per-image per-category eval results [KxAxI]
+        self.eval = {}  # accumulated evaluation results
+        self._gts = defaultdict(list)  # gt for evaluation
+        self._dts = defaultdict(list)  # dt for evaluation
+        self.params = Params(iouType=iouType)  # parameters
+        self._paramsEval = {}  # parameters for evaluation
+        self.stats = []  # result summarization
+        self.ious = {}  # ious between all gts and dts
+        if cocoGt is not None:
+            self.params.imgIds = sorted(cocoGt.getImgIds())
+            self.params.catIds = sorted(cocoGt.getCatIds())
+        self.ignoreThrBB = 0.7
+        self.ignoreThrUV = 0.9
+
+    def _loadGEval(self):
+        smpl_subdiv_fpath = PathManager.get_local_path(
+            "https://dl.fbaipublicfiles.com/densepose/data/SMPL_subdiv.mat"
+        )
+        pdist_transform_fpath = PathManager.get_local_path(
+            "https://dl.fbaipublicfiles.com/densepose/data/SMPL_SUBDIV_TRANSFORM.mat"
+        )
+        pdist_matrix_fpath = PathManager.get_local_path(
+            "https://dl.fbaipublicfiles.com/densepose/data/Pdist_matrix.pkl", timeout_sec=120
+        )
+        SMPL_subdiv = loadmat(smpl_subdiv_fpath)
+        self.PDIST_transform = loadmat(pdist_transform_fpath)
+        self.PDIST_transform = self.PDIST_transform["index"].squeeze()
+        UV = np.array([SMPL_subdiv["U_subdiv"], SMPL_subdiv["V_subdiv"]]).squeeze()
+        ClosestVertInds = np.arange(UV.shape[1]) + 1
+        self.Part_UVs = []
+        self.Part_ClosestVertInds = []
+        for i in np.arange(24):
+            self.Part_UVs.append(UV[:, SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)])
+            self.Part_ClosestVertInds.append(
+                ClosestVertInds[SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)]
+            )
+
+        with open(pdist_matrix_fpath, "rb") as hFile:
+            arrays = pickle.load(hFile, encoding="latin1")
+        self.Pdist_matrix = arrays["Pdist_matrix"]
+        self.Part_ids = np.array(SMPL_subdiv["Part_ID_subdiv"].squeeze())
+        # Mean geodesic distances for parts.
+        self.Mean_Distances = np.array([0, 0.351, 0.107, 0.126, 0.237, 0.173, 0.142, 0.128, 0.150])
+        # Coarse Part labels.
+        self.CoarseParts = np.array(
+            [0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8]
+        )
+
+    def _prepare(self):
+        """
+        Prepare ._gts and ._dts for evaluation based on params
+        :return: None
+        """
+
+        def _toMask(anns, coco):
+            # modify ann['segmentation'] by reference
+            for ann in anns:
+                rle = coco.annToRLE(ann)
+                ann["segmentation"] = rle
+
+        def _getIgnoreRegion(iid, coco):
+            img = coco.imgs[iid]
+
+            if "ignore_regions_x" not in img.keys():
+                return None
+
+            if len(img["ignore_regions_x"]) == 0:
+                return None
+
+            rgns_merged = []
+            for region_x, region_y in zip(img["ignore_regions_x"], img["ignore_regions_y"]):
+                rgns = [iter(region_x), iter(region_y)]
+                rgns_merged.append([next(it) for it in itertools.cycle(rgns)])
+            rles = maskUtils.frPyObjects(rgns_merged, img["height"], img["width"])
+            rle = maskUtils.merge(rles)
+            return maskUtils.decode(rle)
+
+        def _checkIgnore(dt, iregion):
+            if iregion is None:
+                return True
+
+            bb = np.array(dt["bbox"]).astype(np.int)
+            x1, y1, x2, y2 = bb[0], bb[1], bb[0] + bb[2], bb[1] + bb[3]
+            x2 = min([x2, iregion.shape[1]])
+            y2 = min([y2, iregion.shape[0]])
+
+            if bb[2] * bb[3] == 0:
+                return False
+
+            crop_iregion = iregion[y1:y2, x1:x2]
+
+            if crop_iregion.sum() == 0:
+                return True
+
+            if "densepose" not in dt.keys():  # filtering boxes
+                return crop_iregion.sum() / bb[2] / bb[3] < self.ignoreThrBB
+
+            # filtering UVs
+            ignoremask = np.require(crop_iregion, requirements=["F"])
+            mask = self._extract_mask(dt)
+            uvmask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"])
+            uvmask_ = maskUtils.encode(uvmask)
+            ignoremask_ = maskUtils.encode(ignoremask)
+            uviou = maskUtils.iou([uvmask_], [ignoremask_], [1])[0]
+            return uviou < self.ignoreThrUV
+
+        p = self.params
+
+        if p.useCats:
+            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
+            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
+        else:
+            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
+            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
+
+        imns = self.cocoGt.loadImgs(p.imgIds)
+        self.size_mapping = {}
+        for im in imns:
+            self.size_mapping[im["id"]] = [im["height"], im["width"]]
+
+        # if iouType == 'uv', add point gt annotations
+        if p.iouType == "densepose":
+            self._loadGEval()
+
+        # convert ground truth to mask if iouType == 'segm'
+        if p.iouType == "segm":
+            _toMask(gts, self.cocoGt)
+            _toMask(dts, self.cocoDt)
+
+        # set ignore flag
+        for gt in gts:
+            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
+            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
+            if p.iouType == "keypoints":
+                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
+            if p.iouType == "densepose":
+                gt["ignore"] = ("dp_x" in gt) == 0
+
+        self._gts = defaultdict(list)  # gt for evaluation
+        self._dts = defaultdict(list)  # dt for evaluation
+        self._igrgns = defaultdict(list)
+
+        for gt in gts:
+            iid = gt["image_id"]
+            if iid not in self._igrgns.keys():
+                self._igrgns[iid] = _getIgnoreRegion(iid, self.cocoGt)
+            if _checkIgnore(gt, self._igrgns[iid]):
+                self._gts[iid, gt["category_id"]].append(gt)
+        for dt in dts:
+            iid = dt["image_id"]
+            if (iid not in self._igrgns) or _checkIgnore(dt, self._igrgns[iid]):
+                self._dts[iid, dt["category_id"]].append(dt)
+
+        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
+        self.eval = {}  # accumulated evaluation results
+
+    def evaluate(self):
+        """
+        Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
+        :return: None
+        """
+        tic = time.time()
+        logger.info("Running per image DensePose evaluation... {}".format(self.params.iouType))
+        p = self.params
+        # add backward compatibility if useSegm is specified in params
+        if p.useSegm is not None:
+            p.iouType = "segm" if p.useSegm == 1 else "bbox"
+            logger.info("useSegm (deprecated) is not None. Running DensePose evaluation")
+        p.imgIds = list(np.unique(p.imgIds))
+        if p.useCats:
+            p.catIds = list(np.unique(p.catIds))
+        p.maxDets = sorted(p.maxDets)
+        self.params = p
+
+        self._prepare()
+        # loop through images, area range, max detection number
+        catIds = p.catIds if p.useCats else [-1]
+
+        if p.iouType in ["segm", "bbox"]:
+            computeIoU = self.computeIoU
+        elif p.iouType == "keypoints":
+            computeIoU = self.computeOks
+        elif p.iouType == "densepose":
+            computeIoU = self.computeOgps
+            if self._dpEvalMode == DensePoseEvalMode.GPSM:
+                self.real_ious = {
+                    (imgId, catId): self.computeDPIoU(imgId, catId)
+                    for imgId in p.imgIds
+                    for catId in catIds
+                }
+
+        self.ious = {
+            (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds
+        }
+
+        evaluateImg = self.evaluateImg
+        maxDet = p.maxDets[-1]
+        self.evalImgs = [
+            evaluateImg(imgId, catId, areaRng, maxDet)
+            for catId in catIds
+            for areaRng in p.areaRng
+            for imgId in p.imgIds
+        ]
+        self._paramsEval = copy.deepcopy(self.params)
+        toc = time.time()
+        logger.info("DensePose evaluation DONE (t={:0.2f}s).".format(toc - tic))
+
+    def getDensePoseMask(self, polys):
+        maskGen = np.zeros([256, 256])
+        for i in range(1, 15):
+            if polys[i - 1]:
+                currentMask = maskUtils.decode(polys[i - 1])
+                maskGen[currentMask > 0] = i
+        return maskGen
+
+    def _generate_rlemask_on_image(self, mask, imgId, data):
+        bbox_xywh = np.array(data["bbox"])
+        x, y, w, h = bbox_xywh
+        im_h, im_w = self.size_mapping[imgId]
+        im_mask = np.zeros((im_h, im_w), dtype=np.uint8)
+        if mask is not None:
+            x0 = max(int(x), 0)
+            x1 = min(int(x + w), im_w, int(x) + mask.shape[1])
+            y0 = max(int(y), 0)
+            y1 = min(int(y + h), im_h, int(y) + mask.shape[0])
+            y = int(y)
+            x = int(x)
+            im_mask[y0:y1, x0:x1] = mask[y0 - y : y1 - y, x0 - x : x1 - x]
+        im_mask = np.require(np.asarray(im_mask > 0), dtype=np.uint8, requirements=["F"])
+        rle_mask = maskUtils.encode(np.array(im_mask[:, :, np.newaxis], order="F"))[0]
+        return rle_mask
+
+    def computeDPIoU(self, imgId, catId):
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return []
+        inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
+        dt = [dt[i] for i in inds]
+        if len(dt) > p.maxDets[-1]:
+            dt = dt[0 : p.maxDets[-1]]
+
+        gtmasks = []
+        for g in gt:
+            if DensePoseDataRelative.S_KEY in g:
+                mask = self.getDensePoseMask(g[DensePoseDataRelative.S_KEY])
+                _, _, w, h = g["bbox"]
+                scale_x = float(max(w, 1)) / mask.shape[1]
+                scale_y = float(max(h, 1)) / mask.shape[0]
+                mask = spzoom(mask, (scale_y, scale_x), order=1, prefilter=False)
+                mask = np.array(mask > 0.5, dtype=np.uint8)
+                rle_mask = self._generate_rlemask_on_image(mask, imgId, g)
+            elif "segmentation" in g:
+                segmentation = g["segmentation"]
+                if isinstance(segmentation, list) and segmentation:
+                    # polygons
+                    im_h, im_w = self.size_mapping[imgId]
+                    rles = maskUtils.frPyObjects(segmentation, im_h, im_w)
+                    rle_mask = maskUtils.merge(rles)
+                elif isinstance(segmentation, dict):
+                    if isinstance(segmentation["counts"], list):
+                        # uncompressed RLE
+                        im_h, im_w = self.size_mapping[imgId]
+                        rle_mask = maskUtils.frPyObjects(segmentation, im_h, im_w)
+                    else:
+                        # compressed RLE
+                        rle_mask = segmentation
+                else:
+                    rle_mask = self._generate_rlemask_on_image(None, imgId, g)
+            else:
+                rle_mask = self._generate_rlemask_on_image(None, imgId, g)
+            gtmasks.append(rle_mask)
+
+        dtmasks = []
+        for d in dt:
+            mask = self._extract_mask(d)
+            mask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"])
+            rle_mask = self._generate_rlemask_on_image(mask, imgId, d)
+            dtmasks.append(rle_mask)
+
+        # compute iou between each dt and gt region
+        iscrowd = [int(o["iscrowd"]) for o in gt]
+        iousDP = maskUtils.iou(dtmasks, gtmasks, iscrowd)
+        return iousDP
+
+    def computeIoU(self, imgId, catId):
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return []
+        inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
+        dt = [dt[i] for i in inds]
+        if len(dt) > p.maxDets[-1]:
+            dt = dt[0 : p.maxDets[-1]]
+
+        if p.iouType == "segm":
+            g = [g["segmentation"] for g in gt]
+            d = [d["segmentation"] for d in dt]
+        elif p.iouType == "bbox":
+            g = [g["bbox"] for g in gt]
+            d = [d["bbox"] for d in dt]
+        else:
+            raise Exception("unknown iouType for iou computation")
+
+        # compute iou between each dt and gt region
+        iscrowd = [int(o["iscrowd"]) for o in gt]
+        ious = maskUtils.iou(d, g, iscrowd)
+        return ious
+
+    def computeOks(self, imgId, catId):
+        p = self.params
+        # dimension here should be Nxm
+        gts = self._gts[imgId, catId]
+        dts = self._dts[imgId, catId]
+        inds = np.argsort([-d["score"] for d in dts], kind="mergesort")
+        dts = [dts[i] for i in inds]
+        if len(dts) > p.maxDets[-1]:
+            dts = dts[0 : p.maxDets[-1]]
+        # if len(gts) == 0 and len(dts) == 0:
+        if len(gts) == 0 or len(dts) == 0:
+            return []
+        ious = np.zeros((len(dts), len(gts)))
+        sigmas = (
+            np.array(
+                [
+                    0.26,
+                    0.25,
+                    0.25,
+                    0.35,
+                    0.35,
+                    0.79,
+                    0.79,
+                    0.72,
+                    0.72,
+                    0.62,
+                    0.62,
+                    1.07,
+                    1.07,
+                    0.87,
+                    0.87,
+                    0.89,
+                    0.89,
+                ]
+            )
+            / 10.0
+        )
+        vars = (sigmas * 2) ** 2
+        k = len(sigmas)
+        # compute oks between each detection and ground truth object
+        for j, gt in enumerate(gts):
+            # create bounds for ignore regions(double the gt bbox)
+            g = np.array(gt["keypoints"])
+            xg = g[0::3]
+            yg = g[1::3]
+            vg = g[2::3]
+            k1 = np.count_nonzero(vg > 0)
+            bb = gt["bbox"]
+            x0 = bb[0] - bb[2]
+            x1 = bb[0] + bb[2] * 2
+            y0 = bb[1] - bb[3]
+            y1 = bb[1] + bb[3] * 2
+            for i, dt in enumerate(dts):
+                d = np.array(dt["keypoints"])
+                xd = d[0::3]
+                yd = d[1::3]
+                if k1 > 0:
+                    # measure the per-keypoint distance if keypoints visible
+                    dx = xd - xg
+                    dy = yd - yg
+                else:
+                    # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
+                    z = np.zeros(k)
+                    dx = np.max((z, x0 - xd), axis=0) + np.max((z, xd - x1), axis=0)
+                    dy = np.max((z, y0 - yd), axis=0) + np.max((z, yd - y1), axis=0)
+                e = (dx ** 2 + dy ** 2) / vars / (gt["area"] + np.spacing(1)) / 2
+                if k1 > 0:
+                    e = e[vg > 0]
+                ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
+        return ious
+
+    def _extract_mask(self, dt: Dict[str, Any]) -> np.ndarray:
+        (densepose_shape, densepose_data_encoded), densepose_bbox_xywh = dt["densepose"]
+        densepose_data = DensePoseResult.decode_png_data(densepose_shape, densepose_data_encoded)
+        return densepose_data[0]
+
+    def _extract_iuv(
+        self, densepose_data: np.ndarray, py: np.ndarray, px: np.ndarray, gt: Dict[str, Any]
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Extract arrays of I, U and V values at given points as numpy arrays
+        given the data mode stored in self._dpDataMode
+        """
+        if self._dpDataMode == DensePoseDataMode.IUV_DT:
+            # estimated labels and UV (default)
+            ipoints = densepose_data[0, py, px]
+            upoints = densepose_data[1, py, px] / 255.0  # convert from uint8 by /255.
+            vpoints = densepose_data[2, py, px] / 255.0
+        elif self._dpDataMode == DensePoseDataMode.IUV_GT:
+            # ground truth
+            ipoints = np.array(gt["dp_I"])
+            upoints = np.array(gt["dp_U"])
+            vpoints = np.array(gt["dp_V"])
+        elif self._dpDataMode == DensePoseDataMode.I_GT_UV_0:
+            # ground truth labels, UV = 0
+            ipoints = np.array(gt["dp_I"])
+            upoints = upoints * 0.0
+            vpoints = vpoints * 0.0
+        elif self._dpDataMode == DensePoseDataMode.I_GT_UV_DT:
+            # ground truth labels, estimated UV
+            ipoints = np.array(gt["dp_I"])
+            upoints = densepose_data[1, py, px] / 255.0  # convert from uint8 by /255.
+            vpoints = densepose_data[2, py, px] / 255.0
+        elif self._dpDataMode == DensePoseDataMode.I_DT_UV_0:
+            # estimated labels, UV = 0
+            ipoints = densepose_data[0, py, px]
+            upoints = upoints * 0.0
+            vpoints = vpoints * 0.0
+        else:
+            raise ValueError(f"Unknown data mode: {self._dpDataMode}")
+        return ipoints, upoints, vpoints
+
+    def computeOgps(self, imgId, catId):
+        p = self.params
+        # dimension here should be Nxm
+        g = self._gts[imgId, catId]
+        d = self._dts[imgId, catId]
+        inds = np.argsort([-d_["score"] for d_ in d], kind="mergesort")
+        d = [d[i] for i in inds]
+        if len(d) > p.maxDets[-1]:
+            d = d[0 : p.maxDets[-1]]
+        # if len(gts) == 0 and len(dts) == 0:
+        if len(g) == 0 or len(d) == 0:
+            return []
+        ious = np.zeros((len(d), len(g)))
+        # compute opgs between each detection and ground truth object
+        # sigma = self.sigma #0.255 # dist = 0.3m corresponds to ogps = 0.5
+        # 1 # dist = 0.3m corresponds to ogps = 0.96
+        # 1.45 # dist = 1.7m (person height) corresponds to ogps = 0.5)
+        for j, gt in enumerate(g):
+            if not gt["ignore"]:
+                g_ = gt["bbox"]
+                for i, dt in enumerate(d):
+                    #
+                    dy = int(dt["bbox"][3])
+                    dx = int(dt["bbox"][2])
+                    dp_x = np.array(gt["dp_x"]) * g_[2] / 255.0
+                    dp_y = np.array(gt["dp_y"]) * g_[3] / 255.0
+                    py = (dp_y + g_[1] - dt["bbox"][1]).astype(np.int)
+                    px = (dp_x + g_[0] - dt["bbox"][0]).astype(np.int)
+                    #
+                    pts = np.zeros(len(px))
+                    pts[px >= dx] = -1
+                    pts[py >= dy] = -1
+                    pts[px < 0] = -1
+                    pts[py < 0] = -1
+                    if len(pts) < 1:
+                        ogps = 0.0
+                    elif np.max(pts) == -1:
+                        ogps = 0.0
+                    else:
+                        px[pts == -1] = 0
+                        py[pts == -1] = 0
+                        (densepose_shape, densepose_data_encoded), densepose_bbox_xywh = dt[
+                            "densepose"
+                        ]
+                        densepose_data = DensePoseResult.decode_png_data(
+                            densepose_shape, densepose_data_encoded
+                        )
+                        assert densepose_data.shape[2] == dx, (
+                            "DensePoseData width {} should be equal to "
+                            "detection bounding box width {}".format(densepose_data.shape[2], dx)
+                        )
+                        assert densepose_data.shape[1] == dy, (
+                            "DensePoseData height {} should be equal to "
+                            "detection bounding box height {}".format(densepose_data.shape[1], dy)
+                        )
+                        ipoints, upoints, vpoints = self._extract_iuv(densepose_data, py, px, gt)
+                        ipoints[pts == -1] = 0
+                        # Find closest vertices in subsampled mesh.
+                        cVerts, cVertsGT = self.findAllClosestVerts(gt, upoints, vpoints, ipoints)
+                        # Get pairwise geodesic distances between gt and estimated mesh points.
+                        dist = self.getDistances(cVertsGT, cVerts)
+                        # Compute the Ogps measure.
+                        # Find the mean geodesic normalization distance for
+                        # each GT point, based on which part it is on.
+                        Current_Mean_Distances = self.Mean_Distances[
+                            self.CoarseParts[self.Part_ids[cVertsGT[cVertsGT > 0].astype(int) - 1]]
+                        ]
+                        # Compute gps
+                        ogps_values = np.exp(-(dist ** 2) / (2 * (Current_Mean_Distances ** 2)))
+                        #
+                        if len(dist) > 0:
+                            ogps = np.sum(ogps_values) / len(dist)
+                    ious[i, j] = ogps
+
+        gbb = [gt["bbox"] for gt in g]
+        dbb = [dt["bbox"] for dt in d]
+
+        # compute iou between each dt and gt region
+        iscrowd = [int(o["iscrowd"]) for o in g]
+        ious_bb = maskUtils.iou(dbb, gbb, iscrowd)
+        return ious, ious_bb
+
+    def evaluateImg(self, imgId, catId, aRng, maxDet):
+        """
+        perform evaluation for single category and image
+        :return: dict (single image results)
+        """
+
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return None
+
+        for g in gt:
+            # g['_ignore'] = g['ignore']
+            if g["ignore"] or (g["area"] < aRng[0] or g["area"] > aRng[1]):
+                g["_ignore"] = True
+            else:
+                g["_ignore"] = False
+
+        # sort dt highest score first, sort gt ignore last
+        gtind = np.argsort([g["_ignore"] for g in gt], kind="mergesort")
+        gt = [gt[i] for i in gtind]
+        dtind = np.argsort([-d["score"] for d in dt], kind="mergesort")
+        dt = [dt[i] for i in dtind[0:maxDet]]
+        iscrowd = [int(o["iscrowd"]) for o in gt]
+        # load computed ious
+        if p.iouType == "densepose":
+            # print('Checking the length', len(self.ious[imgId, catId]))
+            # if len(self.ious[imgId, catId]) == 0:
+            #    print(self.ious[imgId, catId])
+            ious = (
+                self.ious[imgId, catId][0][:, gtind]
+                if len(self.ious[imgId, catId]) > 0
+                else self.ious[imgId, catId]
+            )
+            ioubs = (
+                self.ious[imgId, catId][1][:, gtind]
+                if len(self.ious[imgId, catId]) > 0
+                else self.ious[imgId, catId]
+            )
+            if self._dpEvalMode == DensePoseEvalMode.GPSM:
+                iousM = (
+                    self.real_ious[imgId, catId][:, gtind]
+                    if len(self.real_ious[imgId, catId]) > 0
+                    else self.real_ious[imgId, catId]
+                )
+        else:
+            ious = (
+                self.ious[imgId, catId][:, gtind]
+                if len(self.ious[imgId, catId]) > 0
+                else self.ious[imgId, catId]
+            )
+
+        T = len(p.iouThrs)
+        G = len(gt)
+        D = len(dt)
+        gtm = np.zeros((T, G))
+        dtm = np.zeros((T, D))
+        gtIg = np.array([g["_ignore"] for g in gt])
+        dtIg = np.zeros((T, D))
+        if np.all(gtIg) and p.iouType == "densepose":
+            dtIg = np.logical_or(dtIg, True)
+
+        if len(ious) > 0:  # and not p.iouType == 'densepose':
+            for tind, t in enumerate(p.iouThrs):
+                for dind, d in enumerate(dt):
+                    # information about best match so far (m=-1 -> unmatched)
+                    iou = min([t, 1 - 1e-10])
+                    m = -1
+                    for gind, _g in enumerate(gt):
+                        # if this gt already matched, and not a crowd, continue
+                        if gtm[tind, gind] > 0 and not iscrowd[gind]:
+                            continue
+                        # if dt matched to reg gt, and on ignore gt, stop
+                        if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
+                            break
+                        if p.iouType == "densepose":
+                            if self._dpEvalMode == DensePoseEvalMode.GPSM:
+                                new_iou = np.sqrt(iousM[dind, gind] * ious[dind, gind])
+                            elif self._dpEvalMode == DensePoseEvalMode.IOU:
+                                new_iou = iousM[dind, gind]
+                            elif self._dpEvalMode == DensePoseEvalMode.GPS:
+                                new_iou = ious[dind, gind]
+                        else:
+                            new_iou = ious[dind, gind]
+                        if new_iou < iou:
+                            continue
+                        if new_iou == 0.0:
+                            continue
+                        # if match successful and best so far, store appropriately
+                        iou = new_iou
+                        m = gind
+                    # if match made store id of match for both dt and gt
+                    if m == -1:
+                        continue
+                    dtIg[tind, dind] = gtIg[m]
+                    dtm[tind, dind] = gt[m]["id"]
+                    gtm[tind, m] = d["id"]
+
+        if p.iouType == "densepose":
+            if not len(ioubs) == 0:
+                for dind, d in enumerate(dt):
+                    # information about best match so far (m=-1 -> unmatched)
+                    if dtm[tind, dind] == 0:
+                        ioub = 0.8
+                        m = -1
+                        for gind, _g in enumerate(gt):
+                            # if this gt already matched, and not a crowd, continue
+                            if gtm[tind, gind] > 0 and not iscrowd[gind]:
+                                continue
+                            # continue to next gt unless better match made
+                            if ioubs[dind, gind] < ioub:
+                                continue
+                            # if match successful and best so far, store appropriately
+                            ioub = ioubs[dind, gind]
+                            m = gind
+                            # if match made store id of match for both dt and gt
+                        if m > -1:
+                            dtIg[:, dind] = gtIg[m]
+                            if gtIg[m]:
+                                dtm[tind, dind] = gt[m]["id"]
+                                gtm[tind, m] = d["id"]
+        # set unmatched detections outside of area range to ignore
+        a = np.array([d["area"] < aRng[0] or d["area"] > aRng[1] for d in dt]).reshape((1, len(dt)))
+        dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0)))
+        # store results for given image and category
+        # print('Done with the function', len(self.ious[imgId, catId]))
+        return {
+            "image_id": imgId,
+            "category_id": catId,
+            "aRng": aRng,
+            "maxDet": maxDet,
+            "dtIds": [d["id"] for d in dt],
+            "gtIds": [g["id"] for g in gt],
+            "dtMatches": dtm,
+            "gtMatches": gtm,
+            "dtScores": [d["score"] for d in dt],
+            "gtIgnore": gtIg,
+            "dtIgnore": dtIg,
+        }
+
+    def accumulate(self, p=None):
+        """
+        Accumulate per image evaluation results and store the result in self.eval
+        :param p: input params for evaluation
+        :return: None
+        """
+        logger.info("Accumulating evaluation results...")
+        tic = time.time()
+        if not self.evalImgs:
+            logger.info("Please run evaluate() first")
+        # allows input customized parameters
+        if p is None:
+            p = self.params
+        p.catIds = p.catIds if p.useCats == 1 else [-1]
+        T = len(p.iouThrs)
+        R = len(p.recThrs)
+        K = len(p.catIds) if p.useCats else 1
+        A = len(p.areaRng)
+        M = len(p.maxDets)
+        precision = -(np.ones((T, R, K, A, M)))  # -1 for the precision of absent categories
+        recall = -(np.ones((T, K, A, M)))
+
+        # create dictionary for future indexing
+        logger.info("Categories: {}".format(p.catIds))
+        _pe = self._paramsEval
+        catIds = _pe.catIds if _pe.useCats else [-1]
+        setK = set(catIds)
+        setA = set(map(tuple, _pe.areaRng))
+        setM = set(_pe.maxDets)
+        setI = set(_pe.imgIds)
+        # get inds to evaluate
+        k_list = [n for n, k in enumerate(p.catIds) if k in setK]
+        m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
+        a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
+        i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
+        I0 = len(_pe.imgIds)
+        A0 = len(_pe.areaRng)
+        # retrieve E at each category, area range, and max number of detections
+        for k, k0 in enumerate(k_list):
+            Nk = k0 * A0 * I0
+            for a, a0 in enumerate(a_list):
+                Na = a0 * I0
+                for m, maxDet in enumerate(m_list):
+                    E = [self.evalImgs[Nk + Na + i] for i in i_list]
+                    E = [e for e in E if e is not None]
+                    if len(E) == 0:
+                        continue
+                    dtScores = np.concatenate([e["dtScores"][0:maxDet] for e in E])
+
+                    # different sorting method generates slightly different results.
+                    # mergesort is used to be consistent as Matlab implementation.
+                    inds = np.argsort(-dtScores, kind="mergesort")
+
+                    dtm = np.concatenate([e["dtMatches"][:, 0:maxDet] for e in E], axis=1)[:, inds]
+                    dtIg = np.concatenate([e["dtIgnore"][:, 0:maxDet] for e in E], axis=1)[:, inds]
+                    gtIg = np.concatenate([e["gtIgnore"] for e in E])
+                    npig = np.count_nonzero(gtIg == 0)
+                    if npig == 0:
+                        continue
+                    tps = np.logical_and(dtm, np.logical_not(dtIg))
+                    fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg))
+                    tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float)
+                    fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float)
+                    for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
+                        tp = np.array(tp)
+                        fp = np.array(fp)
+                        nd = len(tp)
+                        rc = tp / npig
+                        pr = tp / (fp + tp + np.spacing(1))
+                        q = np.zeros((R,))
+
+                        if nd:
+                            recall[t, k, a, m] = rc[-1]
+                        else:
+                            recall[t, k, a, m] = 0
+
+                        # numpy is slow without cython optimization for accessing elements
+                        # use python array gets significant speed improvement
+                        pr = pr.tolist()
+                        q = q.tolist()
+
+                        for i in range(nd - 1, 0, -1):
+                            if pr[i] > pr[i - 1]:
+                                pr[i - 1] = pr[i]
+
+                        inds = np.searchsorted(rc, p.recThrs, side="left")
+                        try:
+                            for ri, pi in enumerate(inds):
+                                q[ri] = pr[pi]
+                        except Exception:
+                            pass
+                        precision[t, :, k, a, m] = np.array(q)
+        logger.info(
+            "Final: max precision {}, min precision {}".format(np.max(precision), np.min(precision))
+        )
+        self.eval = {
+            "params": p,
+            "counts": [T, R, K, A, M],
+            "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "precision": precision,
+            "recall": recall,
+        }
+        toc = time.time()
+        logger.info("DONE (t={:0.2f}s).".format(toc - tic))
+
+    def summarize(self):
+        """
+        Compute and display summary metrics for evaluation results.
+        Note this function can *only* be applied on the default parameter setting
+        """
+
+        def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
+            p = self.params
+            iStr = " {:<18} {} @[ {}={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
+            titleStr = "Average Precision" if ap == 1 else "Average Recall"
+            typeStr = "(AP)" if ap == 1 else "(AR)"
+            measure = "IoU"
+            if self.params.iouType == "keypoints":
+                measure = "OKS"
+            elif self.params.iouType == "densepose":
+                measure = "OGPS"
+            iouStr = (
+                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
+                if iouThr is None
+                else "{:0.2f}".format(iouThr)
+            )
+
+            aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
+            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+            if ap == 1:
+                # dimension of precision: [TxRxKxAxM]
+                s = self.eval["precision"]
+                # IoU
+                if iouThr is not None:
+                    t = np.where(np.abs(iouThr - p.iouThrs) < 0.001)[0]
+                    s = s[t]
+                s = s[:, :, :, aind, mind]
+            else:
+                # dimension of recall: [TxKxAxM]
+                s = self.eval["recall"]
+                if iouThr is not None:
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                s = s[:, :, aind, mind]
+            if len(s[s > -1]) == 0:
+                mean_s = -1
+            else:
+                mean_s = np.mean(s[s > -1])
+            logger.info(iStr.format(titleStr, typeStr, measure, iouStr, areaRng, maxDets, mean_s))
+            return mean_s
+
+        def _summarizeDets():
+            stats = np.zeros((12,))
+            stats[0] = _summarize(1)
+            stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
+            stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
+            stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2])
+            stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2])
+            stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2])
+            stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
+            stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
+            stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2])
+            stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2])
+            stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2])
+            return stats
+
+        def _summarizeKps():
+            stats = np.zeros((10,))
+            stats[0] = _summarize(1, maxDets=20)
+            stats[1] = _summarize(1, maxDets=20, iouThr=0.5)
+            stats[2] = _summarize(1, maxDets=20, iouThr=0.75)
+            stats[3] = _summarize(1, maxDets=20, areaRng="medium")
+            stats[4] = _summarize(1, maxDets=20, areaRng="large")
+            stats[5] = _summarize(0, maxDets=20)
+            stats[6] = _summarize(0, maxDets=20, iouThr=0.5)
+            stats[7] = _summarize(0, maxDets=20, iouThr=0.75)
+            stats[8] = _summarize(0, maxDets=20, areaRng="medium")
+            stats[9] = _summarize(0, maxDets=20, areaRng="large")
+            return stats
+
+        def _summarizeUvs():
+            stats = np.zeros((10,))
+            stats[0] = _summarize(1, maxDets=self.params.maxDets[0])
+            stats[1] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[2] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[3] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[4] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="large")
+            stats[5] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[6] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[7] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[8] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[9] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="large")
+            return stats
+
+        def _summarizeUvsOld():
+            stats = np.zeros((18,))
+            stats[0] = _summarize(1, maxDets=self.params.maxDets[0])
+            stats[1] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[2] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.55)
+            stats[3] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.60)
+            stats[4] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.65)
+            stats[5] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.70)
+            stats[6] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[7] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.80)
+            stats[8] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.85)
+            stats[9] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.90)
+            stats[10] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.95)
+            stats[11] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[12] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="large")
+            stats[13] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[14] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[15] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[16] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[17] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="large")
+            return stats
+
+        if not self.eval:
+            raise Exception("Please run accumulate() first")
+        iouType = self.params.iouType
+        if iouType in ["segm", "bbox"]:
+            summarize = _summarizeDets
+        elif iouType in ["keypoints"]:
+            summarize = _summarizeKps
+        elif iouType in ["densepose"]:
+            summarize = _summarizeUvs
+        self.stats = summarize()
+
+    def __str__(self):
+        self.summarize()
+
+    # ================ functions for dense pose ==============================
+    def findAllClosestVerts(self, gt, U_points, V_points, Index_points):
+        #
+        I_gt = np.array(gt["dp_I"])
+        U_gt = np.array(gt["dp_U"])
+        V_gt = np.array(gt["dp_V"])
+        #
+        # print(I_gt)
+        #
+        ClosestVerts = np.ones(Index_points.shape) * -1
+        for i in np.arange(24):
+            #
+            if sum(Index_points == (i + 1)) > 0:
+                UVs = np.array(
+                    [U_points[Index_points == (i + 1)], V_points[Index_points == (i + 1)]]
+                )
+                Current_Part_UVs = self.Part_UVs[i]
+                Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i]
+                D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze()
+                ClosestVerts[Index_points == (i + 1)] = Current_Part_ClosestVertInds[
+                    np.argmin(D, axis=0)
+                ]
+        #
+        ClosestVertsGT = np.ones(Index_points.shape) * -1
+        for i in np.arange(24):
+            if sum(I_gt == (i + 1)) > 0:
+                UVs = np.array([U_gt[I_gt == (i + 1)], V_gt[I_gt == (i + 1)]])
+                Current_Part_UVs = self.Part_UVs[i]
+                Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i]
+                D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze()
+                ClosestVertsGT[I_gt == (i + 1)] = Current_Part_ClosestVertInds[np.argmin(D, axis=0)]
+        #
+        return ClosestVerts, ClosestVertsGT
+
+    def getDistances(self, cVertsGT, cVerts):
+
+        ClosestVertsTransformed = self.PDIST_transform[cVerts.astype(int) - 1]
+        ClosestVertsGTTransformed = self.PDIST_transform[cVertsGT.astype(int) - 1]
+        #
+        ClosestVertsTransformed[cVerts < 0] = 0
+        ClosestVertsGTTransformed[cVertsGT < 0] = 0
+        #
+        cVertsGT = ClosestVertsGTTransformed
+        cVerts = ClosestVertsTransformed
+        #
+        n = 27554
+        dists = []
+        for d in range(len(cVertsGT)):
+            if cVertsGT[d] > 0:
+                if cVerts[d] > 0:
+                    i = cVertsGT[d] - 1
+                    j = cVerts[d] - 1
+                    if j == i:
+                        dists.append(0)
+                    elif j > i:
+                        ccc = i
+                        i = j
+                        j = ccc
+                        i = n - i - 1
+                        j = n - j - 1
+                        k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1
+                        k = (n * n - n) / 2 - k - 1
+                        dists.append(self.Pdist_matrix[int(k)][0])
+                    else:
+                        i = n - i - 1
+                        j = n - j - 1
+                        k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1
+                        k = (n * n - n) / 2 - k - 1
+                        dists.append(self.Pdist_matrix[int(k)][0])
+                else:
+                    dists.append(np.inf)
+        return np.atleast_1d(np.array(dists).squeeze())
+
+
+class Params:
+    """
+    Params for coco evaluation api
+    """
+
+    def setDetParams(self):
+        self.imgIds = []
+        self.catIds = []
+        # np.arange causes trouble.  the data point on arange is slightly larger than the true value
+        self.iouThrs = np.linspace(0.5, 0.95, np.round((0.95 - 0.5) / 0.05) + 1, endpoint=True)
+        self.recThrs = np.linspace(0.0, 1.00, np.round((1.00 - 0.0) / 0.01) + 1, endpoint=True)
+        self.maxDets = [1, 10, 100]
+        self.areaRng = [
+            [0 ** 2, 1e5 ** 2],
+            [0 ** 2, 32 ** 2],
+            [32 ** 2, 96 ** 2],
+            [96 ** 2, 1e5 ** 2],
+        ]
+        self.areaRngLbl = ["all", "small", "medium", "large"]
+        self.useCats = 1
+
+    def setKpParams(self):
+        self.imgIds = []
+        self.catIds = []
+        # np.arange causes trouble.  the data point on arange is slightly larger than the true value
+        self.iouThrs = np.linspace(0.5, 0.95, np.round((0.95 - 0.5) / 0.05) + 1, endpoint=True)
+        self.recThrs = np.linspace(0.0, 1.00, np.round((1.00 - 0.0) / 0.01) + 1, endpoint=True)
+        self.maxDets = [20]
+        self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
+        self.areaRngLbl = ["all", "medium", "large"]
+        self.useCats = 1
+
+    def setUvParams(self):
+        self.imgIds = []
+        self.catIds = []
+        self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
+        self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
+        self.maxDets = [20]
+        self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
+        self.areaRngLbl = ["all", "medium", "large"]
+        self.useCats = 1
+
+    def __init__(self, iouType="segm"):
+        if iouType == "segm" or iouType == "bbox":
+            self.setDetParams()
+        elif iouType == "keypoints":
+            self.setKpParams()
+        elif iouType == "densepose":
+            self.setUvParams()
+        else:
+            raise Exception("iouType not supported")
+        self.iouType = iouType
+        # useSegm is deprecated
+        self.useSegm = None
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/densepose_head.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/densepose_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..363970681db36a41d5bc5b1960960a2a8bf23855
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/densepose_head.py
@@ -0,0 +1,1216 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+from dataclasses import dataclass
+from enum import Enum
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.layers import Conv2d, ConvTranspose2d, interpolate
+from detectron2.structures.boxes import matched_boxlist_iou
+from detectron2.utils.registry import Registry
+
+from .data.structures import DensePoseOutput
+
+ROI_DENSEPOSE_HEAD_REGISTRY = Registry("ROI_DENSEPOSE_HEAD")
+
+
+class DensePoseUVConfidenceType(Enum):
+    """
+    Statistical model type for confidence learning, possible values:
+     - "iid_iso": statistically independent identically distributed residuals
+         with anisotropic covariance
+     - "indep_aniso": statistically independent residuals with anisotropic
+         covariances
+    For details, see:
+    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+    """
+
+    # fmt: off
+    IID_ISO     = "iid_iso"
+    INDEP_ANISO = "indep_aniso"
+    # fmt: on
+
+
+@dataclass
+class DensePoseUVConfidenceConfig:
+    """
+    Configuration options for confidence on UV data
+    """
+
+    enabled: bool = False
+    # lower bound on UV confidences
+    epsilon: float = 0.01
+    type: DensePoseUVConfidenceType = DensePoseUVConfidenceType.IID_ISO
+
+
+@dataclass
+class DensePoseConfidenceModelConfig:
+    """
+    Configuration options for confidence models
+    """
+
+    # confidence for U and V values
+    uv_confidence: DensePoseUVConfidenceConfig
+
+    @staticmethod
+    def from_cfg(cfg: CfgNode) -> "DensePoseConfidenceModelConfig":
+        return DensePoseConfidenceModelConfig(
+            uv_confidence=DensePoseUVConfidenceConfig(
+                enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.ENABLED,
+                epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON,
+                type=DensePoseUVConfidenceType(cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE),
+            )
+        )
+
+
+def initialize_module_params(module):
+    for name, param in module.named_parameters():
+        if "bias" in name:
+            nn.init.constant_(param, 0)
+        elif "weight" in name:
+            nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
+
+
+@ROI_DENSEPOSE_HEAD_REGISTRY.register()
+class DensePoseDeepLabHead(nn.Module):
+    def __init__(self, cfg, input_channels):
+        super(DensePoseDeepLabHead, self).__init__()
+        # fmt: off
+        hidden_dim           = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
+        kernel_size          = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
+        norm                 = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM
+        self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
+        self.use_nonlocal    = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON
+        # fmt: on
+        pad_size = kernel_size // 2
+        n_channels = input_channels
+
+        self.ASPP = ASPP(input_channels, [6, 12, 56], n_channels)  # 6, 12, 56
+        self.add_module("ASPP", self.ASPP)
+
+        if self.use_nonlocal:
+            self.NLBlock = NONLocalBlock2D(input_channels, bn_layer=True)
+            self.add_module("NLBlock", self.NLBlock)
+        # weight_init.c2_msra_fill(self.ASPP)
+
+        for i in range(self.n_stacked_convs):
+            norm_module = nn.GroupNorm(32, hidden_dim) if norm == "GN" else None
+            layer = Conv2d(
+                n_channels,
+                hidden_dim,
+                kernel_size,
+                stride=1,
+                padding=pad_size,
+                bias=not norm,
+                norm=norm_module,
+            )
+            weight_init.c2_msra_fill(layer)
+            n_channels = hidden_dim
+            layer_name = self._get_layer_name(i)
+            self.add_module(layer_name, layer)
+        self.n_out_channels = hidden_dim
+        # initialize_module_params(self)
+
+    def forward(self, features):
+        x0 = features
+        x = self.ASPP(x0)
+        if self.use_nonlocal:
+            x = self.NLBlock(x)
+        output = x
+        for i in range(self.n_stacked_convs):
+            layer_name = self._get_layer_name(i)
+            x = getattr(self, layer_name)(x)
+            x = F.relu(x)
+            output = x
+        return output
+
+    def _get_layer_name(self, i):
+        layer_name = "body_conv_fcn{}".format(i + 1)
+        return layer_name
+
+
+# Copied from
+# https://github.com/pytorch/vision/blob/master/torchvision/models/segmentation/deeplabv3.py
+# See https://arxiv.org/pdf/1706.05587.pdf for details
+class ASPPConv(nn.Sequential):
+    def __init__(self, in_channels, out_channels, dilation):
+        modules = [
+            nn.Conv2d(
+                in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False
+            ),
+            nn.GroupNorm(32, out_channels),
+            nn.ReLU(),
+        ]
+        super(ASPPConv, self).__init__(*modules)
+
+
+class ASPPPooling(nn.Sequential):
+    def __init__(self, in_channels, out_channels):
+        super(ASPPPooling, self).__init__(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(in_channels, out_channels, 1, bias=False),
+            nn.GroupNorm(32, out_channels),
+            nn.ReLU(),
+        )
+
+    def forward(self, x):
+        size = x.shape[-2:]
+        x = super(ASPPPooling, self).forward(x)
+        return F.interpolate(x, size=size, mode="bilinear", align_corners=False)
+
+
+class ASPP(nn.Module):
+    def __init__(self, in_channels, atrous_rates, out_channels):
+        super(ASPP, self).__init__()
+        modules = []
+        modules.append(
+            nn.Sequential(
+                nn.Conv2d(in_channels, out_channels, 1, bias=False),
+                nn.GroupNorm(32, out_channels),
+                nn.ReLU(),
+            )
+        )
+
+        rate1, rate2, rate3 = tuple(atrous_rates)
+        modules.append(ASPPConv(in_channels, out_channels, rate1))
+        modules.append(ASPPConv(in_channels, out_channels, rate2))
+        modules.append(ASPPConv(in_channels, out_channels, rate3))
+        modules.append(ASPPPooling(in_channels, out_channels))
+
+        self.convs = nn.ModuleList(modules)
+
+        self.project = nn.Sequential(
+            nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
+            # nn.BatchNorm2d(out_channels),
+            nn.ReLU()
+            # nn.Dropout(0.5)
+        )
+
+    def forward(self, x):
+        res = []
+        for conv in self.convs:
+            res.append(conv(x))
+        res = torch.cat(res, dim=1)
+        return self.project(res)
+
+
+# copied from
+# https://github.com/AlexHex7/Non-local_pytorch/blob/master/lib/non_local_embedded_gaussian.py
+# See https://arxiv.org/abs/1711.07971 for details
+class _NonLocalBlockND(nn.Module):
+    def __init__(
+        self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True
+    ):
+        super(_NonLocalBlockND, self).__init__()
+
+        assert dimension in [1, 2, 3]
+
+        self.dimension = dimension
+        self.sub_sample = sub_sample
+
+        self.in_channels = in_channels
+        self.inter_channels = inter_channels
+
+        if self.inter_channels is None:
+            self.inter_channels = in_channels // 2
+            if self.inter_channels == 0:
+                self.inter_channels = 1
+
+        if dimension == 3:
+            conv_nd = nn.Conv3d
+            max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
+            bn = nn.GroupNorm  # (32, hidden_dim) #nn.BatchNorm3d
+        elif dimension == 2:
+            conv_nd = nn.Conv2d
+            max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
+            bn = nn.GroupNorm  # (32, hidden_dim)nn.BatchNorm2d
+        else:
+            conv_nd = nn.Conv1d
+            max_pool_layer = nn.MaxPool1d(kernel_size=2)
+            bn = nn.GroupNorm  # (32, hidden_dim)nn.BatchNorm1d
+
+        self.g = conv_nd(
+            in_channels=self.in_channels,
+            out_channels=self.inter_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+
+        if bn_layer:
+            self.W = nn.Sequential(
+                conv_nd(
+                    in_channels=self.inter_channels,
+                    out_channels=self.in_channels,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                ),
+                bn(32, self.in_channels),
+            )
+            nn.init.constant_(self.W[1].weight, 0)
+            nn.init.constant_(self.W[1].bias, 0)
+        else:
+            self.W = conv_nd(
+                in_channels=self.inter_channels,
+                out_channels=self.in_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+            nn.init.constant_(self.W.weight, 0)
+            nn.init.constant_(self.W.bias, 0)
+
+        self.theta = conv_nd(
+            in_channels=self.in_channels,
+            out_channels=self.inter_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.phi = conv_nd(
+            in_channels=self.in_channels,
+            out_channels=self.inter_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+
+        if sub_sample:
+            self.g = nn.Sequential(self.g, max_pool_layer)
+            self.phi = nn.Sequential(self.phi, max_pool_layer)
+
+    def forward(self, x):
+        """
+        :param x: (b, c, t, h, w)
+        :return:
+        """
+
+        batch_size = x.size(0)
+
+        g_x = self.g(x).view(batch_size, self.inter_channels, -1)
+        g_x = g_x.permute(0, 2, 1)
+
+        theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
+        theta_x = theta_x.permute(0, 2, 1)
+        phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
+        f = torch.matmul(theta_x, phi_x)
+        f_div_C = F.softmax(f, dim=-1)
+
+        y = torch.matmul(f_div_C, g_x)
+        y = y.permute(0, 2, 1).contiguous()
+        y = y.view(batch_size, self.inter_channels, *x.size()[2:])
+        W_y = self.W(y)
+        z = W_y + x
+
+        return z
+
+
+class NONLocalBlock2D(_NonLocalBlockND):
+    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
+        super(NONLocalBlock2D, self).__init__(
+            in_channels,
+            inter_channels=inter_channels,
+            dimension=2,
+            sub_sample=sub_sample,
+            bn_layer=bn_layer,
+        )
+
+
+@ROI_DENSEPOSE_HEAD_REGISTRY.register()
+class DensePoseV1ConvXHead(nn.Module):
+    def __init__(self, cfg, input_channels):
+        super(DensePoseV1ConvXHead, self).__init__()
+        # fmt: off
+        hidden_dim           = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
+        kernel_size          = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
+        self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
+        # fmt: on
+        pad_size = kernel_size // 2
+        n_channels = input_channels
+        for i in range(self.n_stacked_convs):
+            layer = Conv2d(n_channels, hidden_dim, kernel_size, stride=1, padding=pad_size)
+            layer_name = self._get_layer_name(i)
+            self.add_module(layer_name, layer)
+            n_channels = hidden_dim
+        self.n_out_channels = n_channels
+        initialize_module_params(self)
+
+    def forward(self, features):
+        x = features
+        output = x
+        for i in range(self.n_stacked_convs):
+            layer_name = self._get_layer_name(i)
+            x = getattr(self, layer_name)(x)
+            x = F.relu(x)
+            output = x
+        return output
+
+    def _get_layer_name(self, i):
+        layer_name = "body_conv_fcn{}".format(i + 1)
+        return layer_name
+
+
+class DensePosePredictor(nn.Module):
+    def __init__(self, cfg, input_channels):
+
+        super(DensePosePredictor, self).__init__()
+        dim_in = input_channels
+        n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+        dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
+        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+        self.ann_index_lowres = ConvTranspose2d(
+            dim_in, n_segm_chan, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.index_uv_lowres = ConvTranspose2d(
+            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.u_lowres = ConvTranspose2d(
+            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.v_lowres = ConvTranspose2d(
+            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.scale_factor = cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE
+        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
+        self._initialize_confidence_estimation_layers(cfg, self.confidence_model_cfg, dim_in)
+        initialize_module_params(self)
+
+    def forward(self, head_outputs):
+        ann_index_lowres = self.ann_index_lowres(head_outputs)
+        index_uv_lowres = self.index_uv_lowres(head_outputs)
+        u_lowres = self.u_lowres(head_outputs)
+        v_lowres = self.v_lowres(head_outputs)
+
+        def interp2d(input):
+            return interpolate(
+                input, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
+            )
+
+        ann_index = interp2d(ann_index_lowres)
+        index_uv = interp2d(index_uv_lowres)
+        u = interp2d(u_lowres)
+        v = interp2d(v_lowres)
+        (
+            (sigma_1, sigma_2, kappa_u, kappa_v),
+            (sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres),
+            (ann_index, index_uv),
+        ) = self._forward_confidence_estimation_layers(
+            self.confidence_model_cfg, head_outputs, interp2d, ann_index, index_uv
+        )
+        return (
+            (ann_index, index_uv, u, v),
+            (ann_index_lowres, index_uv_lowres, u_lowres, v_lowres),
+            (sigma_1, sigma_2, kappa_u, kappa_v),
+            (sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres),
+        )
+
+    def _initialize_confidence_estimation_layers(
+        self, cfg: CfgNode, confidence_model_cfg: DensePoseConfidenceModelConfig, dim_in: int
+    ):
+        dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
+        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+        if confidence_model_cfg.uv_confidence.enabled:
+            if confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
+                self.sigma_2_lowres = ConvTranspose2d(
+                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+                )
+            elif confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
+                self.sigma_2_lowres = ConvTranspose2d(
+                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+                )
+                self.kappa_u_lowres = ConvTranspose2d(
+                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+                )
+                self.kappa_v_lowres = ConvTranspose2d(
+                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+                )
+            else:
+                raise ValueError(
+                    f"Unknown confidence model type: {confidence_model_cfg.confidence_model_type}"
+                )
+
+    def _forward_confidence_estimation_layers(
+        self, confidence_model_cfg, head_outputs, interp2d, ann_index, index_uv
+    ):
+        sigma_1, sigma_2, kappa_u, kappa_v = None, None, None, None
+        sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres = None, None, None, None
+        if confidence_model_cfg.uv_confidence.enabled:
+            if confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
+                sigma_2_lowres = self.sigma_2_lowres(head_outputs)
+                sigma_2 = interp2d(sigma_2_lowres)
+            elif confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
+                sigma_2_lowres = self.sigma_2_lowres(head_outputs)
+                kappa_u_lowres = self.kappa_u_lowres(head_outputs)
+                kappa_v_lowres = self.kappa_v_lowres(head_outputs)
+                sigma_2 = interp2d(sigma_2_lowres)
+                kappa_u = interp2d(kappa_u_lowres)
+                kappa_v = interp2d(kappa_v_lowres)
+            else:
+                raise ValueError(
+                    f"Unknown confidence model type: {confidence_model_cfg.confidence_model_type}"
+                )
+        return (
+            (sigma_1, sigma_2, kappa_u, kappa_v),
+            (sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres),
+            (ann_index, index_uv),
+        )
+
+
+class DensePoseDataFilter(object):
+    def __init__(self, cfg):
+        self.iou_threshold = cfg.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD
+
+    @torch.no_grad()
+    def __call__(self, proposals_with_targets):
+        """
+        Filters proposals with targets to keep only the ones relevant for
+        DensePose training
+        proposals: list(Instances), each element of the list corresponds to
+            various instances (proposals, GT for boxes and densepose) for one
+            image
+        """
+        proposals_filtered = []
+        for proposals_per_image in proposals_with_targets:
+            if not hasattr(proposals_per_image, "gt_densepose"):
+                continue
+            assert hasattr(proposals_per_image, "gt_boxes")
+            assert hasattr(proposals_per_image, "proposal_boxes")
+            gt_boxes = proposals_per_image.gt_boxes
+            est_boxes = proposals_per_image.proposal_boxes
+            # apply match threshold for densepose head
+            iou = matched_boxlist_iou(gt_boxes, est_boxes)
+            iou_select = iou > self.iou_threshold
+            proposals_per_image = proposals_per_image[iou_select]
+            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes)
+            # filter out any target without densepose annotation
+            gt_densepose = proposals_per_image.gt_densepose
+            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.gt_densepose)
+            selected_indices = [
+                i for i, dp_target in enumerate(gt_densepose) if dp_target is not None
+            ]
+            if len(selected_indices) != len(gt_densepose):
+                proposals_per_image = proposals_per_image[selected_indices]
+            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes)
+            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.gt_densepose)
+            proposals_filtered.append(proposals_per_image)
+        return proposals_filtered
+
+
+def build_densepose_head(cfg, input_channels):
+    head_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.NAME
+    return ROI_DENSEPOSE_HEAD_REGISTRY.get(head_name)(cfg, input_channels)
+
+
+def build_densepose_predictor(cfg, input_channels):
+    predictor = DensePosePredictor(cfg, input_channels)
+    return predictor
+
+
+def build_densepose_data_filter(cfg):
+    dp_filter = DensePoseDataFilter(cfg)
+    return dp_filter
+
+
+def densepose_inference(densepose_outputs, densepose_confidences, detections):
+    """
+    Infer dense pose estimate based on outputs from the DensePose head
+    and detections. The estimate for each detection instance is stored in its
+    "pred_densepose" attribute.
+
+    Args:
+        densepose_outputs (tuple(`torch.Tensor`)): iterable containing 4 elements:
+            - s (:obj: `torch.Tensor`): coarse segmentation tensor of size (N, A, H, W),
+            - i (:obj: `torch.Tensor`): fine segmentation tensor of size (N, C, H, W),
+            - u (:obj: `torch.Tensor`): U coordinates for each class of size (N, C, H, W),
+            - v (:obj: `torch.Tensor`): V coordinates for each class of size (N, C, H, W),
+            where N is the total number of detections in a batch,
+                  A is the number of coarse segmentations labels
+                      (e.g. 15 for coarse body parts + background),
+                  C is the number of fine segmentation labels
+                      (e.g. 25 for fine body parts + background),
+                  W is the resolution along the X axis
+                  H is the resolution along the Y axis
+        densepose_confidences (tuple(`torch.Tensor`)): iterable containing 4 elements:
+            - sigma_1 (:obj: `torch.Tensor`): global confidences for UV coordinates
+                of size (N, C, H, W)
+            - sigma_2 (:obj: `torch.Tensor`): individual confidences for UV coordinates
+                of size (N, C, H, W)
+            - kappa_u (:obj: `torch.Tensor`): first component of confidence direction
+                vector of size (N, C, H, W)
+            - kappa_v (:obj: `torch.Tensor`): second component of confidence direction
+                vector of size (N, C, H, W)
+        detections (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. Instances are modified by this method: "pred_densepose" attribute
+            is added to each instance, the attribute contains the corresponding
+            DensePoseOutput object.
+    """
+    # DensePose outputs: segmentation, body part indices, U, V
+    s, index_uv, u, v = densepose_outputs
+    sigma_1, sigma_2, kappa_u, kappa_v = densepose_confidences
+    k = 0
+    for detection in detections:
+        n_i = len(detection)
+        s_i = s[k : k + n_i]
+        index_uv_i = index_uv[k : k + n_i]
+        u_i = u[k : k + n_i]
+        v_i = v[k : k + n_i]
+        _local_vars = locals()
+        confidences = {
+            name: _local_vars[name]
+            for name in ("sigma_1", "sigma_2", "kappa_u", "kappa_v")
+            if _local_vars.get(name) is not None
+        }
+        densepose_output_i = DensePoseOutput(s_i, index_uv_i, u_i, v_i, confidences)
+        detection.pred_densepose = densepose_output_i
+        k += n_i
+
+
+def _linear_interpolation_utilities(v_norm, v0_src, size_src, v0_dst, size_dst, size_z):
+    """
+    Computes utility values for linear interpolation at points v.
+    The points are given as normalized offsets in the source interval
+    (v0_src, v0_src + size_src), more precisely:
+        v = v0_src + v_norm * size_src / 256.0
+    The computed utilities include lower points v_lo, upper points v_hi,
+    interpolation weights v_w and flags j_valid indicating whether the
+    points falls into the destination interval (v0_dst, v0_dst + size_dst).
+
+    Args:
+        v_norm (:obj: `torch.Tensor`): tensor of size N containing
+            normalized point offsets
+        v0_src (:obj: `torch.Tensor`): tensor of size N containing
+            left bounds of source intervals for normalized points
+        size_src (:obj: `torch.Tensor`): tensor of size N containing
+            source interval sizes for normalized points
+        v0_dst (:obj: `torch.Tensor`): tensor of size N containing
+            left bounds of destination intervals
+        size_dst (:obj: `torch.Tensor`): tensor of size N containing
+            destination interval sizes
+        size_z (int): interval size for data to be interpolated
+
+    Returns:
+        v_lo (:obj: `torch.Tensor`): int tensor of size N containing
+            indices of lower values used for interpolation, all values are
+            integers from [0, size_z - 1]
+        v_hi (:obj: `torch.Tensor`): int tensor of size N containing
+            indices of upper values used for interpolation, all values are
+            integers from [0, size_z - 1]
+        v_w (:obj: `torch.Tensor`): float tensor of size N containing
+            interpolation weights
+        j_valid (:obj: `torch.Tensor`): uint8 tensor of size N containing
+            0 for points outside the estimation interval
+            (v0_est, v0_est + size_est) and 1 otherwise
+    """
+    v = v0_src + v_norm * size_src / 256.0
+    j_valid = (v - v0_dst >= 0) * (v - v0_dst < size_dst)
+    v_grid = (v - v0_dst) * size_z / size_dst
+    v_lo = v_grid.floor().long().clamp(min=0, max=size_z - 1)
+    v_hi = (v_lo + 1).clamp(max=size_z - 1)
+    v_grid = torch.min(v_hi.float(), v_grid)
+    v_w = v_grid - v_lo.float()
+    return v_lo, v_hi, v_w, j_valid
+
+
+def _grid_sampling_utilities(
+    zh, zw, bbox_xywh_est, bbox_xywh_gt, index_gt, x_norm, y_norm, index_bbox
+):
+    """
+    Prepare tensors used in grid sampling.
+
+    Args:
+        z_est (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with estimated
+            values of Z to be extracted for the points X, Y and channel
+            indices I
+        bbox_xywh_est (:obj: `torch.Tensor`): tensor of size (N, 4) containing
+            estimated bounding boxes in format XYWH
+        bbox_xywh_gt (:obj: `torch.Tensor`): tensor of size (N, 4) containing
+            matched ground truth bounding boxes in format XYWH
+        index_gt (:obj: `torch.Tensor`): tensor of size K with point labels for
+            ground truth points
+        x_norm (:obj: `torch.Tensor`): tensor of size K with X normalized
+            coordinates of ground truth points. Image X coordinates can be
+            obtained as X = Xbbox + x_norm * Wbbox / 255
+        y_norm (:obj: `torch.Tensor`): tensor of size K with Y normalized
+            coordinates of ground truth points. Image Y coordinates can be
+            obtained as Y = Ybbox + y_norm * Hbbox / 255
+        index_bbox (:obj: `torch.Tensor`): tensor of size K with bounding box
+            indices for each ground truth point. The values are thus in
+            [0, N-1]
+
+    Returns:
+        j_valid (:obj: `torch.Tensor`): uint8 tensor of size M containing
+            0 for points to be discarded and 1 for points to be selected
+        y_lo (:obj: `torch.Tensor`): int tensor of indices of upper values
+            in z_est for each point
+        y_hi (:obj: `torch.Tensor`): int tensor of indices of lower values
+            in z_est for each point
+        x_lo (:obj: `torch.Tensor`): int tensor of indices of left values
+            in z_est for each point
+        x_hi (:obj: `torch.Tensor`): int tensor of indices of right values
+            in z_est for each point
+        w_ylo_xlo (:obj: `torch.Tensor`): float tensor of size M;
+            contains upper-left value weight for each point
+        w_ylo_xhi (:obj: `torch.Tensor`): float tensor of size M;
+            contains upper-right value weight for each point
+        w_yhi_xlo (:obj: `torch.Tensor`): float tensor of size M;
+            contains lower-left value weight for each point
+        w_yhi_xhi (:obj: `torch.Tensor`): float tensor of size M;
+            contains lower-right value weight for each point
+    """
+
+    x0_gt, y0_gt, w_gt, h_gt = bbox_xywh_gt[index_bbox].unbind(dim=1)
+    x0_est, y0_est, w_est, h_est = bbox_xywh_est[index_bbox].unbind(dim=1)
+    x_lo, x_hi, x_w, jx_valid = _linear_interpolation_utilities(
+        x_norm, x0_gt, w_gt, x0_est, w_est, zw
+    )
+    y_lo, y_hi, y_w, jy_valid = _linear_interpolation_utilities(
+        y_norm, y0_gt, h_gt, y0_est, h_est, zh
+    )
+    j_valid = jx_valid * jy_valid
+
+    w_ylo_xlo = (1.0 - x_w) * (1.0 - y_w)
+    w_ylo_xhi = x_w * (1.0 - y_w)
+    w_yhi_xlo = (1.0 - x_w) * y_w
+    w_yhi_xhi = x_w * y_w
+
+    return j_valid, y_lo, y_hi, x_lo, x_hi, w_ylo_xlo, w_ylo_xhi, w_yhi_xlo, w_yhi_xhi
+
+
+def _extract_at_points_packed(
+    z_est,
+    index_bbox_valid,
+    slice_index_uv,
+    y_lo,
+    y_hi,
+    x_lo,
+    x_hi,
+    w_ylo_xlo,
+    w_ylo_xhi,
+    w_yhi_xlo,
+    w_yhi_xhi,
+):
+    """
+    Extract ground truth values z_gt for valid point indices and estimated
+    values z_est using bilinear interpolation over top-left (y_lo, x_lo),
+    top-right (y_lo, x_hi), bottom-left (y_hi, x_lo) and bottom-right
+    (y_hi, x_hi) values in z_est with corresponding weights:
+    w_ylo_xlo, w_ylo_xhi, w_yhi_xlo and w_yhi_xhi.
+    Use slice_index_uv to slice dim=1 in z_est
+    """
+    z_est_sampled = (
+        z_est[index_bbox_valid, slice_index_uv, y_lo, x_lo] * w_ylo_xlo
+        + z_est[index_bbox_valid, slice_index_uv, y_lo, x_hi] * w_ylo_xhi
+        + z_est[index_bbox_valid, slice_index_uv, y_hi, x_lo] * w_yhi_xlo
+        + z_est[index_bbox_valid, slice_index_uv, y_hi, x_hi] * w_yhi_xhi
+    )
+    return z_est_sampled
+
+
+def _resample_data(
+    z, bbox_xywh_src, bbox_xywh_dst, wout, hout, mode="nearest", padding_mode="zeros"
+):
+    """
+    Args:
+        z (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with data to be
+            resampled
+        bbox_xywh_src (:obj: `torch.Tensor`): tensor of size (N,4) containing
+            source bounding boxes in format XYWH
+        bbox_xywh_dst (:obj: `torch.Tensor`): tensor of size (N,4) containing
+            destination bounding boxes in format XYWH
+    Return:
+        zresampled (:obj: `torch.Tensor`): tensor of size (N, C, Hout, Wout)
+            with resampled values of z, where D is the discretization size
+    """
+    n = bbox_xywh_src.size(0)
+    assert n == bbox_xywh_dst.size(0), (
+        "The number of "
+        "source ROIs for resampling ({}) should be equal to the number "
+        "of destination ROIs ({})".format(bbox_xywh_src.size(0), bbox_xywh_dst.size(0))
+    )
+    x0src, y0src, wsrc, hsrc = bbox_xywh_src.unbind(dim=1)
+    x0dst, y0dst, wdst, hdst = bbox_xywh_dst.unbind(dim=1)
+    x0dst_norm = 2 * (x0dst - x0src) / wsrc - 1
+    y0dst_norm = 2 * (y0dst - y0src) / hsrc - 1
+    x1dst_norm = 2 * (x0dst + wdst - x0src) / wsrc - 1
+    y1dst_norm = 2 * (y0dst + hdst - y0src) / hsrc - 1
+    grid_w = torch.arange(wout, device=z.device, dtype=torch.float) / wout
+    grid_h = torch.arange(hout, device=z.device, dtype=torch.float) / hout
+    grid_w_expanded = grid_w[None, None, :].expand(n, hout, wout)
+    grid_h_expanded = grid_h[None, :, None].expand(n, hout, wout)
+    dx_expanded = (x1dst_norm - x0dst_norm)[:, None, None].expand(n, hout, wout)
+    dy_expanded = (y1dst_norm - y0dst_norm)[:, None, None].expand(n, hout, wout)
+    x0_expanded = x0dst_norm[:, None, None].expand(n, hout, wout)
+    y0_expanded = y0dst_norm[:, None, None].expand(n, hout, wout)
+    grid_x = grid_w_expanded * dx_expanded + x0_expanded
+    grid_y = grid_h_expanded * dy_expanded + y0_expanded
+    grid = torch.stack((grid_x, grid_y), dim=3)
+    # resample Z from (N, C, H, W) into (N, C, Hout, Wout)
+    zresampled = F.grid_sample(z, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
+    return zresampled
+
+
+def _extract_single_tensors_from_matches_one_image(
+    proposals_targets, bbox_with_dp_offset, bbox_global_offset
+):
+    i_gt_all = []
+    x_norm_all = []
+    y_norm_all = []
+    u_gt_all = []
+    v_gt_all = []
+    s_gt_all = []
+    bbox_xywh_gt_all = []
+    bbox_xywh_est_all = []
+    # Ibbox_all == k should be true for all data that corresponds
+    # to bbox_xywh_gt[k] and bbox_xywh_est[k]
+    # index k here is global wrt images
+    i_bbox_all = []
+    # at offset k (k is global) contains index of bounding box data
+    # within densepose output tensor
+    i_with_dp = []
+
+    boxes_xywh_est = proposals_targets.proposal_boxes.clone()
+    boxes_xywh_gt = proposals_targets.gt_boxes.clone()
+    n_i = len(boxes_xywh_est)
+    assert n_i == len(boxes_xywh_gt)
+
+    if n_i:
+        boxes_xywh_est.tensor[:, 2] -= boxes_xywh_est.tensor[:, 0]
+        boxes_xywh_est.tensor[:, 3] -= boxes_xywh_est.tensor[:, 1]
+        boxes_xywh_gt.tensor[:, 2] -= boxes_xywh_gt.tensor[:, 0]
+        boxes_xywh_gt.tensor[:, 3] -= boxes_xywh_gt.tensor[:, 1]
+        if hasattr(proposals_targets, "gt_densepose"):
+            densepose_gt = proposals_targets.gt_densepose
+            for k, box_xywh_est, box_xywh_gt, dp_gt in zip(
+                range(n_i), boxes_xywh_est.tensor, boxes_xywh_gt.tensor, densepose_gt
+            ):
+                if (dp_gt is not None) and (len(dp_gt.x) > 0):
+                    i_gt_all.append(dp_gt.i)
+                    x_norm_all.append(dp_gt.x)
+                    y_norm_all.append(dp_gt.y)
+                    u_gt_all.append(dp_gt.u)
+                    v_gt_all.append(dp_gt.v)
+                    s_gt_all.append(dp_gt.segm.unsqueeze(0))
+                    bbox_xywh_gt_all.append(box_xywh_gt.view(-1, 4))
+                    bbox_xywh_est_all.append(box_xywh_est.view(-1, 4))
+                    i_bbox_k = torch.full_like(dp_gt.i, bbox_with_dp_offset + len(i_with_dp))
+                    i_bbox_all.append(i_bbox_k)
+                    i_with_dp.append(bbox_global_offset + k)
+    return (
+        i_gt_all,
+        x_norm_all,
+        y_norm_all,
+        u_gt_all,
+        v_gt_all,
+        s_gt_all,
+        bbox_xywh_gt_all,
+        bbox_xywh_est_all,
+        i_bbox_all,
+        i_with_dp,
+    )
+
+
+def _extract_single_tensors_from_matches(proposals_with_targets):
+    i_img = []
+    i_gt_all = []
+    x_norm_all = []
+    y_norm_all = []
+    u_gt_all = []
+    v_gt_all = []
+    s_gt_all = []
+    bbox_xywh_gt_all = []
+    bbox_xywh_est_all = []
+    i_bbox_all = []
+    i_with_dp_all = []
+    n = 0
+    for i, proposals_targets_per_image in enumerate(proposals_with_targets):
+        n_i = proposals_targets_per_image.proposal_boxes.tensor.size(0)
+        if not n_i:
+            continue
+        (
+            i_gt_img,
+            x_norm_img,
+            y_norm_img,
+            u_gt_img,
+            v_gt_img,
+            s_gt_img,
+            bbox_xywh_gt_img,
+            bbox_xywh_est_img,
+            i_bbox_img,
+            i_with_dp_img,
+        ) = _extract_single_tensors_from_matches_one_image(  # noqa
+            proposals_targets_per_image, len(i_with_dp_all), n
+        )
+        i_gt_all.extend(i_gt_img)
+        x_norm_all.extend(x_norm_img)
+        y_norm_all.extend(y_norm_img)
+        u_gt_all.extend(u_gt_img)
+        v_gt_all.extend(v_gt_img)
+        s_gt_all.extend(s_gt_img)
+        bbox_xywh_gt_all.extend(bbox_xywh_gt_img)
+        bbox_xywh_est_all.extend(bbox_xywh_est_img)
+        i_bbox_all.extend(i_bbox_img)
+        i_with_dp_all.extend(i_with_dp_img)
+        i_img.extend([i] * len(i_with_dp_img))
+        n += n_i
+    # concatenate all data into a single tensor
+    if (n > 0) and (len(i_with_dp_all) > 0):
+        i_gt = torch.cat(i_gt_all, 0).long()
+        x_norm = torch.cat(x_norm_all, 0)
+        y_norm = torch.cat(y_norm_all, 0)
+        u_gt = torch.cat(u_gt_all, 0)
+        v_gt = torch.cat(v_gt_all, 0)
+        s_gt = torch.cat(s_gt_all, 0)
+        bbox_xywh_gt = torch.cat(bbox_xywh_gt_all, 0)
+        bbox_xywh_est = torch.cat(bbox_xywh_est_all, 0)
+        i_bbox = torch.cat(i_bbox_all, 0).long()
+    else:
+        i_gt = None
+        x_norm = None
+        y_norm = None
+        u_gt = None
+        v_gt = None
+        s_gt = None
+        bbox_xywh_gt = None
+        bbox_xywh_est = None
+        i_bbox = None
+    return (
+        i_img,
+        i_with_dp_all,
+        bbox_xywh_est,
+        bbox_xywh_gt,
+        i_gt,
+        x_norm,
+        y_norm,
+        u_gt,
+        v_gt,
+        s_gt,
+        i_bbox,
+    )
+
+
+class IIDIsotropicGaussianUVLoss(nn.Module):
+    """
+    Loss for the case of iid residuals with isotropic covariance:
+    $Sigma_i = sigma_i^2 I$
+    The loss (negative log likelihood) is then:
+    $1/2 sum_{i=1}^n (log(2 pi) + 2 log sigma_i^2 + ||delta_i||^2 / sigma_i^2)$,
+    where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates
+    difference between estimated and ground truth UV values
+    For details, see:
+    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+    """
+
+    def __init__(self, sigma_lower_bound: float):
+        super(IIDIsotropicGaussianUVLoss, self).__init__()
+        self.sigma_lower_bound = sigma_lower_bound
+        self.log2pi = math.log(2 * math.pi)
+
+    def forward(
+        self,
+        u: torch.Tensor,
+        v: torch.Tensor,
+        sigma_u: torch.Tensor,
+        target_u: torch.Tensor,
+        target_v: torch.Tensor,
+    ):
+        # compute $\sigma_i^2$
+        # use sigma_lower_bound to avoid degenerate solution for variance
+        # (sigma -> 0)
+        sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
+        # compute \|delta_i\|^2
+        delta_t_delta = (u - target_u) ** 2 + (v - target_v) ** 2
+        # the total loss from the formula above:
+        loss = 0.5 * (self.log2pi + 2 * torch.log(sigma2) + delta_t_delta / sigma2)
+        return loss.sum()
+
+
+class IndepAnisotropicGaussianUVLoss(nn.Module):
+    """
+    Loss for the case of independent residuals with anisotropic covariances:
+    $Sigma_i = sigma_i^2 I + r_i r_i^T$
+    The loss (negative log likelihood) is then:
+    $1/2 sum_{i=1}^n (log(2 pi)
+      + log sigma_i^2 (sigma_i^2 + ||r_i||^2)
+      + ||delta_i||^2 / sigma_i^2
+      - <delta_i, r_i>^2 / (sigma_i^2 * (sigma_i^2 + ||r_i||^2)))$,
+    where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates
+    difference between estimated and ground truth UV values
+    For details, see:
+    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+    """
+
+    def __init__(self, sigma_lower_bound: float):
+        super(IndepAnisotropicGaussianUVLoss, self).__init__()
+        self.sigma_lower_bound = sigma_lower_bound
+        self.log2pi = math.log(2 * math.pi)
+
+    def forward(
+        self,
+        u: torch.Tensor,
+        v: torch.Tensor,
+        sigma_u: torch.Tensor,
+        kappa_u_est: torch.Tensor,
+        kappa_v_est: torch.Tensor,
+        target_u: torch.Tensor,
+        target_v: torch.Tensor,
+    ):
+        # compute $\sigma_i^2$
+        sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
+        # compute \|r_i\|^2
+        r_sqnorm2 = kappa_u_est ** 2 + kappa_v_est ** 2
+        delta_u = u - target_u
+        delta_v = v - target_v
+        # compute \|delta_i\|^2
+        delta_sqnorm = delta_u ** 2 + delta_v ** 2
+        delta_u_r_u = delta_u * kappa_u_est
+        delta_v_r_v = delta_v * kappa_v_est
+        # compute the scalar product <delta_i, r_i>
+        delta_r = delta_u_r_u + delta_v_r_v
+        # compute squared scalar product <delta_i, r_i>^2
+        delta_r_sqnorm = delta_r ** 2
+        denom2 = sigma2 * (sigma2 + r_sqnorm2)
+        loss = 0.5 * (
+            self.log2pi + torch.log(denom2) + delta_sqnorm / sigma2 - delta_r_sqnorm / denom2
+        )
+        return loss.sum()
+
+
+class DensePoseLosses(object):
+    def __init__(self, cfg):
+        # fmt: off
+        self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
+        self.w_points     = cfg.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS
+        self.w_part       = cfg.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS
+        self.w_segm       = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS
+        self.n_segm_chan  = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+        # fmt: on
+        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
+        if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
+            self.uv_loss_with_confidences = IIDIsotropicGaussianUVLoss(
+                self.confidence_model_cfg.uv_confidence.epsilon
+            )
+        elif self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
+            self.uv_loss_with_confidences = IndepAnisotropicGaussianUVLoss(
+                self.confidence_model_cfg.uv_confidence.epsilon
+            )
+
+    def __call__(self, proposals_with_gt, densepose_outputs, densepose_confidences):
+        losses = {}
+        # densepose outputs are computed for all images and all bounding boxes;
+        # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
+        # the outputs will have size(0) == 3+1+2+1 == 7
+        s, index_uv, u, v = densepose_outputs
+        sigma_1, sigma_2, kappa_u, kappa_v = densepose_confidences
+        conf_type = self.confidence_model_cfg.uv_confidence.type
+        assert u.size(2) == v.size(2)
+        assert u.size(3) == v.size(3)
+        assert u.size(2) == index_uv.size(2)
+        assert u.size(3) == index_uv.size(3)
+
+        with torch.no_grad():
+            (
+                index_uv_img,
+                i_with_dp,
+                bbox_xywh_est,
+                bbox_xywh_gt,
+                index_gt_all,
+                x_norm,
+                y_norm,
+                u_gt_all,
+                v_gt_all,
+                s_gt,
+                index_bbox,
+            ) = _extract_single_tensors_from_matches(  # noqa
+                proposals_with_gt
+            )
+        n_batch = len(i_with_dp)
+
+        # NOTE: we need to keep the same computation graph on all the GPUs to
+        # perform reduction properly. Hence even if we have no data on one
+        # of the GPUs, we still need to generate the computation graph.
+        # Add fake (zero) loss in the form Tensor.sum() * 0
+        if not n_batch:
+            losses["loss_densepose_I"] = index_uv.sum() * 0
+            losses["loss_densepose_S"] = s.sum() * 0
+            if self.confidence_model_cfg.uv_confidence.enabled:
+                losses["loss_densepose_UV"] = (u.sum() + v.sum()) * 0
+                if conf_type == DensePoseUVConfidenceType.IID_ISO:
+                    losses["loss_densepose_UV"] += sigma_2.sum() * 0
+                elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO:
+                    losses["loss_densepose_UV"] += (
+                        sigma_2.sum() + kappa_u.sum() + kappa_v.sum()
+                    ) * 0
+            else:
+                losses["loss_densepose_U"] = u.sum() * 0
+                losses["loss_densepose_V"] = v.sum() * 0
+            return losses
+
+        zh = u.size(2)
+        zw = u.size(3)
+
+        (
+            j_valid,
+            y_lo,
+            y_hi,
+            x_lo,
+            x_hi,
+            w_ylo_xlo,
+            w_ylo_xhi,
+            w_yhi_xlo,
+            w_yhi_xhi,
+        ) = _grid_sampling_utilities(  # noqa
+            zh, zw, bbox_xywh_est, bbox_xywh_gt, index_gt_all, x_norm, y_norm, index_bbox
+        )
+
+        j_valid_fg = j_valid * (index_gt_all > 0)
+
+        u_gt = u_gt_all[j_valid_fg]
+        u_est_all = _extract_at_points_packed(
+            u[i_with_dp],
+            index_bbox,
+            index_gt_all,
+            y_lo,
+            y_hi,
+            x_lo,
+            x_hi,
+            w_ylo_xlo,
+            w_ylo_xhi,
+            w_yhi_xlo,
+            w_yhi_xhi,
+        )
+        u_est = u_est_all[j_valid_fg]
+
+        v_gt = v_gt_all[j_valid_fg]
+        v_est_all = _extract_at_points_packed(
+            v[i_with_dp],
+            index_bbox,
+            index_gt_all,
+            y_lo,
+            y_hi,
+            x_lo,
+            x_hi,
+            w_ylo_xlo,
+            w_ylo_xhi,
+            w_yhi_xlo,
+            w_yhi_xhi,
+        )
+        v_est = v_est_all[j_valid_fg]
+
+        index_uv_gt = index_gt_all[j_valid]
+        index_uv_est_all = _extract_at_points_packed(
+            index_uv[i_with_dp],
+            index_bbox,
+            slice(None),
+            y_lo,
+            y_hi,
+            x_lo,
+            x_hi,
+            w_ylo_xlo[:, None],
+            w_ylo_xhi[:, None],
+            w_yhi_xlo[:, None],
+            w_yhi_xhi[:, None],
+        )
+        index_uv_est = index_uv_est_all[j_valid, :]
+
+        if self.confidence_model_cfg.uv_confidence.enabled:
+            sigma_2_est_all = _extract_at_points_packed(
+                sigma_2[i_with_dp],
+                index_bbox,
+                index_gt_all,
+                y_lo,
+                y_hi,
+                x_lo,
+                x_hi,
+                w_ylo_xlo,
+                w_ylo_xhi,
+                w_yhi_xlo,
+                w_yhi_xhi,
+            )
+            sigma_2_est = sigma_2_est_all[j_valid_fg]
+            if conf_type in [DensePoseUVConfidenceType.INDEP_ANISO]:
+                kappa_u_est_all = _extract_at_points_packed(
+                    kappa_u[i_with_dp],
+                    index_bbox,
+                    index_gt_all,
+                    y_lo,
+                    y_hi,
+                    x_lo,
+                    x_hi,
+                    w_ylo_xlo,
+                    w_ylo_xhi,
+                    w_yhi_xlo,
+                    w_yhi_xhi,
+                )
+                kappa_u_est = kappa_u_est_all[j_valid_fg]
+                kappa_v_est_all = _extract_at_points_packed(
+                    kappa_v[i_with_dp],
+                    index_bbox,
+                    index_gt_all,
+                    y_lo,
+                    y_hi,
+                    x_lo,
+                    x_hi,
+                    w_ylo_xlo,
+                    w_ylo_xhi,
+                    w_yhi_xlo,
+                    w_yhi_xhi,
+                )
+                kappa_v_est = kappa_v_est_all[j_valid_fg]
+
+        # Resample everything to the estimated data size, no need to resample
+        # S_est then:
+        s_est = s[i_with_dp]
+        with torch.no_grad():
+            s_gt = _resample_data(
+                s_gt.unsqueeze(1),
+                bbox_xywh_gt,
+                bbox_xywh_est,
+                self.heatmap_size,
+                self.heatmap_size,
+                mode="nearest",
+                padding_mode="zeros",
+            ).squeeze(1)
+
+        # add point-based losses:
+        if self.confidence_model_cfg.uv_confidence.enabled:
+            if conf_type == DensePoseUVConfidenceType.IID_ISO:
+                uv_loss = (
+                    self.uv_loss_with_confidences(u_est, v_est, sigma_2_est, u_gt, v_gt)
+                    * self.w_points
+                )
+                losses["loss_densepose_UV"] = uv_loss
+            elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO:
+                uv_loss = (
+                    self.uv_loss_with_confidences(
+                        u_est, v_est, sigma_2_est, kappa_u_est, kappa_v_est, u_gt, v_gt
+                    )
+                    * self.w_points
+                )
+                losses["loss_densepose_UV"] = uv_loss
+            else:
+                raise ValueError(f"Unknown confidence model type: {conf_type}")
+        else:
+            u_loss = F.smooth_l1_loss(u_est, u_gt, reduction="sum") * self.w_points
+            losses["loss_densepose_U"] = u_loss
+            v_loss = F.smooth_l1_loss(v_est, v_gt, reduction="sum") * self.w_points
+            losses["loss_densepose_V"] = v_loss
+        index_uv_loss = F.cross_entropy(index_uv_est, index_uv_gt.long()) * self.w_part
+        losses["loss_densepose_I"] = index_uv_loss
+
+        if self.n_segm_chan == 2:
+            s_gt = s_gt > 0
+        s_loss = F.cross_entropy(s_est, s_gt.long()) * self.w_segm
+        losses["loss_densepose_S"] = s_loss
+        return losses
+
+
+def build_densepose_losses(cfg):
+    losses = DensePoseLosses(cfg)
+    return losses
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/evaluator.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bb002b5093365f12edf5f4610ab261491d12bc8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/evaluator.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import contextlib
+import copy
+import io
+import itertools
+import json
+import logging
+import os
+from collections import OrderedDict
+import torch
+from fvcore.common.file_io import PathManager
+from pycocotools.coco import COCO
+
+from detectron2.data import MetadataCatalog
+from detectron2.evaluation import DatasetEvaluator
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import all_gather, is_main_process, synchronize
+from detectron2.utils.logger import create_small_table
+
+from .densepose_coco_evaluation import DensePoseCocoEval, DensePoseEvalMode
+
+
+class DensePoseCOCOEvaluator(DatasetEvaluator):
+    def __init__(self, dataset_name, distributed, output_dir=None):
+        self._distributed = distributed
+        self._output_dir = output_dir
+
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        json_file = PathManager.get_local_path(self._metadata.json_file)
+        with contextlib.redirect_stdout(io.StringIO()):
+            self._coco_api = COCO(json_file)
+
+    def reset(self):
+        self._predictions = []
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a COCO model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+                The :class:`Instances` object needs to have `densepose` field.
+        """
+        for input, output in zip(inputs, outputs):
+            instances = output["instances"].to(self._cpu_device)
+
+            boxes = instances.pred_boxes.tensor.clone()
+            boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+            instances.pred_densepose = instances.pred_densepose.to_result(boxes)
+
+            json_results = prediction_to_json(instances, input["image_id"])
+            self._predictions.extend(json_results)
+
+    def evaluate(self):
+        if self._distributed:
+            synchronize()
+            predictions = all_gather(self._predictions)
+            predictions = list(itertools.chain(*predictions))
+            if not is_main_process():
+                return
+        else:
+            predictions = self._predictions
+
+        return copy.deepcopy(self._eval_predictions(predictions))
+
+    def _eval_predictions(self, predictions):
+        """
+        Evaluate predictions on densepose.
+        Return results with the metrics of the tasks.
+        """
+        self._logger.info("Preparing results for COCO format ...")
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "coco_densepose_results.json")
+            with open(file_path, "w") as f:
+                json.dump(predictions, f)
+                f.flush()
+                os.fsync(f.fileno())
+
+        self._logger.info("Evaluating predictions ...")
+        res = OrderedDict()
+        results_gps, results_gpsm = _evaluate_predictions_on_coco(self._coco_api, predictions)
+        res["densepose_gps"] = results_gps
+        res["densepose_gpsm"] = results_gpsm
+        return res
+
+
+def prediction_to_json(instances, img_id):
+    """
+    Args:
+        instances (Instances): the output of the model
+        img_id (str): the image id in COCO
+
+    Returns:
+        list[dict]: the results in densepose evaluation format
+    """
+    scores = instances.scores.tolist()
+
+    results = []
+    for k in range(len(instances)):
+        densepose = instances.pred_densepose[k]
+        result = {
+            "image_id": img_id,
+            "category_id": 1,  # densepose only has one class
+            "bbox": densepose[1],
+            "score": scores[k],
+            "densepose": densepose,
+        }
+        results.append(result)
+    return results
+
+
+def _evaluate_predictions_on_coco(coco_gt, coco_results):
+    metrics = ["AP", "AP50", "AP75", "APm", "APl"]
+
+    logger = logging.getLogger(__name__)
+
+    if len(coco_results) == 0:  # cocoapi does not handle empty results very well
+        logger.warn("No predictions from the model! Set scores to -1")
+        results_gps = {metric: -1 for metric in metrics}
+        results_gpsm = {metric: -1 for metric in metrics}
+        return results_gps, results_gpsm
+
+    coco_dt = coco_gt.loadRes(coco_results)
+    results_gps = _evaluate_predictions_on_coco_gps(coco_gt, coco_dt, metrics)
+    logger.info(
+        "Evaluation results for densepose, GPS metric: \n" + create_small_table(results_gps)
+    )
+    results_gpsm = _evaluate_predictions_on_coco_gpsm(coco_gt, coco_dt, metrics)
+    logger.info(
+        "Evaluation results for densepose, GPSm metric: \n" + create_small_table(results_gpsm)
+    )
+    return results_gps, results_gpsm
+
+
+def _evaluate_predictions_on_coco_gps(coco_gt, coco_dt, metrics):
+    coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPS)
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
+    return results
+
+
+def _evaluate_predictions_on_coco_gpsm(coco_gt, coco_dt, metrics):
+    coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPSM)
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
+    return results
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/modeling/test_time_augmentation.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/modeling/test_time_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcf69db1b6e4c687bc4e284e2795cab61ebf043f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/modeling/test_time_augmentation.py
@@ -0,0 +1,75 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from detectron2.modeling.test_time_augmentation import GeneralizedRCNNWithTTA
+
+
+class DensePoseGeneralizedRCNNWithTTA(GeneralizedRCNNWithTTA):
+    def __init__(self, cfg, model, transform_data, tta_mapper=None, batch_size=1):
+        """
+        Args:
+            cfg (CfgNode):
+            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
+            transform_data (DensePoseTransformData): contains symmetry label
+                transforms used for horizontal flip
+            tta_mapper (callable): takes a dataset dict and returns a list of
+                augmented versions of the dataset dict. Defaults to
+                `DatasetMapperTTA(cfg)`.
+            batch_size (int): batch the augmented images into this batch size for inference.
+        """
+        self._transform_data = transform_data
+        super().__init__(cfg=cfg, model=model, tta_mapper=tta_mapper, batch_size=batch_size)
+
+    # the implementation follows closely the one from detectron2/modeling
+    def _inference_one_image(self, input):
+        """
+        Args:
+            input (dict): one dataset dict
+
+        Returns:
+            dict: one output dict
+        """
+
+        augmented_inputs, aug_vars = self._get_augmented_inputs(input)
+        # Detect boxes from all augmented versions
+        with self._turn_off_roi_heads(["mask_on", "keypoint_on", "densepose_on"]):
+            # temporarily disable roi heads
+            all_boxes, all_scores, all_classes = self._get_augmented_boxes(
+                augmented_inputs, aug_vars
+            )
+        merged_instances = self._merge_detections(
+            all_boxes, all_scores, all_classes, (aug_vars["height"], aug_vars["width"])
+        )
+
+        if self.cfg.MODEL.MASK_ON or self.cfg.MODEL.DENSEPOSE_ON:
+            # Use the detected boxes to obtain new fields
+            augmented_instances = self._rescale_detected_boxes(
+                augmented_inputs, merged_instances, aug_vars
+            )
+            # run forward on the detected boxes
+            outputs = self._batch_inference(
+                augmented_inputs, augmented_instances, do_postprocess=False
+            )
+            # Delete now useless variables to avoid being out of memory
+            del augmented_inputs, augmented_instances, merged_instances
+            # average the predictions
+            if self.cfg.MODEL.MASK_ON:
+                outputs[0].pred_masks = self._reduce_pred_masks(outputs, aug_vars)
+            if self.cfg.MODEL.DENSEPOSE_ON:
+                outputs[0].pred_densepose = self._reduce_pred_densepose(outputs, aug_vars)
+            # postprocess
+            output = self._detector_postprocess(outputs[0], aug_vars)
+            return {"instances": output}
+        else:
+            return {"instances": merged_instances}
+
+    def _reduce_pred_densepose(self, outputs, aug_vars):
+        for idx, output in enumerate(outputs):
+            if aug_vars["do_hflip"][idx]:
+                output.pred_densepose.hflip(self._transform_data)
+        # Less memory-intensive averaging
+        for attr in "SIUV":
+            setattr(
+                outputs[0].pred_densepose,
+                attr,
+                sum(getattr(o.pred_densepose, attr) for o in outputs) / len(outputs),
+            )
+        return outputs[0].pred_densepose
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/roi_head.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..023119760b77cf5294ed18292e77e7f495099770
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/roi_head.py
@@ -0,0 +1,213 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import numpy as np
+from typing import Dict
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.modeling.roi_heads import select_foreground_proposals
+
+from .densepose_head import (
+    build_densepose_data_filter,
+    build_densepose_head,
+    build_densepose_losses,
+    build_densepose_predictor,
+    densepose_inference,
+)
+
+
+class Decoder(nn.Module):
+    """
+    A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper
+    (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from
+    all levels of the FPN into single output.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features):
+        super(Decoder, self).__init__()
+
+        # fmt: off
+        self.in_features      = in_features
+        feature_strides       = {k: v.stride for k, v in input_shape.items()}
+        feature_channels      = {k: v.channels for k, v in input_shape.items()}
+        num_classes           = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
+        conv_dims             = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
+        self.common_stride    = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
+        norm                  = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM
+        # fmt: on
+
+        self.scale_heads = []
+        for in_feature in self.in_features:
+            head_ops = []
+            head_length = max(
+                1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
+            )
+            for k in range(head_length):
+                conv = Conv2d(
+                    feature_channels[in_feature] if k == 0 else conv_dims,
+                    conv_dims,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=not norm,
+                    norm=get_norm(norm, conv_dims),
+                    activation=F.relu,
+                )
+                weight_init.c2_msra_fill(conv)
+                head_ops.append(conv)
+                if feature_strides[in_feature] != self.common_stride:
+                    head_ops.append(
+                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+                    )
+            self.scale_heads.append(nn.Sequential(*head_ops))
+            self.add_module(in_feature, self.scale_heads[-1])
+        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
+        weight_init.c2_msra_fill(self.predictor)
+
+    def forward(self, features):
+        for i, _ in enumerate(self.in_features):
+            if i == 0:
+                x = self.scale_heads[i](features[i])
+            else:
+                x = x + self.scale_heads[i](features[i])
+        x = self.predictor(x)
+        return x
+
+
+@ROI_HEADS_REGISTRY.register()
+class DensePoseROIHeads(StandardROIHeads):
+    """
+    A Standard ROIHeads which contains an addition of DensePose head.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super().__init__(cfg, input_shape)
+        self._init_densepose_head(cfg, input_shape)
+
+    def _init_densepose_head(self, cfg, input_shape):
+        # fmt: off
+        self.densepose_on          = cfg.MODEL.DENSEPOSE_ON
+        if not self.densepose_on:
+            return
+        self.densepose_data_filter = build_densepose_data_filter(cfg)
+        dp_pooler_resolution       = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION
+        dp_pooler_sampling_ratio   = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO
+        dp_pooler_type             = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE
+        self.use_decoder           = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON
+        # fmt: on
+        if self.use_decoder:
+            dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,)
+        else:
+            dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features)
+        in_channels = [input_shape[f].channels for f in self.in_features][0]
+
+        if self.use_decoder:
+            self.decoder = Decoder(cfg, input_shape, self.in_features)
+
+        self.densepose_pooler = ROIPooler(
+            output_size=dp_pooler_resolution,
+            scales=dp_pooler_scales,
+            sampling_ratio=dp_pooler_sampling_ratio,
+            pooler_type=dp_pooler_type,
+        )
+        self.densepose_head = build_densepose_head(cfg, in_channels)
+        self.densepose_predictor = build_densepose_predictor(
+            cfg, self.densepose_head.n_out_channels
+        )
+        self.densepose_losses = build_densepose_losses(cfg)
+
+    def _forward_densepose(self, features, instances):
+        """
+        Forward logic of the densepose prediction branch.
+
+        Args:
+            features (list[Tensor]): #level input features for densepose prediction
+            instances (list[Instances]): the per-image instances to train/predict densepose.
+                In training, they can be the proposals.
+                In inference, they can be the predicted boxes.
+
+        Returns:
+            In training, a dict of losses.
+            In inference, update `instances` with new fields "densepose" and return it.
+        """
+        if not self.densepose_on:
+            return {} if self.training else instances
+
+        features = [features[f] for f in self.in_features]
+        if self.training:
+            proposals, _ = select_foreground_proposals(instances, self.num_classes)
+            proposals_dp = self.densepose_data_filter(proposals)
+            if len(proposals_dp) > 0:
+                # NOTE may deadlock in DDP if certain workers have empty proposals_dp
+                proposal_boxes = [x.proposal_boxes for x in proposals_dp]
+
+                if self.use_decoder:
+                    features = [self.decoder(features)]
+
+                features_dp = self.densepose_pooler(features, proposal_boxes)
+                densepose_head_outputs = self.densepose_head(features_dp)
+                densepose_outputs, _, confidences, _ = self.densepose_predictor(
+                    densepose_head_outputs
+                )
+                densepose_loss_dict = self.densepose_losses(
+                    proposals_dp, densepose_outputs, confidences
+                )
+                return densepose_loss_dict
+        else:
+            pred_boxes = [x.pred_boxes for x in instances]
+
+            if self.use_decoder:
+                features = [self.decoder(features)]
+
+            features_dp = self.densepose_pooler(features, pred_boxes)
+            if len(features_dp) > 0:
+                densepose_head_outputs = self.densepose_head(features_dp)
+                densepose_outputs, _, confidences, _ = self.densepose_predictor(
+                    densepose_head_outputs
+                )
+            else:
+                # If no detection occurred instances
+                # set densepose_outputs to empty tensors
+                empty_tensor = torch.zeros(size=(0, 0, 0, 0), device=features_dp.device)
+                densepose_outputs = tuple([empty_tensor] * 4)
+                confidences = tuple([empty_tensor] * 4)
+
+            densepose_inference(densepose_outputs, confidences, instances)
+            return instances
+
+    def forward(self, images, features, proposals, targets=None):
+        instances, losses = super().forward(images, features, proposals, targets)
+        del targets, images
+
+        if self.training:
+            losses.update(self._forward_densepose(features, instances))
+        return instances, losses
+
+    def forward_with_given_boxes(self, features, instances):
+        """
+        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
+
+        This is useful for downstream tasks where a box is known, but need to obtain
+        other attributes (outputs of other heads).
+        Test-time augmentation also uses this.
+
+        Args:
+            features: same as in `forward()`
+            instances (list[Instances]): instances to predict other outputs. Expect the keys
+                "pred_boxes" and "pred_classes" to exist.
+
+        Returns:
+            instances (list[Instances]):
+                the same `Instances` objects, with extra
+                fields such as `pred_masks` or `pred_keypoints`.
+        """
+
+        instances = super().forward_with_given_boxes(features, instances)
+        instances = self._forward_densepose(features, instances)
+        return instances
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/utils/dbhelper.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/utils/dbhelper.py
new file mode 100644
index 0000000000000000000000000000000000000000..b28862cdede26c13200d928118d5bc5c00e3d2aa
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/utils/dbhelper.py
@@ -0,0 +1,145 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from typing import Any, Dict, Optional, Tuple
+
+
+class EntrySelector(object):
+    """
+    Base class for entry selectors
+    """
+
+    @staticmethod
+    def from_string(spec: str) -> "EntrySelector":
+        if spec == "*":
+            return AllEntrySelector()
+        return FieldEntrySelector(spec)
+
+
+class AllEntrySelector(EntrySelector):
+    """
+    Selector that accepts all entries
+    """
+
+    SPECIFIER = "*"
+
+    def __call__(self, entry):
+        return True
+
+
+class FieldEntrySelector(EntrySelector):
+    """
+    Selector that accepts only entries that match provided field
+    specifier(s). Only a limited set of specifiers is supported for now:
+      <specifiers>::=<specifier>[<comma><specifiers>]
+      <specifier>::=<field_name>[<type_delim><type>]<equal><value_or_range>
+      <field_name> is a valid identifier
+      <type> ::= "int" | "str"
+      <equal> ::= "="
+      <comma> ::= ","
+      <type_delim> ::= ":"
+      <value_or_range> ::= <value> | <range>
+      <range> ::= <value><range_delim><value>
+      <range_delim> ::= "-"
+      <value> is a string without spaces and special symbols
+        (e.g. <comma>, <equal>, <type_delim>, <range_delim>)
+    """
+
+    _SPEC_DELIM = ","
+    _TYPE_DELIM = ":"
+    _RANGE_DELIM = "-"
+    _EQUAL = "="
+    _ERROR_PREFIX = "Invalid field selector specifier"
+
+    class _FieldEntryValuePredicate(object):
+        """
+        Predicate that checks strict equality for the specified entry field
+        """
+
+        def __init__(self, name: str, typespec: str, value: str):
+            import builtins
+
+            self.name = name
+            self.type = getattr(builtins, typespec) if typespec is not None else str
+            self.value = value
+
+        def __call__(self, entry):
+            return entry[self.name] == self.type(self.value)
+
+    class _FieldEntryRangePredicate(object):
+        """
+        Predicate that checks whether an entry field falls into the specified range
+        """
+
+        def __init__(self, name: str, typespec: str, vmin: str, vmax: str):
+            import builtins
+
+            self.name = name
+            self.type = getattr(builtins, typespec) if typespec is not None else str
+            self.vmin = vmin
+            self.vmax = vmax
+
+        def __call__(self, entry):
+            return (entry[self.name] >= self.type(self.vmin)) and (
+                entry[self.name] <= self.type(self.vmax)
+            )
+
+    def __init__(self, spec: str):
+        self._predicates = self._parse_specifier_into_predicates(spec)
+
+    def __call__(self, entry: Dict[str, Any]):
+        for predicate in self._predicates:
+            if not predicate(entry):
+                return False
+        return True
+
+    def _parse_specifier_into_predicates(self, spec: str):
+        predicates = []
+        specs = spec.split(self._SPEC_DELIM)
+        for subspec in specs:
+            eq_idx = subspec.find(self._EQUAL)
+            if eq_idx > 0:
+                field_name_with_type = subspec[:eq_idx]
+                field_name, field_type = self._parse_field_name_type(field_name_with_type)
+                field_value_or_range = subspec[eq_idx + 1 :]
+                if self._is_range_spec(field_value_or_range):
+                    vmin, vmax = self._get_range_spec(field_value_or_range)
+                    predicate = FieldEntrySelector._FieldEntryRangePredicate(
+                        field_name, field_type, vmin, vmax
+                    )
+                else:
+                    predicate = FieldEntrySelector._FieldEntryValuePredicate(
+                        field_name, field_type, field_value_or_range
+                    )
+                predicates.append(predicate)
+            elif eq_idx == 0:
+                self._parse_error(f'"{subspec}", field name is empty!')
+            else:
+                self._parse_error(f'"{subspec}", should have format ' "<field>=<value_or_range>!")
+        return predicates
+
+    def _parse_field_name_type(self, field_name_with_type: str) -> Tuple[str, Optional[str]]:
+        type_delim_idx = field_name_with_type.find(self._TYPE_DELIM)
+        if type_delim_idx > 0:
+            field_name = field_name_with_type[:type_delim_idx]
+            field_type = field_name_with_type[type_delim_idx + 1 :]
+        elif type_delim_idx == 0:
+            self._parse_error(f'"{field_name_with_type}", field name is empty!')
+        else:
+            field_name = field_name_with_type
+            field_type = None
+        return field_name, field_type
+
+    def _is_range_spec(self, field_value_or_range):
+        delim_idx = field_value_or_range.find(self._RANGE_DELIM)
+        return delim_idx > 0
+
+    def _get_range_spec(self, field_value_or_range):
+        if self._is_range_spec(field_value_or_range):
+            delim_idx = field_value_or_range.find(self._RANGE_DELIM)
+            vmin = field_value_or_range[:delim_idx]
+            vmax = field_value_or_range[delim_idx + 1 :]
+            return vmin, vmax
+        else:
+            self._parse_error('"field_value_or_range", range of values expected!')
+
+    def _parse_error(self, msg):
+        raise ValueError(f"{self._ERROR_PREFIX}: {msg}")
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/utils/logger.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3fa45e0c0218bdd2e79c08b0d8ff83abc3e4308
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/utils/logger.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+
+
+def verbosity_to_level(verbosity):
+    if verbosity is not None:
+        if verbosity == 0:
+            return logging.WARNING
+        elif verbosity == 1:
+            return logging.INFO
+        elif verbosity >= 2:
+            return logging.DEBUG
+    return logging.WARNING
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/utils/transform.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/utils/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7cfe097234dbd3ff19b84ecdfb63fd8bf5fd4b6
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/utils/transform.py
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from fvcore.common.file_io import PathManager
+
+from detectron2.data import MetadataCatalog
+
+from densepose import DensePoseTransformData
+
+
+def load_for_dataset(dataset_name):
+    path = MetadataCatalog.get(dataset_name).densepose_transform_src
+    densepose_transform_data_fpath = PathManager.get_local_path(path)
+    return DensePoseTransformData.load(densepose_transform_data_fpath)
+
+
+def load_from_cfg(cfg):
+    return load_for_dataset(cfg.DATASETS.TEST[0])
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/base.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..2aa3e6e9f44ae2ce888f6e24dd11c8428734417b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/base.py
@@ -0,0 +1,191 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+import cv2
+import torch
+
+Image = np.ndarray
+Boxes = torch.Tensor
+
+
+class MatrixVisualizer(object):
+    """
+    Base visualizer for matrix data
+    """
+
+    def __init__(
+        self,
+        inplace=True,
+        cmap=cv2.COLORMAP_PARULA,
+        val_scale=1.0,
+        alpha=0.7,
+        interp_method_matrix=cv2.INTER_LINEAR,
+        interp_method_mask=cv2.INTER_NEAREST,
+    ):
+        self.inplace = inplace
+        self.cmap = cmap
+        self.val_scale = val_scale
+        self.alpha = alpha
+        self.interp_method_matrix = interp_method_matrix
+        self.interp_method_mask = interp_method_mask
+
+    def visualize(self, image_bgr, mask, matrix, bbox_xywh):
+        self._check_image(image_bgr)
+        self._check_mask_matrix(mask, matrix)
+        if self.inplace:
+            image_target_bgr = image_bgr
+        else:
+            image_target_bgr = image_bgr * 0
+        x, y, w, h = [int(v) for v in bbox_xywh]
+        if w <= 0 or h <= 0:
+            return image_bgr
+        mask, matrix = self._resize(mask, matrix, w, h)
+        mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3])
+        matrix_scaled = matrix.astype(np.float32) * self.val_scale
+        _EPSILON = 1e-6
+        if np.any(matrix_scaled > 255 + _EPSILON):
+            logger = logging.getLogger(__name__)
+            logger.warning(
+                f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]"
+            )
+        matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8)
+        matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap)
+        matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg]
+        image_target_bgr[y : y + h, x : x + w, :] = (
+            image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha
+        )
+        return image_target_bgr.astype(np.uint8)
+
+    def _resize(self, mask, matrix, w, h):
+        if (w != mask.shape[1]) or (h != mask.shape[0]):
+            mask = cv2.resize(mask, (w, h), self.interp_method_mask)
+        if (w != matrix.shape[1]) or (h != matrix.shape[0]):
+            matrix = cv2.resize(matrix, (w, h), self.interp_method_matrix)
+        return mask, matrix
+
+    def _check_image(self, image_rgb):
+        assert len(image_rgb.shape) == 3
+        assert image_rgb.shape[2] == 3
+        assert image_rgb.dtype == np.uint8
+
+    def _check_mask_matrix(self, mask, matrix):
+        assert len(matrix.shape) == 2
+        assert len(mask.shape) == 2
+        assert mask.dtype == np.uint8
+
+
+class RectangleVisualizer(object):
+
+    _COLOR_GREEN = (18, 127, 15)
+
+    def __init__(self, color=_COLOR_GREEN, thickness=1):
+        self.color = color
+        self.thickness = thickness
+
+    def visualize(self, image_bgr, bbox_xywh, color=None, thickness=None):
+        x, y, w, h = bbox_xywh
+        color = color or self.color
+        thickness = thickness or self.thickness
+        cv2.rectangle(image_bgr, (int(x), int(y)), (int(x + w), int(y + h)), color, thickness)
+        return image_bgr
+
+
+class PointsVisualizer(object):
+
+    _COLOR_GREEN = (18, 127, 15)
+
+    def __init__(self, color_bgr=_COLOR_GREEN, r=5):
+        self.color_bgr = color_bgr
+        self.r = r
+
+    def visualize(self, image_bgr, pts_xy, colors_bgr=None, rs=None):
+        for j, pt_xy in enumerate(pts_xy):
+            x, y = pt_xy
+            color_bgr = colors_bgr[j] if colors_bgr is not None else self.color_bgr
+            r = rs[j] if rs is not None else self.r
+            cv2.circle(image_bgr, (x, y), r, color_bgr, -1)
+        return image_bgr
+
+
+class TextVisualizer(object):
+
+    _COLOR_GRAY = (218, 227, 218)
+    _COLOR_WHITE = (255, 255, 255)
+
+    def __init__(
+        self,
+        font_face=cv2.FONT_HERSHEY_SIMPLEX,
+        font_color_bgr=_COLOR_GRAY,
+        font_scale=0.35,
+        font_line_type=cv2.LINE_AA,
+        font_line_thickness=1,
+        fill_color_bgr=_COLOR_WHITE,
+        fill_color_transparency=1.0,
+        frame_color_bgr=_COLOR_WHITE,
+        frame_color_transparency=1.0,
+        frame_thickness=1,
+    ):
+        self.font_face = font_face
+        self.font_color_bgr = font_color_bgr
+        self.font_scale = font_scale
+        self.font_line_type = font_line_type
+        self.font_line_thickness = font_line_thickness
+        self.fill_color_bgr = fill_color_bgr
+        self.fill_color_transparency = fill_color_transparency
+        self.frame_color_bgr = frame_color_bgr
+        self.frame_color_transparency = frame_color_transparency
+        self.frame_thickness = frame_thickness
+
+    def visualize(self, image_bgr, txt, topleft_xy):
+        txt_w, txt_h = self.get_text_size_wh(txt)
+        topleft_xy = tuple(map(int, topleft_xy))
+        x, y = topleft_xy
+        if self.frame_color_transparency < 1.0:
+            t = self.frame_thickness
+            image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :] = (
+                image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :]
+                * self.frame_color_transparency
+                + np.array(self.frame_color_bgr) * (1.0 - self.frame_color_transparency)
+            ).astype(np.float)
+        if self.fill_color_transparency < 1.0:
+            image_bgr[y : y + txt_h, x : x + txt_w, :] = (
+                image_bgr[y : y + txt_h, x : x + txt_w, :] * self.fill_color_transparency
+                + np.array(self.fill_color_bgr) * (1.0 - self.fill_color_transparency)
+            ).astype(np.float)
+        cv2.putText(
+            image_bgr,
+            txt,
+            topleft_xy,
+            self.font_face,
+            self.font_scale,
+            self.font_color_bgr,
+            self.font_line_thickness,
+            self.font_line_type,
+        )
+        return image_bgr
+
+    def get_text_size_wh(self, txt):
+        ((txt_w, txt_h), _) = cv2.getTextSize(
+            txt, self.font_face, self.font_scale, self.font_line_thickness
+        )
+        return txt_w, txt_h
+
+
+class CompoundVisualizer(object):
+    def __init__(self, visualizers):
+        self.visualizers = visualizers
+
+    def visualize(self, image_bgr, data):
+        assert len(data) == len(
+            self.visualizers
+        ), "The number of datas {} should match the number of visualizers" " {}".format(
+            len(data), len(self.visualizers)
+        )
+        image = image_bgr
+        for i, visualizer in enumerate(self.visualizers):
+            image = visualizer.visualize(image, data[i])
+        return image
+
+    def __str__(self):
+        visualizer_str = ", ".join([str(v) for v in self.visualizers])
+        return "Compound Visualizer [{}]".format(visualizer_str)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/bounding_box.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/bounding_box.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7951d69e4a92d638debc79458dd2cfe58c650e3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/bounding_box.py
@@ -0,0 +1,37 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .base import RectangleVisualizer, TextVisualizer
+
+
+class BoundingBoxVisualizer(object):
+    def __init__(self):
+        self.rectangle_visualizer = RectangleVisualizer()
+
+    def visualize(self, image_bgr, boxes_xywh):
+        for bbox_xywh in boxes_xywh:
+            image_bgr = self.rectangle_visualizer.visualize(image_bgr, bbox_xywh)
+        return image_bgr
+
+
+class ScoredBoundingBoxVisualizer(object):
+    def __init__(self, bbox_visualizer_params=None, score_visualizer_params=None):
+        if bbox_visualizer_params is None:
+            bbox_visualizer_params = {}
+        if score_visualizer_params is None:
+            score_visualizer_params = {}
+        self.visualizer_bbox = RectangleVisualizer(**bbox_visualizer_params)
+        self.visualizer_score = TextVisualizer(**score_visualizer_params)
+
+    def visualize(self, image_bgr, scored_bboxes):
+        boxes_xywh, box_scores = scored_bboxes
+        assert len(boxes_xywh) == len(
+            box_scores
+        ), "Number of bounding boxes {} should be equal to the number of scores {}".format(
+            len(boxes_xywh), len(box_scores)
+        )
+        for i, box_xywh in enumerate(boxes_xywh):
+            score_i = box_scores[i]
+            image_bgr = self.visualizer_bbox.visualize(image_bgr, box_xywh)
+            score_txt = "{0:6.4f}".format(score_i)
+            topleft_xy = box_xywh[0], box_xywh[1]
+            image_bgr = self.visualizer_score.visualize(image_bgr, score_txt, topleft_xy)
+        return image_bgr
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/densepose.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/densepose.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2e77dc2d8e0f8c041ac1217978c639a826f0857
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/densepose.py
@@ -0,0 +1,593 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+from typing import Iterable, Optional, Tuple
+import cv2
+
+from ..data.structures import DensePoseDataRelative, DensePoseOutput, DensePoseResult
+from .base import Boxes, Image, MatrixVisualizer, PointsVisualizer
+
+
+class DensePoseResultsVisualizer(object):
+    def visualize(self, image_bgr: Image, densepose_result: Optional[DensePoseResult]) -> Image:
+        if densepose_result is None:
+            return image_bgr
+        context = self.create_visualization_context(image_bgr)
+        for i, result_encoded_w_shape in enumerate(densepose_result.results):
+            iuv_arr = DensePoseResult.decode_png_data(*result_encoded_w_shape)
+            bbox_xywh = densepose_result.boxes_xywh[i]
+            self.visualize_iuv_arr(context, iuv_arr, bbox_xywh)
+        image_bgr = self.context_to_image_bgr(context)
+        return image_bgr
+
+
+class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer):
+    def __init__(
+        self,
+        data_extractor,
+        segm_extractor,
+        inplace=True,
+        cmap=cv2.COLORMAP_PARULA,
+        alpha=0.7,
+        val_scale=1.0,
+    ):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
+        )
+        self.data_extractor = data_extractor
+        self.segm_extractor = segm_extractor
+
+    def create_visualization_context(self, image_bgr: Image):
+        return image_bgr
+
+    def context_to_image_bgr(self, context):
+        return context
+
+    def get_image_bgr_from_context(self, context):
+        return context
+
+    def visualize_iuv_arr(self, context, iuv_arr, bbox_xywh):
+        image_bgr = self.get_image_bgr_from_context(context)
+        matrix = self.data_extractor(iuv_arr)
+        segm = self.segm_extractor(iuv_arr)
+        mask = np.zeros(matrix.shape, dtype=np.uint8)
+        mask[segm > 0] = 1
+        image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)
+        return image_bgr
+
+
+def _extract_i_from_iuvarr(iuv_arr):
+    return iuv_arr[0, :, :]
+
+
+def _extract_u_from_iuvarr(iuv_arr):
+    return iuv_arr[1, :, :]
+
+
+def _extract_v_from_iuvarr(iuv_arr):
+    return iuv_arr[2, :, :]
+
+
+class DensePoseResultsMplContourVisualizer(DensePoseResultsVisualizer):
+    def __init__(self, levels=10, **kwargs):
+        self.levels = levels
+        self.plot_args = kwargs
+
+    def create_visualization_context(self, image_bgr: Image):
+        import matplotlib.pyplot as plt
+        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
+
+        context = {}
+        context["image_bgr"] = image_bgr
+        dpi = 100
+        height_inches = float(image_bgr.shape[0]) / dpi
+        width_inches = float(image_bgr.shape[1]) / dpi
+        fig = plt.figure(figsize=(width_inches, height_inches), dpi=dpi)
+        plt.axes([0, 0, 1, 1])
+        plt.axis("off")
+        context["fig"] = fig
+        canvas = FigureCanvas(fig)
+        context["canvas"] = canvas
+        extent = (0, image_bgr.shape[1], image_bgr.shape[0], 0)
+        plt.imshow(image_bgr[:, :, ::-1], extent=extent)
+        return context
+
+    def context_to_image_bgr(self, context):
+        fig = context["fig"]
+        w, h = map(int, fig.get_size_inches() * fig.get_dpi())
+        canvas = context["canvas"]
+        canvas.draw()
+        image_1d = np.fromstring(canvas.tostring_rgb(), dtype="uint8")
+        image_rgb = image_1d.reshape(h, w, 3)
+        image_bgr = image_rgb[:, :, ::-1].copy()
+        return image_bgr
+
+    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> Image:
+        import matplotlib.pyplot as plt
+
+        u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
+        v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
+        extent = (
+            bbox_xywh[0],
+            bbox_xywh[0] + bbox_xywh[2],
+            bbox_xywh[1],
+            bbox_xywh[1] + bbox_xywh[3],
+        )
+        plt.contour(u, self.levels, extent=extent, **self.plot_args)
+        plt.contour(v, self.levels, extent=extent, **self.plot_args)
+
+
+class DensePoseResultsCustomContourVisualizer(DensePoseResultsVisualizer):
+    """
+    Contour visualization using marching squares
+    """
+
+    def __init__(self, levels=10, **kwargs):
+        # TODO: colormap is hardcoded
+        cmap = cv2.COLORMAP_PARULA
+        if isinstance(levels, int):
+            self.levels = np.linspace(0, 1, levels)
+        else:
+            self.levels = levels
+        if "linewidths" in kwargs:
+            self.linewidths = kwargs["linewidths"]
+        else:
+            self.linewidths = [1] * len(self.levels)
+        self.plot_args = kwargs
+        img_colors_bgr = cv2.applyColorMap((self.levels * 255).astype(np.uint8), cmap)
+        self.level_colors_bgr = [
+            [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
+        ]
+
+    def create_visualization_context(self, image_bgr: Image):
+        return image_bgr
+
+    def context_to_image_bgr(self, context):
+        return context
+
+    def get_image_bgr_from_context(self, context):
+        return context
+
+    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> Image:
+        image_bgr = self.get_image_bgr_from_context(context)
+        segm = _extract_i_from_iuvarr(iuv_arr)
+        u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
+        v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
+        self._contours(image_bgr, u, segm, bbox_xywh)
+        self._contours(image_bgr, v, segm, bbox_xywh)
+
+    def _contours(self, image_bgr, arr, segm, bbox_xywh):
+        for part_idx in range(1, DensePoseDataRelative.N_PART_LABELS + 1):
+            mask = segm == part_idx
+            if not np.any(mask):
+                continue
+            arr_min = np.amin(arr[mask])
+            arr_max = np.amax(arr[mask])
+            I, J = np.nonzero(mask)
+            i0 = np.amin(I)
+            i1 = np.amax(I) + 1
+            j0 = np.amin(J)
+            j1 = np.amax(J) + 1
+            if (j1 == j0 + 1) or (i1 == i0 + 1):
+                continue
+            Nw = arr.shape[1] - 1
+            Nh = arr.shape[0] - 1
+            for level_idx, level in enumerate(self.levels):
+                if (level < arr_min) or (level > arr_max):
+                    continue
+                vp = arr[i0:i1, j0:j1] >= level
+                bin_codes = vp[:-1, :-1] + vp[1:, :-1] * 2 + vp[1:, 1:] * 4 + vp[:-1, 1:] * 8
+                mp = mask[i0:i1, j0:j1]
+                bin_mask_codes = mp[:-1, :-1] + mp[1:, :-1] * 2 + mp[1:, 1:] * 4 + mp[:-1, 1:] * 8
+                it = np.nditer(bin_codes, flags=["multi_index"])
+                color_bgr = self.level_colors_bgr[level_idx]
+                linewidth = self.linewidths[level_idx]
+                while not it.finished:
+                    if (it[0] != 0) and (it[0] != 15):
+                        i, j = it.multi_index
+                        if bin_mask_codes[i, j] != 0:
+                            self._draw_line(
+                                image_bgr,
+                                arr,
+                                mask,
+                                level,
+                                color_bgr,
+                                linewidth,
+                                it[0],
+                                it.multi_index,
+                                bbox_xywh,
+                                Nw,
+                                Nh,
+                                (i0, j0),
+                            )
+                    it.iternext()
+
+    def _draw_line(
+        self,
+        image_bgr,
+        arr,
+        mask,
+        v,
+        color_bgr,
+        linewidth,
+        bin_code,
+        multi_idx,
+        bbox_xywh,
+        Nw,
+        Nh,
+        offset,
+    ):
+        lines = self._bin_code_2_lines(arr, v, bin_code, multi_idx, Nw, Nh, offset)
+        x0, y0, w, h = bbox_xywh
+        x1 = x0 + w
+        y1 = y0 + h
+        for line in lines:
+            x0r, y0r = line[0]
+            x1r, y1r = line[1]
+            pt0 = (int(x0 + x0r * (x1 - x0)), int(y0 + y0r * (y1 - y0)))
+            pt1 = (int(x0 + x1r * (x1 - x0)), int(y0 + y1r * (y1 - y0)))
+            cv2.line(image_bgr, pt0, pt1, color_bgr, linewidth)
+
+    def _bin_code_2_lines(self, arr, v, bin_code, multi_idx, Nw, Nh, offset):
+        i0, j0 = offset
+        i, j = multi_idx
+        i += i0
+        j += j0
+        v0, v1, v2, v3 = arr[i, j], arr[i + 1, j], arr[i + 1, j + 1], arr[i, j + 1]
+        x0i = float(j) / Nw
+        y0j = float(i) / Nh
+        He = 1.0 / Nh
+        We = 1.0 / Nw
+        if (bin_code == 1) or (bin_code == 14):
+            a = (v - v0) / (v1 - v0)
+            b = (v - v0) / (v3 - v0)
+            pt1 = (x0i, y0j + a * He)
+            pt2 = (x0i + b * We, y0j)
+            return [(pt1, pt2)]
+        elif (bin_code == 2) or (bin_code == 13):
+            a = (v - v0) / (v1 - v0)
+            b = (v - v1) / (v2 - v1)
+            pt1 = (x0i, y0j + a * He)
+            pt2 = (x0i + b * We, y0j + He)
+            return [(pt1, pt2)]
+        elif (bin_code == 3) or (bin_code == 12):
+            a = (v - v0) / (v3 - v0)
+            b = (v - v1) / (v2 - v1)
+            pt1 = (x0i + a * We, y0j)
+            pt2 = (x0i + b * We, y0j + He)
+            return [(pt1, pt2)]
+        elif (bin_code == 4) or (bin_code == 11):
+            a = (v - v1) / (v2 - v1)
+            b = (v - v3) / (v2 - v3)
+            pt1 = (x0i + a * We, y0j + He)
+            pt2 = (x0i + We, y0j + b * He)
+            return [(pt1, pt2)]
+        elif (bin_code == 6) or (bin_code == 9):
+            a = (v - v0) / (v1 - v0)
+            b = (v - v3) / (v2 - v3)
+            pt1 = (x0i, y0j + a * He)
+            pt2 = (x0i + We, y0j + b * He)
+            return [(pt1, pt2)]
+        elif (bin_code == 7) or (bin_code == 8):
+            a = (v - v0) / (v3 - v0)
+            b = (v - v3) / (v2 - v3)
+            pt1 = (x0i + a * We, y0j)
+            pt2 = (x0i + We, y0j + b * He)
+            return [(pt1, pt2)]
+        elif bin_code == 5:
+            a1 = (v - v0) / (v1 - v0)
+            b1 = (v - v1) / (v2 - v1)
+            pt11 = (x0i, y0j + a1 * He)
+            pt12 = (x0i + b1 * We, y0j + He)
+            a2 = (v - v0) / (v3 - v0)
+            b2 = (v - v3) / (v2 - v3)
+            pt21 = (x0i + a2 * We, y0j)
+            pt22 = (x0i + We, y0j + b2 * He)
+            return [(pt11, pt12), (pt21, pt22)]
+        elif bin_code == 10:
+            a1 = (v - v0) / (v3 - v0)
+            b1 = (v - v0) / (v1 - v0)
+            pt11 = (x0i + a1 * We, y0j)
+            pt12 = (x0i, y0j + b1 * He)
+            a2 = (v - v1) / (v2 - v1)
+            b2 = (v - v3) / (v2 - v3)
+            pt21 = (x0i + a2 * We, y0j + He)
+            pt22 = (x0i + We, y0j + b2 * He)
+            return [(pt11, pt12), (pt21, pt22)]
+        return []
+
+
+try:
+    import matplotlib
+
+    matplotlib.use("Agg")
+    DensePoseResultsContourVisualizer = DensePoseResultsMplContourVisualizer
+except ModuleNotFoundError:
+    logger = logging.getLogger(__name__)
+    logger.warning("Could not import matplotlib, using custom contour visualizer")
+    DensePoseResultsContourVisualizer = DensePoseResultsCustomContourVisualizer
+
+
+class DensePoseResultsFineSegmentationVisualizer(DensePoseMaskedColormapResultsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        super(DensePoseResultsFineSegmentationVisualizer, self).__init__(
+            _extract_i_from_iuvarr,
+            _extract_i_from_iuvarr,
+            inplace,
+            cmap,
+            alpha,
+            val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS,
+        )
+
+
+class DensePoseResultsUVisualizer(DensePoseMaskedColormapResultsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        super(DensePoseResultsUVisualizer, self).__init__(
+            _extract_u_from_iuvarr, _extract_i_from_iuvarr, inplace, cmap, alpha, val_scale=1.0
+        )
+
+
+class DensePoseResultsVVisualizer(DensePoseMaskedColormapResultsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        super(DensePoseResultsVVisualizer, self).__init__(
+            _extract_v_from_iuvarr, _extract_i_from_iuvarr, inplace, cmap, alpha, val_scale=1.0
+        )
+
+
+class DensePoseOutputsFineSegmentationVisualizer(object):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace,
+            cmap=cmap,
+            val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS,
+            alpha=alpha,
+        )
+
+    def visualize(
+        self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]]
+    ) -> Image:
+        if dp_output_with_bboxes is None:
+            return image_bgr
+        densepose_output, bboxes_xywh = dp_output_with_bboxes
+        S = densepose_output.S
+        I = densepose_output.I  # noqa
+        U = densepose_output.U
+        V = densepose_output.V
+        N = S.size(0)
+        assert N == I.size(
+            0
+        ), "densepose outputs S {} and I {}" " should have equal first dim size".format(
+            S.size(), I.size()
+        )
+        assert N == U.size(
+            0
+        ), "densepose outputs S {} and U {}" " should have equal first dim size".format(
+            S.size(), U.size()
+        )
+        assert N == V.size(
+            0
+        ), "densepose outputs S {} and V {}" " should have equal first dim size".format(
+            S.size(), V.size()
+        )
+        assert N == len(
+            bboxes_xywh
+        ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
+            len(bboxes_xywh), N
+        )
+        for n in range(N):
+            Sn = S[n].argmax(dim=0)
+            In = I[n].argmax(dim=0) * (Sn > 0).long()
+            matrix = In.cpu().numpy().astype(np.uint8)
+            mask = np.zeros(matrix.shape, dtype=np.uint8)
+            mask[matrix > 0] = 1
+            bbox_xywh = bboxes_xywh[n]
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)
+        return image_bgr
+
+
+class DensePoseOutputsUVisualizer(object):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha
+        )
+
+    def visualize(
+        self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]]
+    ) -> Image:
+        if dp_output_with_bboxes is None:
+            return image_bgr
+        densepose_output, bboxes_xywh = dp_output_with_bboxes
+        assert isinstance(
+            densepose_output, DensePoseOutput
+        ), "DensePoseOutput expected, {} encountered".format(type(densepose_output))
+        S = densepose_output.S
+        I = densepose_output.I  # noqa
+        U = densepose_output.U
+        V = densepose_output.V
+        N = S.size(0)
+        assert N == I.size(
+            0
+        ), "densepose outputs S {} and I {}" " should have equal first dim size".format(
+            S.size(), I.size()
+        )
+        assert N == U.size(
+            0
+        ), "densepose outputs S {} and U {}" " should have equal first dim size".format(
+            S.size(), U.size()
+        )
+        assert N == V.size(
+            0
+        ), "densepose outputs S {} and V {}" " should have equal first dim size".format(
+            S.size(), V.size()
+        )
+        assert N == len(
+            bboxes_xywh
+        ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
+            len(bboxes_xywh), N
+        )
+        for n in range(N):
+            Sn = S[n].argmax(dim=0)
+            In = I[n].argmax(dim=0) * (Sn > 0).long()
+            segmentation = In.cpu().numpy().astype(np.uint8)
+            mask = np.zeros(segmentation.shape, dtype=np.uint8)
+            mask[segmentation > 0] = 1
+            Un = U[n].cpu().numpy().astype(np.float32)
+            Uvis = np.zeros(segmentation.shape, dtype=np.float32)
+            for partId in range(Un.shape[0]):
+                Uvis[segmentation == partId] = Un[partId][segmentation == partId].clip(0, 1) * 255
+                bbox_xywh = bboxes_xywh[n]
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, Uvis, bbox_xywh)
+        return image_bgr
+
+
+class DensePoseOutputsVVisualizer(object):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha
+        )
+
+    def visualize(
+        self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]]
+    ) -> Image:
+        if dp_output_with_bboxes is None:
+            return image_bgr
+        densepose_output, bboxes_xywh = dp_output_with_bboxes
+        assert isinstance(
+            densepose_output, DensePoseOutput
+        ), "DensePoseOutput expected, {} encountered".format(type(densepose_output))
+        S = densepose_output.S
+        I = densepose_output.I  # noqa
+        U = densepose_output.U
+        V = densepose_output.V
+        N = S.size(0)
+        assert N == I.size(
+            0
+        ), "densepose outputs S {} and I {}" " should have equal first dim size".format(
+            S.size(), I.size()
+        )
+        assert N == U.size(
+            0
+        ), "densepose outputs S {} and U {}" " should have equal first dim size".format(
+            S.size(), U.size()
+        )
+        assert N == V.size(
+            0
+        ), "densepose outputs S {} and V {}" " should have equal first dim size".format(
+            S.size(), V.size()
+        )
+        assert N == len(
+            bboxes_xywh
+        ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
+            len(bboxes_xywh), N
+        )
+        for n in range(N):
+            Sn = S[n].argmax(dim=0)
+            In = I[n].argmax(dim=0) * (Sn > 0).long()
+            segmentation = In.cpu().numpy().astype(np.uint8)
+            mask = np.zeros(segmentation.shape, dtype=np.uint8)
+            mask[segmentation > 0] = 1
+            Vn = V[n].cpu().numpy().astype(np.float32)
+            Vvis = np.zeros(segmentation.shape, dtype=np.float32)
+            for partId in range(Vn.size(0)):
+                Vvis[segmentation == partId] = Vn[partId][segmentation == partId].clip(0, 1) * 255
+            bbox_xywh = bboxes_xywh[n]
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, Vvis, bbox_xywh)
+        return image_bgr
+
+
+class DensePoseDataCoarseSegmentationVisualizer(object):
+    """
+    Visualizer for ground truth segmentation
+    """
+
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace,
+            cmap=cmap,
+            val_scale=255.0 / DensePoseDataRelative.N_BODY_PARTS,
+            alpha=alpha,
+        )
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
+    ) -> Image:
+        if bbox_densepose_datas is None:
+            return image_bgr
+        for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
+            matrix = densepose_data.segm.numpy()
+            mask = np.zeros(matrix.shape, dtype=np.uint8)
+            mask[matrix > 0] = 1
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh.numpy())
+        return image_bgr
+
+
+class DensePoseDataPointsVisualizer(object):
+    def __init__(self, densepose_data_to_value_fn=None, cmap=cv2.COLORMAP_PARULA):
+        self.points_visualizer = PointsVisualizer()
+        self.densepose_data_to_value_fn = densepose_data_to_value_fn
+        self.cmap = cmap
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
+    ) -> Image:
+        if bbox_densepose_datas is None:
+            return image_bgr
+        for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
+            x0, y0, w, h = bbox_xywh.numpy()
+            x = densepose_data.x.numpy() * w / 255.0 + x0
+            y = densepose_data.y.numpy() * h / 255.0 + y0
+            pts_xy = zip(x, y)
+            if self.densepose_data_to_value_fn is None:
+                image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy)
+            else:
+                v = self.densepose_data_to_value_fn(densepose_data)
+                img_colors_bgr = cv2.applyColorMap(v, self.cmap)
+                colors_bgr = [
+                    [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
+                ]
+                image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy, colors_bgr)
+        return image_bgr
+
+
+def _densepose_data_u_for_cmap(densepose_data):
+    u = np.clip(densepose_data.u.numpy(), 0, 1) * 255.0
+    return u.astype(np.uint8)
+
+
+def _densepose_data_v_for_cmap(densepose_data):
+    v = np.clip(densepose_data.v.numpy(), 0, 1) * 255.0
+    return v.astype(np.uint8)
+
+
+def _densepose_data_i_for_cmap(densepose_data):
+    i = (
+        np.clip(densepose_data.i.numpy(), 0.0, DensePoseDataRelative.N_PART_LABELS)
+        * 255.0
+        / DensePoseDataRelative.N_PART_LABELS
+    )
+    return i.astype(np.uint8)
+
+
+class DensePoseDataPointsUVisualizer(DensePoseDataPointsVisualizer):
+    def __init__(self):
+        super(DensePoseDataPointsUVisualizer, self).__init__(
+            densepose_data_to_value_fn=_densepose_data_u_for_cmap
+        )
+
+
+class DensePoseDataPointsVVisualizer(DensePoseDataPointsVisualizer):
+    def __init__(self):
+        super(DensePoseDataPointsVVisualizer, self).__init__(
+            densepose_data_to_value_fn=_densepose_data_v_for_cmap
+        )
+
+
+class DensePoseDataPointsIVisualizer(DensePoseDataPointsVisualizer):
+    def __init__(self):
+        super(DensePoseDataPointsIVisualizer, self).__init__(
+            densepose_data_to_value_fn=_densepose_data_i_for_cmap
+        )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/extractor.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b715a4451e096d6d6c086f9bcf60f92d2ae692f8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/densepose/vis/extractor.py
@@ -0,0 +1,152 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+from typing import Sequence
+import torch
+
+from detectron2.layers.nms import batched_nms
+from detectron2.structures.instances import Instances
+
+from densepose.vis.bounding_box import BoundingBoxVisualizer, ScoredBoundingBoxVisualizer
+from densepose.vis.densepose import DensePoseResultsVisualizer
+
+from .base import CompoundVisualizer
+
+Scores = Sequence[float]
+
+
+def extract_scores_from_instances(instances: Instances, select=None):
+    if instances.has("scores"):
+        return instances.scores if select is None else instances.scores[select]
+    return None
+
+
+def extract_boxes_xywh_from_instances(instances: Instances, select=None):
+    if instances.has("pred_boxes"):
+        boxes_xywh = instances.pred_boxes.tensor.clone()
+        boxes_xywh[:, 2] -= boxes_xywh[:, 0]
+        boxes_xywh[:, 3] -= boxes_xywh[:, 1]
+        return boxes_xywh if select is None else boxes_xywh[select]
+    return None
+
+
+def create_extractor(visualizer: object):
+    """
+    Create an extractor for the provided visualizer
+    """
+    if isinstance(visualizer, CompoundVisualizer):
+        extractors = [create_extractor(v) for v in visualizer.visualizers]
+        return CompoundExtractor(extractors)
+    elif isinstance(visualizer, DensePoseResultsVisualizer):
+        return DensePoseResultExtractor()
+    elif isinstance(visualizer, ScoredBoundingBoxVisualizer):
+        return CompoundExtractor([extract_boxes_xywh_from_instances, extract_scores_from_instances])
+    elif isinstance(visualizer, BoundingBoxVisualizer):
+        return extract_boxes_xywh_from_instances
+    else:
+        logger = logging.getLogger(__name__)
+        logger.error(f"Could not create extractor for {visualizer}")
+        return None
+
+
+class BoundingBoxExtractor(object):
+    """
+    Extracts bounding boxes from instances
+    """
+
+    def __call__(self, instances: Instances):
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        return boxes_xywh
+
+
+class ScoredBoundingBoxExtractor(object):
+    """
+    Extracts bounding boxes from instances
+    """
+
+    def __call__(self, instances: Instances, select=None):
+        scores = extract_scores_from_instances(instances)
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        if (scores is None) or (boxes_xywh is None):
+            return (boxes_xywh, scores)
+        if select is not None:
+            scores = scores[select]
+            boxes_xywh = boxes_xywh[select]
+        return (boxes_xywh, scores)
+
+
+class DensePoseResultExtractor(object):
+    """
+    Extracts DensePose result from instances
+    """
+
+    def __call__(self, instances: Instances, select=None):
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        if instances.has("pred_densepose") and (boxes_xywh is not None):
+            dpout = instances.pred_densepose
+            if select is not None:
+                dpout = dpout[select]
+                boxes_xywh = boxes_xywh[select]
+            return dpout.to_result(boxes_xywh)
+        else:
+            return None
+
+
+class CompoundExtractor(object):
+    """
+    Extracts data for CompoundVisualizer
+    """
+
+    def __init__(self, extractors):
+        self.extractors = extractors
+
+    def __call__(self, instances: Instances, select=None):
+        datas = []
+        for extractor in self.extractors:
+            data = extractor(instances, select)
+            datas.append(data)
+        return datas
+
+
+class NmsFilteredExtractor(object):
+    """
+    Extracts data in the format accepted by NmsFilteredVisualizer
+    """
+
+    def __init__(self, extractor, iou_threshold):
+        self.extractor = extractor
+        self.iou_threshold = iou_threshold
+
+    def __call__(self, instances: Instances, select=None):
+        scores = extract_scores_from_instances(instances)
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        if boxes_xywh is None:
+            return None
+        select_local_idx = batched_nms(
+            boxes_xywh,
+            scores,
+            torch.zeros(len(scores), dtype=torch.int32),
+            iou_threshold=self.iou_threshold,
+        ).squeeze()
+        select_local = torch.zeros(len(boxes_xywh), dtype=torch.bool, device=boxes_xywh.device)
+        select_local[select_local_idx] = True
+        select = select_local if select is None else (select & select_local)
+        return self.extractor(instances, select=select)
+
+
+class ScoreThresholdedExtractor(object):
+    """
+    Extracts data in the format accepted by ScoreThresholdedVisualizer
+    """
+
+    def __init__(self, extractor, min_score):
+        self.extractor = extractor
+        self.min_score = min_score
+
+    def __call__(self, instances: Instances, select=None):
+        scores = extract_scores_from_instances(instances)
+        if scores is None:
+            return None
+        select_local = scores > self.min_score
+        select = select_local if select is None else (select & select_local)
+        data = self.extractor(instances, select=select)
+        return data
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/dev/README.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/dev/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e3a94b67ed4b4d0c2934f074802cd00f3660f9a9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/dev/README.md
@@ -0,0 +1,7 @@
+
+## Some scripts for developers to use, include:
+
+- `run_instant_tests.sh`: run training for a few iterations.
+- `run_inference_tests.sh`: run inference on a small dataset.
+- `../../dev/linter.sh`: lint the codebase before commit
+- `../../dev/parse_results.sh`: parse results from log file.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/dev/run_inference_tests.sh b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/dev/run_inference_tests.sh
new file mode 100644
index 0000000000000000000000000000000000000000..34f47d5a07a90c411e830c98a346845fa618f836
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/dev/run_inference_tests.sh
@@ -0,0 +1,33 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+BIN="python train_net.py"
+OUTPUT="inference_test_output"
+NUM_GPUS=2
+IMS_PER_GPU=2
+IMS_PER_BATCH=$(( NUM_GPUS * IMS_PER_GPU ))
+
+CFG_LIST=( "${@:1}" )
+
+if [ ${#CFG_LIST[@]} -eq 0 ]; then
+  CFG_LIST=( ./configs/quick_schedules/*inference_acc_test.yaml )
+fi
+
+echo "========================================================================"
+echo "Configs to run:"
+echo "${CFG_LIST[@]}"
+echo "========================================================================"
+
+for cfg in "${CFG_LIST[@]}"; do
+    echo "========================================================================"
+    echo "Running $cfg ..."
+    echo "========================================================================"
+    $BIN \
+      --eval-only \
+      --num-gpus $NUM_GPUS \
+      --config-file "$cfg" \
+      OUTPUT_DIR "$OUTPUT" \
+      SOLVER.IMS_PER_BATCH $IMS_PER_BATCH
+    rm -rf $OUTPUT
+done
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/dev/run_instant_tests.sh b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/dev/run_instant_tests.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a53785180974a70bce7fdb0c9da4024166efd596
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/dev/run_instant_tests.sh
@@ -0,0 +1,28 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+BIN="python train_net.py"
+OUTPUT="instant_test_output"
+NUM_GPUS=2
+SOLVER_IMS_PER_BATCH=$((NUM_GPUS * 2))
+
+CFG_LIST=( "${@:1}" )
+if [ ${#CFG_LIST[@]} -eq 0 ]; then
+  CFG_LIST=( ./configs/quick_schedules/*instant_test.yaml )
+fi
+
+echo "========================================================================"
+echo "Configs to run:"
+echo "${CFG_LIST[@]}"
+echo "========================================================================"
+
+for cfg in "${CFG_LIST[@]}"; do
+    echo "========================================================================"
+    echo "Running $cfg ..."
+    echo "========================================================================"
+    $BIN --num-gpus $NUM_GPUS --config-file "$cfg" \
+      SOLVER.IMS_PER_BATCH $SOLVER_IMS_PER_BATCH \
+      OUTPUT_DIR "$OUTPUT"
+    rm -rf "$OUTPUT"
+done
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/GETTING_STARTED.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/GETTING_STARTED.md
new file mode 100644
index 0000000000000000000000000000000000000000..a6bcbedee42835c99fa5aa1110309329dfbff6f0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/GETTING_STARTED.md
@@ -0,0 +1,58 @@
+# Getting Started with DensePose
+
+## Inference with Pre-trained Models
+
+1. Pick a model and its config file from [Model Zoo](MODEL_ZOO.md), for example [densepose_rcnn_R_50_FPN_s1x.yaml](../configs/densepose_rcnn_R_50_FPN_s1x.yaml)
+2. Run the [Apply Net](TOOL_APPLY_NET.md) tool to visualize the results or save the to disk. For example, to use contour visualization for DensePose, one can run:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml densepose_rcnn_R_50_FPN_s1x.pkl image.jpg dp_contour,bbox --output image_densepose_contour.png
+```
+Please see [Apply Net](TOOL_APPLY_NET.md) for more details on the tool.
+
+## Training
+
+First, prepare the [dataset](http://densepose.org/#dataset) into the following structure under the directory you'll run training scripts:
+<pre>
+datasets/coco/
+  annotations/
+    densepose_{train,minival,valminusminival}2014.json
+    <a href="https://dl.fbaipublicfiles.com/detectron2/densepose/densepose_minival2014_100.json">densepose_minival2014_100.json </a>  (optional, for testing only)
+  {train,val}2014/
+    # image files that are mentioned in the corresponding json
+</pre>
+
+To train a model one can use the [train_net.py](../train_net.py) script.
+This script was used to train all DensePose models in [Model Zoo](MODEL_ZOO.md).
+For example, to launch end-to-end DensePose-RCNN training with ResNet-50 FPN backbone
+on 8 GPUs following the s1x schedule, one can run
+```bash
+python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml --num-gpus 8
+```
+The configs are made for 8-GPU training. To train on 1 GPU, one can apply the
+[linear learning rate scaling rule](https://arxiv.org/abs/1706.02677):
+```bash
+python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml \
+    SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
+```
+
+## Evaluation
+
+Model testing can be done in the same way as training, except for an additional flag `--eval-only` and
+model location specification through `MODEL.WEIGHTS model.pth` in the command line
+```bash
+python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml \
+    --eval-only MODEL.WEIGHTS model.pth
+```
+
+## Tools
+
+We provide tools which allow one to:
+ - easily view DensePose annotated data in a dataset;
+ - perform DensePose inference on a set of images;
+ - visualize DensePose model results;
+
+`query_db` is a tool to print or visualize DensePose data in a dataset.
+Please refer to [Query DB](TOOL_QUERY_DB.md) for more details on this tool
+
+`apply_net` is a tool to print or visualize DensePose results.
+Please refer to [Apply Net](TOOL_APPLY_NET.md) for more details on this tool
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/MODEL_ZOO.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/MODEL_ZOO.md
new file mode 100644
index 0000000000000000000000000000000000000000..c26308417de03efea3872b44fec43c74ead529e9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/MODEL_ZOO.md
@@ -0,0 +1,277 @@
+# Model Zoo and Baselines
+
+# Introduction
+
+We provide baselines trained with Detectron2 DensePose. The corresponding
+configuration files can be found in the [configs](../configs) directory.
+All models were trained on COCO `train2014` + `valminusminival2014` and
+evaluated on COCO `minival2014`. For the details on common settings in which
+baselines were trained, please check [Detectron 2 Model Zoo](../../../MODEL_ZOO.md).
+
+## License
+
+All models available for download through this document are licensed under the
+[Creative Commons Attribution-ShareAlike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/)
+
+## COCO DensePose Baselines with DensePose-RCNN
+
+### Legacy Models
+
+Baselines trained using schedules from [Güler et al, 2018](https://arxiv.org/pdf/1802.00434.pdf)
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">dp. AP<br/>GPS</th>
+<th valign="bottom">dp. AP<br/>GPSm</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: densepose_rcnn_R_50_FPN_s1x_legacy -->
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml">R_50_FPN_s1x_legacy</a></td>
+ <td align="center">s1x</td>
+ <td align="center">0.307</td>
+ <td align="center">0.051</td>
+ <td align="center">3.2</td>
+ <td align="center">58.1</td>
+ <td align="center">52.1</td>
+ <td align="center">54.9</td>
+ <td align="center">164832157</td>
+ <td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x_legacy/164832157/model_final_d366fa.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x_legacy/164832157/metrics.json">metrics</a></td>
+ </tr>
+ <!-- ROW: densepose_rcnn_R_101_FPN_s1x_legacy -->
+  <tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml">R_101_FPN_s1x_legacy</a></td>
+  <td align="center">s1x</td>
+  <td align="center">0.390</td>
+  <td align="center">0.063</td>
+  <td align="center">4.3</td>
+  <td align="center">59.5</td>
+  <td align="center">53.2</td>
+  <td align="center">56.1</td>
+  <td align="center">164832182</td>
+  <td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x_legacy/164832182/model_final_10af0e.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x_legacy/164832182/metrics.json">metrics</a></td>
+  </tr>
+</tbody></table>
+
+### Improved Baselines, Original Fully Convolutional Haad
+
+These models use an improved training schedule and Panoptic FPN head from [Kirillov et al, 2019](https://arxiv.org/abs/1901.02446).
+
+<table><tbody>
+  <!-- START TABLE -->
+  <!-- TABLE HEADER -->
+  <th valign="bottom">Name</th>
+  <th valign="bottom">lr<br/>sched</th>
+  <th valign="bottom">train<br/>time<br/>(s/iter)</th>
+  <th valign="bottom">inference<br/>time<br/>(s/im)</th>
+  <th valign="bottom">train<br/>mem<br/>(GB)</th>
+  <th valign="bottom">box<br/>AP</th>
+  <th valign="bottom">dp. AP<br/>GPS</th>
+  <th valign="bottom">dp. AP<br/>GPSm</th>
+  <th valign="bottom">model id</th>
+  <th valign="bottom">download</th>
+  <!-- TABLE BODY -->
+  <!-- ROW: densepose_rcnn_R_50_FPN_s1x -->
+   <tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_s1x.yaml">R_50_FPN_s1x</a></td>
+   <td align="center">s1x</td>
+   <td align="center">0.359</td>
+   <td align="center">0.066</td>
+   <td align="center">4.5</td>
+   <td align="center">61.2</td>
+   <td align="center">63.7</td>
+   <td align="center">65.3</td>
+   <td align="center">165712039</td>
+   <td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/metrics.json">metrics</a></td>
+   </tr>
+   <!-- ROW: densepose_rcnn_R_101_FPN_s1x -->
+    <tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_s1x.yaml">R_101_FPN_s1x</a></td>
+    <td align="center">s1x</td>
+    <td align="center">0.428</td>
+    <td align="center">0.079</td>
+    <td align="center">5.8</td>
+    <td align="center">62.3</td>
+    <td align="center">64.5</td>
+    <td align="center">66.4</td>
+    <td align="center">165712084</td>
+    <td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x/165712084/model_final_c6ab63.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x/165712084/metrics.json">metrics</a></td>
+    </tr>
+    </tbody></table>
+
+### Improved Baselines, DeepLabV3 Head
+
+These models use an improved training schedule, Panoptic FPN head from [Kirillov et al, 2019](https://arxiv.org/abs/1901.02446) and DeepLabV3 head from [Chen et al, 2017](https://arxiv.org/abs/1706.05587).
+
+<table><tbody>
+    <!-- START TABLE -->
+    <!-- TABLE HEADER -->
+    <th valign="bottom">Name</th>
+    <th valign="bottom">lr<br/>sched</th>
+    <th valign="bottom">train<br/>time<br/>(s/iter)</th>
+    <th valign="bottom">inference<br/>time<br/>(s/im)</th>
+    <th valign="bottom">train<br/>mem<br/>(GB)</th>
+    <th valign="bottom">box<br/>AP</th>
+    <th valign="bottom">dp. AP<br/>GPS</th>
+    <th valign="bottom">dp. AP<br/>GPSm</th>
+    <th valign="bottom">model id</th>
+    <th valign="bottom">download</th>
+    <!-- TABLE BODY -->
+    <!-- ROW: densepose_rcnn_R_50_FPN_DL_s1x -->
+     <tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml">R_50_FPN_DL_s1x</a></td>
+     <td align="center">s1x</td>
+     <td align="center">0.392</td>
+     <td align="center">0.070</td>
+     <td align="center">6.7</td>
+     <td align="center">61.1</td>
+     <td align="center">65.6</td>
+     <td align="center">66.8</td>
+     <td align="center">165712097</td>
+     <td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_s1x/165712097/model_final_0ed407.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_s1x/165712097/metrics.json">metrics</a></td>
+     </tr>
+     <!-- ROW: densepose_rcnn_R_101_FPN_DL_s1x -->
+      <tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml">R_101_FPN_DL_s1x</a></td>
+      <td align="center">s1x</td>
+      <td align="center">0.478</td>
+      <td align="center">0.083</td>
+      <td align="center">7.0</td>
+      <td align="center">62.3</td>
+      <td align="center">66.3</td>
+      <td align="center">67.7</td>
+      <td align="center">165712116</td>
+      <td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_s1x/165712116/model_final_844d15.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_s1x/165712116/metrics.json">metrics</a></td>
+      </tr>
+</tbody></table>
+
+### Baselines with Confidence Estimation
+
+These models perform additional estimation of confidence in regressed UV coodrinates, along the lines of [Neverova et al., 2019](https://papers.nips.cc/paper/8378-correlated-uncertainty-for-learning-dense-correspondences-from-noisy-labels).
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">dp. AP<br/>GPS</th>
+<th valign="bottom">dp. AP<br/>GPSm</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY --> 
+<!-- ROW: densepose_rcnn_R_50_FPN_WC1_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml">R_50_FPN_WC1_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.353</td>
+<td align="center">0.064</td>
+<td align="center">4.6</td>
+<td align="center">60.5</td>
+<td align="center">64.2</td>
+<td align="center">65.6</td>
+<td align="center">173862049</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_WC1_s1x/173862049/model_final_289019.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_WC1_s1x/173862049/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: densepose_rcnn_R_50_FPN_WC2_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml">R_50_FPN_WC2_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.364</td>
+<td align="center">0.066</td>
+<td align="center">4.8</td>
+<td align="center">60.7</td>
+<td align="center">64.2</td>
+<td align="center">65.7</td>
+<td align="center">173861455</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_WC2_s1x/173861455/model_final_3abe14.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_WC2_s1x/173861455/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: densepose_rcnn_R_50_FPN_DL_WC1_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml">R_50_FPN_DL_WC1_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.397</td>
+<td align="center">0.068</td>
+<td align="center">6.7</td>
+<td align="center">61.1</td>
+<td align="center">65.8</td>
+<td align="center">67.1</td>
+<td align="center">173067973</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_WC1_s1x/173067973/model_final_b1e525.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_WC1_s1x/173067973/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: densepose_rcnn_R_50_FPN_DL_WC2_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml">R_50_FPN_DL_WC2_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.410</td>
+<td align="center">0.070</td>
+<td align="center">6.8</td>
+<td align="center">60.8</td>
+<td align="center">65.6</td>
+<td align="center">66.7</td>
+<td align="center">173859335</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_WC2_s1x/173859335/model_final_60fed4.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_WC2_s1x/173859335/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: densepose_rcnn_R_101_FPN_WC1_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml">R_101_FPN_WC1_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.435</td>
+<td align="center">0.076</td>
+<td align="center">5.7</td>
+<td align="center">62.5</td>
+<td align="center">64.9</td>
+<td align="center">66.5</td>
+<td align="center">171402969</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_WC1_s1x/171402969/model_final_9e47f0.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_WC1_s1x/171402969/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: densepose_rcnn_R_101_FPN_WC2_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml">R_101_FPN_WC2_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.450</td>
+<td align="center">0.078</td>
+<td align="center">5.7</td>
+<td align="center">62.3</td>
+<td align="center">64.8</td>
+<td align="center">66.6</td>
+<td align="center">173860702</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_WC2_s1x/173860702/model_final_5ea023.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_WC2_s1x/173860702/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: densepose_rcnn_R_101_FPN_DL_WC1_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml">R_101_FPN_DL_WC1_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.479</td>
+<td align="center">0.081</td>
+<td align="center">7.9</td>
+<td align="center">62.0</td>
+<td align="center">66.2</td>
+<td align="center">67.4</td>
+<td align="center">173858525</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_WC1_s1x/173858525/model_final_f359f3.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_WC1_s1x/173858525/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: densepose_rcnn_R_101_FPN_DL_WC2_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml">R_101_FPN_DL_WC2_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.491</td>
+<td align="center">0.082</td>
+<td align="center">7.6</td>
+<td align="center">61.7</td>
+<td align="center">65.9</td>
+<td align="center">67.3</td>
+<td align="center">173294801</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_WC2_s1x/173294801/model_final_6e1ed1.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_WC2_s1x/173294801/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+## Old Baselines
+
+It is still possible to use some baselines from [DensePose 1](https://github.com/facebookresearch/DensePose).
+Below are evaluation metrics for the baselines recomputed in the current framework:
+
+| Model | bbox AP | AP  |  AP50 | AP75  | APm  |APl |
+|-----|-----|-----|---    |---    |---   |--- |
+| [`ResNet50_FPN_s1x-e2e`](https://dl.fbaipublicfiles.com/densepose/DensePose_ResNet50_FPN_s1x-e2e.pkl) | 54.673 | 48.894 | 84.963 | 50.717 | 43.132 | 50.433 |
+| [`ResNet101_FPN_s1x-e2e`](https://dl.fbaipublicfiles.com/densepose/DensePose_ResNet101_FPN_s1x-e2e.pkl) | 56.032 | 51.088 | 86.250 | 55.057 | 46.542 | 52.563 |
+
+Note: these scores are close, but not strictly equal to the ones reported in the [DensePose 1 Model Zoo](https://github.com/facebookresearch/DensePose/blob/master/MODEL_ZOO.md),
+which is due to small incompatibilities between the frameworks.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/TOOL_APPLY_NET.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/TOOL_APPLY_NET.md
new file mode 100644
index 0000000000000000000000000000000000000000..f5cf2579a83811e4b192b3688f241b570f62bcb5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/TOOL_APPLY_NET.md
@@ -0,0 +1,130 @@
+# Apply Net
+
+`apply_net` is a tool to print or visualize DensePose results on a set of images.
+It has two modes: `dump` to save DensePose model results to a pickle file
+and `show` to visualize them on images.
+
+## Dump Mode
+
+The general command form is:
+```bash
+python apply_net.py dump [-h] [-v] [--output <dump_file>] <config> <model> <input>
+```
+
+There are three mandatory arguments:
+ - `<config>`, configuration file for a given model;
+ - `<model>`, model file with trained parameters
+ - `<input>`, input image file name, pattern or folder
+
+One can additionally provide `--output` argument to define the output file name,
+which defaults to `output.pkl`.
+
+
+Examples:
+
+1. Dump results of a DensePose model with ResNet-50 FPN backbone for images
+   in a folder `images` to file `dump.pkl`:
+```bash
+python apply_net.py dump configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl images --output dump.pkl -v
+```
+
+2. Dump results of a DensePose model with ResNet-50 FPN backbone for images
+   with file name matching a pattern `image*.jpg` to file `results.pkl`:
+```bash
+python apply_net.py dump configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl "image*.jpg" --output results.pkl -v
+```
+
+If you want to load the pickle file generated by the above command:
+```
+# make sure DensePose is in your PYTHONPATH, or use the following line to add it:
+sys.path.append("/your_detectron2_path/detectron2_repo/projects/DensePose/")
+
+f = open('/your_result_path/results.pkl', 'rb')
+data = pickle.load(f)
+```
+
+The file `results.pkl` contains the list of results per image, for each image the result is a dictionary:
+```
+data: [{'file_name': '/your_path/image1.jpg',
+        'scores': tensor([0.9884]),
+        'pred_boxes_XYXY': tensor([[ 69.6114,   0.0000, 706.9797, 706.0000]]),
+        'pred_densepose': <densepose.structures.DensePoseResult object at 0x7f791b312470>},
+       {'file_name': '/your_path/image2.jpg',
+        'scores': tensor([0.9999, 0.5373, 0.3991]),
+        'pred_boxes_XYXY': tensor([[ 59.5734,   7.7535, 579.9311, 932.3619],
+                                   [612.9418, 686.1254, 612.9999, 704.6053],
+                                   [164.5081, 407.4034, 598.3944, 920.4266]]),
+        'pred_densepose': <densepose.structures.DensePoseResult object at 0x7f7071229be0>}]
+```
+
+We can use the following code, to parse the outputs of the first
+detected instance on the first image.
+```
+img_id, instance_id = 0, 0  # Look at the first image and the first detected instance
+bbox_xyxy = data[img_id]['pred_boxes_XYXY'][instance_id]
+result_encoded = data[img_id]['pred_densepose'].results[instance_id]
+iuv_arr = DensePoseResult.decode_png_data(*result_encoded)
+```
+The array `bbox_xyxy` contains (x0, y0, x1, y1) of the bounding box.
+
+The shape of `iuv_arr` is `[3, H, W]`, where (H, W) is the shape of the bounding box.
+- `iuv_arr[0,:,:]`: The patch index of image points, indicating which of the 24 surface patches the point is on.
+- `iuv_arr[1,:,:]`: The U-coordinate value of image points.
+- `iuv_arr[2,:,:]`: The V-coordinate value of image points.
+
+
+## Visualization Mode
+
+The general command form is:
+```bash
+python apply_net.py show [-h] [-v] [--min_score <score>] [--nms_thresh <threshold>] [--output <image_file>] <config> <model> <input> <visualizations>
+```
+
+There are four mandatory arguments:
+ - `<config>`, configuration file for a given model;
+ - `<model>`, model file with trained parameters
+ - `<input>`, input image file name, pattern or folder
+ - `<visualizations>`, visualizations specifier; currently available visualizations are:
+   * `bbox` - bounding boxes of detected persons;
+   * `dp_segm` - segmentation masks for detected persons;
+   * `dp_u` - each body part is colored according to the estimated values of the
+     U coordinate in part parameterization;
+   * `dp_v` - each body part is colored according to the estimated values of the
+     V coordinate in part parameterization;
+   * `dp_contour` - plots contours with color-coded U and V coordinates
+
+
+One can additionally provide the following optional arguments:
+ - `--min_score` to only show detections with sufficient scores that are not lower than provided value
+ - `--nms_thresh` to additionally apply non-maximum suppression to detections at a given threshold
+ - `--output` to define visualization file name template, which defaults to `output.png`.
+   To distinguish output file names for different images, the tool appends 1-based entry index,
+   e.g. output.0001.png, output.0002.png, etc...
+
+
+The following examples show how to output results of a DensePose model
+with ResNet-50 FPN backbone using different visualizations for image `image.jpg`:
+
+1. Show bounding box and segmentation:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_segm -v
+```
+![Bounding Box + Segmentation Visualization](images/res_bbox_dp_segm.jpg)
+
+2. Show bounding box and estimated U coordinates for body parts:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_u -v
+```
+![Bounding Box + U Coordinate Visualization](images/res_bbox_dp_u.jpg)
+
+3. Show bounding box and estimated V coordinates for body parts:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_v -v
+```
+![Bounding Box + V Coordinate Visualization](images/res_bbox_dp_v.jpg)
+
+4. Show bounding box and estimated U and V coordinates via contour plots:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg dp_contour,bbox -v
+```
+![Bounding Box + Contour Visualization](images/res_bbox_dp_contour.jpg)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/TOOL_QUERY_DB.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/TOOL_QUERY_DB.md
new file mode 100644
index 0000000000000000000000000000000000000000..b0a764b8740597c6af634127b80b53d28913726f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/doc/TOOL_QUERY_DB.md
@@ -0,0 +1,105 @@
+
+# Query Dataset
+
+`query_db` is a tool to print or visualize DensePose data from a dataset.
+It has two modes: `print` and `show` to output dataset entries to standard
+output or to visualize them on images.
+
+## Print Mode
+
+The general command form is:
+```bash
+python query_db.py print [-h] [-v] [--max-entries N] <dataset> <selector>
+```
+
+There are two mandatory arguments:
+ - `<dataset>`, DensePose dataset specification, from which to select
+   the entries (e.g. `densepose_coco_2014_train`).
+ - `<selector>`, dataset entry selector which can be a single specification,
+   or a comma-separated list of specifications of the form
+   `field[:type]=value` for exact match with the value
+   or `field[:type]=min-max` for a range of values
+
+One can additionally limit the maximum number of entries to output
+by providing `--max-entries` argument.
+
+Examples:
+
+1. Output at most 10 first entries from the `densepose_coco_2014_train` dataset:
+```bash
+python query_db.py print densepose_coco_2014_train \* --max-entries 10 -v
+```
+
+2. Output all entries with `file_name` equal to `COCO_train2014_000000000036.jpg`: 
+```bash
+python query_db.py print densepose_coco_2014_train file_name=COCO_train2014_000000000036.jpg -v
+```
+
+3. Output all entries with `image_id` between 36 and 156:
+```bash
+python query_db.py print densepose_coco_2014_train image_id:int=36-156 -v
+```
+
+## Visualization Mode
+
+The general command form is:
+```bash
+python query_db.py show [-h] [-v] [--max-entries N] [--output <image_file>] <dataset> <selector> <visualizations>
+```
+
+There are three mandatory arguments:
+ - `<dataset>`, DensePose dataset specification, from which to select
+   the entries (e.g. `densepose_coco_2014_train`).
+ - `<selector>`, dataset entry selector which can be a single specification,
+   or a comma-separated list of specifications of the form
+   `field[:type]=value` for exact match with the value
+   or `field[:type]=min-max` for a range of values
+ - `<visualizations>`, visualizations specifier; currently available visualizations are:
+   * `bbox` - bounding boxes of annotated persons;
+   * `dp_i` - annotated points colored according to the containing part;
+   * `dp_pts` - annotated points in green color;
+   * `dp_segm` - segmentation masks for annotated persons;
+   * `dp_u` - annotated points colored according to their U coordinate in part parameterization;
+   * `dp_v` - annotated points colored according to their V coordinate in part parameterization;
+
+One can additionally provide one of the two optional arguments:
+ - `--max_entries` to limit the maximum number of entries to visualize
+ - `--output` to provide visualization file name template, which defaults
+   to `output.png`. To distinguish file names for different dataset
+   entries, the tool appends 1-based entry index to the output file name,
+   e.g. output.0001.png, output.0002.png, etc.
+
+The following examples show how to output different visualizations for image with `id = 322`
+from `densepose_coco_2014_train` dataset:
+
+1. Show bounding box and segmentation:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_segm -v
+```
+![Bounding Box + Segmentation Visualization](images/vis_bbox_dp_segm.jpg)
+
+2. Show bounding box and points colored according to the containing part:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_i -v
+```
+![Bounding Box + Point Label Visualization](images/vis_bbox_dp_i.jpg)
+
+3. Show bounding box and annotated points in green color:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_segm -v
+```
+![Bounding Box + Point Visualization](images/vis_bbox_dp_pts.jpg)
+
+4. Show bounding box and annotated points colored according to their U coordinate in part parameterization:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_u -v
+```
+![Bounding Box + Point U Visualization](images/vis_bbox_dp_u.jpg)
+
+5. Show bounding box and annotated points colored according to their V coordinate in part parameterization:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_v -v
+```
+![Bounding Box + Point V Visualization](images/vis_bbox_dp_v.jpg)
+
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/query_db.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/query_db.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3ea2ffdff7559a8cd78df95a5fb7f308f33e1e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/query_db.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import argparse
+import logging
+import os
+import sys
+from timeit import default_timer as timer
+from typing import Any, ClassVar, Dict, List
+import torch
+from fvcore.common.file_io import PathManager
+
+from detectron2.data.catalog import DatasetCatalog
+from detectron2.utils.logger import setup_logger
+
+from densepose.data.structures import DensePoseDataRelative
+from densepose.utils.dbhelper import EntrySelector
+from densepose.utils.logger import verbosity_to_level
+from densepose.vis.base import CompoundVisualizer
+from densepose.vis.bounding_box import BoundingBoxVisualizer
+from densepose.vis.densepose import (
+    DensePoseDataCoarseSegmentationVisualizer,
+    DensePoseDataPointsIVisualizer,
+    DensePoseDataPointsUVisualizer,
+    DensePoseDataPointsVisualizer,
+    DensePoseDataPointsVVisualizer,
+)
+
+DOC = """Query DB - a tool to print / visualize data from a database
+"""
+
+LOGGER_NAME = "query_db"
+
+logger = logging.getLogger(LOGGER_NAME)
+
+_ACTION_REGISTRY: Dict[str, "Action"] = {}
+
+
+class Action(object):
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        parser.add_argument(
+            "-v",
+            "--verbosity",
+            action="count",
+            help="Verbose mode. Multiple -v options increase the verbosity.",
+        )
+
+
+def register_action(cls: type):
+    """
+    Decorator for action classes to automate action registration
+    """
+    global _ACTION_REGISTRY
+    _ACTION_REGISTRY[cls.COMMAND] = cls
+    return cls
+
+
+class EntrywiseAction(Action):
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(EntrywiseAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "dataset", metavar="<dataset>", help="Dataset name (e.g. densepose_coco_2014_train)"
+        )
+        parser.add_argument(
+            "selector",
+            metavar="<selector>",
+            help="Dataset entry selector in the form field1[:type]=value1[,"
+            "field2[:type]=value_min-value_max...] which selects all "
+            "entries from the dataset that satisfy the constraints",
+        )
+        parser.add_argument(
+            "--max-entries", metavar="N", help="Maximum number of entries to process", type=int
+        )
+
+    @classmethod
+    def execute(cls: type, args: argparse.Namespace):
+        dataset = setup_dataset(args.dataset)
+        entry_selector = EntrySelector.from_string(args.selector)
+        context = cls.create_context(args)
+        if args.max_entries is not None:
+            for _, entry in zip(range(args.max_entries), dataset):
+                if entry_selector(entry):
+                    cls.execute_on_entry(entry, context)
+        else:
+            for entry in dataset:
+                if entry_selector(entry):
+                    cls.execute_on_entry(entry, context)
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
+        context = {}
+        return context
+
+
+@register_action
+class PrintAction(EntrywiseAction):
+    """
+    Print action that outputs selected entries to stdout
+    """
+
+    COMMAND: ClassVar[str] = "print"
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Output selected entries to stdout. ")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(PrintAction, cls).add_arguments(parser)
+
+    @classmethod
+    def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
+        import pprint
+
+        printer = pprint.PrettyPrinter(indent=2, width=200, compact=True)
+        printer.pprint(entry)
+
+
+@register_action
+class ShowAction(EntrywiseAction):
+    """
+    Show action that visualizes selected entries on an image
+    """
+
+    COMMAND: ClassVar[str] = "show"
+    VISUALIZERS: ClassVar[Dict[str, object]] = {
+        "dp_segm": DensePoseDataCoarseSegmentationVisualizer(),
+        "dp_i": DensePoseDataPointsIVisualizer(),
+        "dp_u": DensePoseDataPointsUVisualizer(),
+        "dp_v": DensePoseDataPointsVVisualizer(),
+        "dp_pts": DensePoseDataPointsVisualizer(),
+        "bbox": BoundingBoxVisualizer(),
+    }
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(ShowAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "visualizations",
+            metavar="<visualizations>",
+            help="Comma separated list of visualizations, possible values: "
+            "[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
+        )
+        parser.add_argument(
+            "--output",
+            metavar="<image_file>",
+            default="output.png",
+            help="File name to save output to",
+        )
+
+    @classmethod
+    def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
+        import cv2
+        import numpy as np
+
+        image_fpath = PathManager.get_local_path(entry["file_name"])
+        image = cv2.imread(image_fpath, cv2.IMREAD_GRAYSCALE)
+        image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
+        datas = cls._extract_data_for_visualizers_from_entry(context["vis_specs"], entry)
+        visualizer = context["visualizer"]
+        image_vis = visualizer.visualize(image, datas)
+        entry_idx = context["entry_idx"] + 1
+        out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
+        cv2.imwrite(out_fname, image_vis)
+        logger.info(f"Output saved to {out_fname}")
+        context["entry_idx"] += 1
+
+    @classmethod
+    def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
+        base, ext = os.path.splitext(fname_base)
+        return base + ".{0:04d}".format(entry_idx) + ext
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
+        vis_specs = args.visualizations.split(",")
+        visualizers = []
+        for vis_spec in vis_specs:
+            vis = cls.VISUALIZERS[vis_spec]
+            visualizers.append(vis)
+        context = {
+            "vis_specs": vis_specs,
+            "visualizer": CompoundVisualizer(visualizers),
+            "out_fname": args.output,
+            "entry_idx": 0,
+        }
+        return context
+
+    @classmethod
+    def _extract_data_for_visualizers_from_entry(
+        cls: type, vis_specs: List[str], entry: Dict[str, Any]
+    ):
+        dp_list = []
+        bbox_list = []
+        for annotation in entry["annotations"]:
+            is_valid, _ = DensePoseDataRelative.validate_annotation(annotation)
+            if not is_valid:
+                continue
+            bbox = torch.as_tensor(annotation["bbox"])
+            bbox_list.append(bbox)
+            dp_data = DensePoseDataRelative(annotation)
+            dp_list.append(dp_data)
+        datas = []
+        for vis_spec in vis_specs:
+            datas.append(bbox_list if "bbox" == vis_spec else (bbox_list, dp_list))
+        return datas
+
+
+def setup_dataset(dataset_name):
+    logger.info("Loading dataset {}".format(dataset_name))
+    start = timer()
+    dataset = DatasetCatalog.get(dataset_name)
+    stop = timer()
+    logger.info("Loaded dataset {} in {:.3f}s".format(dataset_name, stop - start))
+    return dataset
+
+
+def create_argument_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description=DOC,
+        formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
+    )
+    parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
+    subparsers = parser.add_subparsers(title="Actions")
+    for _, action in _ACTION_REGISTRY.items():
+        action.add_parser(subparsers)
+    return parser
+
+
+def main():
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    verbosity = args.verbosity if hasattr(args, "verbosity") else None
+    global logger
+    logger = setup_logger(name=LOGGER_NAME)
+    logger.setLevel(verbosity_to_level(verbosity))
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/common.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..13bf0dd3ca113e0756d3023e36272675c6b972f9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/common.py
@@ -0,0 +1,110 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import os
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.engine import default_setup
+from detectron2.modeling import build_model
+
+from densepose import add_dataset_category_config, add_densepose_config
+
+_BASE_CONFIG_DIR = "configs"
+_EVOLUTION_CONFIG_SUB_DIR = "evolution"
+_QUICK_SCHEDULES_CONFIG_SUB_DIR = "quick_schedules"
+_BASE_CONFIG_FILE_PREFIX = "Base-"
+_CONFIG_FILE_EXT = ".yaml"
+
+
+def _get_base_config_dir():
+    """
+    Return the base directory for configurations
+    """
+    return os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", _BASE_CONFIG_DIR)
+
+
+def _get_evolution_config_dir():
+    """
+    Return the base directory for evolution configurations
+    """
+    return os.path.join(_get_base_config_dir(), _EVOLUTION_CONFIG_SUB_DIR)
+
+
+def _get_quick_schedules_config_dir():
+    """
+    Return the base directory for quick schedules configurations
+    """
+    return os.path.join(_get_base_config_dir(), _QUICK_SCHEDULES_CONFIG_SUB_DIR)
+
+
+def _collect_config_files(config_dir):
+    """
+    Collect all configuration files (i.e. densepose_*.yaml) directly in the specified directory
+    """
+    start = _get_base_config_dir()
+    results = []
+    for entry in os.listdir(config_dir):
+        path = os.path.join(config_dir, entry)
+        if not os.path.isfile(path):
+            continue
+        _, ext = os.path.splitext(entry)
+        if ext != _CONFIG_FILE_EXT:
+            continue
+        if entry.startswith(_BASE_CONFIG_FILE_PREFIX):
+            continue
+        config_file = os.path.relpath(path, start)
+        results.append(config_file)
+    return results
+
+
+def get_config_files():
+    """
+    Get all the configuration files (relative to the base configuration directory)
+    """
+    return _collect_config_files(_get_base_config_dir())
+
+
+def get_evolution_config_files():
+    """
+    Get all the evolution configuration files (relative to the base configuration directory)
+    """
+    return _collect_config_files(_get_evolution_config_dir())
+
+
+def get_quick_schedules_config_files():
+    """
+    Get all the quick schedules configuration files (relative to the base configuration directory)
+    """
+    return _collect_config_files(_get_quick_schedules_config_dir())
+
+
+def _get_model_config(config_file):
+    """
+    Load and return the configuration from the specified file (relative to the base configuration
+    directory)
+    """
+    cfg = get_cfg()
+    add_dataset_category_config(cfg)
+    add_densepose_config(cfg)
+    path = os.path.join(_get_base_config_dir(), config_file)
+    cfg.merge_from_file(path)
+    if not torch.cuda.is_available():
+        cfg.MODEL_DEVICE = "cpu"
+    return cfg
+
+
+def get_model(config_file):
+    """
+    Get the model from the specified file (relative to the base configuration directory)
+    """
+    cfg = _get_model_config(config_file)
+    return build_model(cfg)
+
+
+def setup(config_file):
+    """
+    Setup the configuration from the specified file (relative to the base configuration directory)
+    """
+    cfg = _get_model_config(config_file)
+    cfg.freeze()
+    default_setup(cfg, {})
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/test_model_e2e.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/test_model_e2e.py
new file mode 100644
index 0000000000000000000000000000000000000000..eed131080547d84185c1d33913014a2c977b119f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/test_model_e2e.py
@@ -0,0 +1,43 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+import torch
+
+from detectron2.structures import BitMasks, Boxes, Instances
+
+from .common import get_model
+
+
+# TODO(plabatut): Modularize detectron2 tests and re-use
+def make_model_inputs(image, instances=None):
+    if instances is None:
+        return {"image": image}
+
+    return {"image": image, "instances": instances}
+
+
+def make_empty_instances(h, w):
+    instances = Instances((h, w))
+    instances.gt_boxes = Boxes(torch.rand(0, 4))
+    instances.gt_classes = torch.tensor([]).to(dtype=torch.int64)
+    instances.gt_masks = BitMasks(torch.rand(0, h, w))
+    return instances
+
+
+class ModelE2ETest(unittest.TestCase):
+    CONFIG_PATH = ""
+
+    def setUp(self):
+        self.model = get_model(self.CONFIG_PATH)
+
+    def _test_eval(self, sizes):
+        inputs = [make_model_inputs(torch.rand(3, size[0], size[1])) for size in sizes]
+        self.model.eval()
+        self.model(inputs)
+
+
+class DensePoseRCNNE2ETest(ModelE2ETest):
+    CONFIG_PATH = "densepose_rcnn_R_101_FPN_s1x.yaml"
+
+    def test_empty_data(self):
+        self._test_eval([(200, 250), (200, 249)])
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/test_setup.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/test_setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..96827f14b3a71d571c2109791233b5bcf7ef35f8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/test_setup.py
@@ -0,0 +1,30 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+
+from .common import (
+    get_config_files,
+    get_evolution_config_files,
+    get_quick_schedules_config_files,
+    setup,
+)
+
+
+class TestSetup(unittest.TestCase):
+    def _test_setup(self, config_file):
+        setup(config_file)
+
+    def test_setup_configs(self):
+        config_files = get_config_files()
+        for config_file in config_files:
+            self._test_setup(config_file)
+
+    def test_setup_evolution_configs(self):
+        config_files = get_evolution_config_files()
+        for config_file in config_files:
+            self._test_setup(config_file)
+
+    def test_setup_quick_schedules_configs(self):
+        config_files = get_quick_schedules_config_files()
+        for config_file in config_files:
+            self._test_setup(config_file)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/test_structures.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/test_structures.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad97c23a43a9a72db566ec272b10f5bbda874695
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/tests/test_structures.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import unittest
+
+from densepose.data.structures import normalized_coords_transform
+
+
+class TestStructures(unittest.TestCase):
+    def test_normalized_coords_transform(self):
+        bbox = (32, 24, 288, 216)
+        x0, y0, w, h = bbox
+        xmin, ymin, xmax, ymax = x0, y0, x0 + w, y0 + h
+        f = normalized_coords_transform(*bbox)
+        # Top-left
+        expected_p, actual_p = (-1, -1), f((xmin, ymin))
+        self.assertEqual(expected_p, actual_p)
+        # Top-right
+        expected_p, actual_p = (1, -1), f((xmax, ymin))
+        self.assertEqual(expected_p, actual_p)
+        # Bottom-left
+        expected_p, actual_p = (-1, 1), f((xmin, ymax))
+        self.assertEqual(expected_p, actual_p)
+        # Bottom-right
+        expected_p, actual_p = (1, 1), f((xmax, ymax))
+        self.assertEqual(expected_p, actual_p)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/train_net.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/train_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d2e7bd8b92964f752620d92e7acb662c0b86fa7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/DensePose/train_net.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+"""
+DensePose Training Script.
+
+This script is similar to the training script in detectron2/tools.
+
+It is an example of how a user might use detectron2 for a new project.
+"""
+
+import logging
+import os
+from collections import OrderedDict
+from fvcore.common.file_io import PathManager
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import CfgNode, get_cfg
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
+from detectron2.evaluation import COCOEvaluator, DatasetEvaluators, verify_results
+from detectron2.modeling import DatasetMapperTTA
+from detectron2.utils.logger import setup_logger
+
+from densepose import (
+    DensePoseCOCOEvaluator,
+    DensePoseGeneralizedRCNNWithTTA,
+    add_dataset_category_config,
+    add_densepose_config,
+    load_from_cfg,
+)
+from densepose.data import DatasetMapper, build_detection_test_loader, build_detection_train_loader
+
+
+class Trainer(DefaultTrainer):
+    @classmethod
+    def build_evaluator(cls, cfg: CfgNode, dataset_name, output_folder=None):
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluators = [COCOEvaluator(dataset_name, cfg, True, output_folder)]
+        if cfg.MODEL.DENSEPOSE_ON:
+            evaluators.append(DensePoseCOCOEvaluator(dataset_name, True, output_folder))
+        return DatasetEvaluators(evaluators)
+
+    @classmethod
+    def build_test_loader(cls, cfg: CfgNode, dataset_name):
+        return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))
+
+    @classmethod
+    def build_train_loader(cls, cfg: CfgNode):
+        return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True))
+
+    @classmethod
+    def test_with_TTA(cls, cfg: CfgNode, model):
+        logger = logging.getLogger("detectron2.trainer")
+        # In the end of training, run an evaluation with TTA
+        # Only support some R-CNN models.
+        logger.info("Running inference with test-time augmentation ...")
+        transform_data = load_from_cfg(cfg)
+        model = DensePoseGeneralizedRCNNWithTTA(cfg, model, transform_data, DatasetMapperTTA(cfg))
+        evaluators = [
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+        return res
+
+
+def setup(args):
+    cfg = get_cfg()
+    add_dataset_category_config(cfg)
+    add_densepose_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    # Setup logger for "densepose" module
+    setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="densepose")
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+    # disable strict kwargs checking: allow one to specify path handle
+    # hints through kwargs, like timeout in DP evaluation
+    PathManager.set_strict_kwargs_checking(False)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if cfg.TEST.AUG.ENABLED:
+            res.update(Trainer.test_with_TTA(cfg, model))
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    if cfg.TEST.AUG.ENABLED:
+        trainer.register_hooks(
+            [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
+        )
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/README.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..443736fff35cc49e02807a7b941da19c0bdfa666
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/README.md
@@ -0,0 +1,135 @@
+# PointRend: Image Segmentation as Rendering
+
+Alexander Kirillov, Yuxin Wu, Kaiming He, Ross Girshick
+
+[[`arXiv`](https://arxiv.org/abs/1912.08193)] [[`BibTeX`](#CitingPointRend)]
+
+<div align="center">
+  <img src="https://alexander-kirillov.github.io/images/kirillov2019pointrend.jpg"/>
+</div><br/>
+
+In this repository, we release code for PointRend in Detectron2. PointRend can be flexibly applied to both instance and semantic segmentation tasks by building on top of existing state-of-the-art models.
+
+## Installation
+Install Detectron 2 following [INSTALL.md](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md). You are ready to go!
+
+## Quick start and visualization
+
+This [Colab Notebook](https://colab.research.google.com/drive/1isGPL5h5_cKoPPhVL9XhMokRtHDvmMVL) tutorial contains examples of PointRend usage and visualizations of its point sampling stages.
+
+## Training
+
+To train a model with 8 GPUs run:
+```bash
+cd /path/to/detectron2/projects/PointRend
+python train_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml --num-gpus 8
+```
+
+## Evaluation
+
+Model evaluation can be done similarly:
+```bash
+cd /path/to/detectron2/projects/PointRend
+python train_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml --eval-only MODEL.WEIGHTS /path/to/model_checkpoint
+```
+
+# Pretrained Models
+
+## Instance Segmentation
+#### COCO
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Mask<br/>head</th>
+<th valign="bottom">Backbone</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">Output<br/>resolution</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">mask<br/>AP&ast;</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+ <tr><td align="left"><a href="configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml">PointRend</a></td>
+<td align="center">R50-FPN</td>
+<td align="center">1&times;</td>
+<td align="center">224&times;224</td>
+<td align="center">36.2</td>
+<td align="center">39.7</td>
+<td align="center">164254221</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco/164254221/model_final_88c6f8.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco/164254221/metrics.json">metrics</a></td>
+</tr>
+ <tr><td align="left"><a href="configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml">PointRend</a></td>
+<td align="center">R50-FPN</td>
+<td align="center">3&times;</td>
+<td align="center">224&times;224</td>
+<td align="center">38.3</td>
+<td align="center">41.6</td>
+<td align="center">164955410</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco/164955410/model_final_3c3198.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco/164955410/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+AP&ast; is COCO mask AP evaluated against the higher-quality LVIS annotations; see the paper for details. Run `python detectron2/datasets/prepare_cocofied_lvis.py` to prepare GT files for AP&ast; evaluation. Since LVIS annotations are not exhaustive `lvis-api` and not `cocoapi` should be used to evaluate AP&ast;.
+
+#### Cityscapes
+Cityscapes model is trained with ImageNet pretraining.
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Mask<br/>head</th>
+<th valign="bottom">Backbone</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">Output<br/>resolution</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+ <tr><td align="left"><a href="configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml">PointRend</a></td>
+<td align="center">R50-FPN</td>
+<td align="center">1&times;</td>
+<td align="center">224&times;224</td>
+<td align="center">35.9</td>
+<td align="center">164255101</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes/164255101/model_final_318a02.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes/164255101/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+## Semantic Segmentation
+
+#### Cityscapes
+Cityscapes model is trained with ImageNet pretraining.
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Method</th>
+<th valign="bottom">Backbone</th>
+<th valign="bottom">Output<br/>resolution</th>
+<th valign="bottom">mIoU</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+ <tr><td align="left"><a href="configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml">SemanticFPN + PointRend</a></td>
+<td align="center">R101-FPN</td>
+<td align="center">1024&times;2048</td>
+<td align="center">78.6</td>
+<td align="center">186480235</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes/186480235/model_final_5f3665.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes/186480235/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+## <a name="CitingPointRend"></a>Citing PointRend
+
+If you use PointRend, please use the following BibTeX entry.
+
+```BibTeX
+@InProceedings{kirillov2019pointrend,
+  title={{PointRend}: Image Segmentation as Rendering},
+  author={Alexander Kirillov and Yuxin Wu and Kaiming He and Ross Girshick},
+  journal={ArXiv:1912.08193},
+  year={2019}
+}
+```
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/Base-PointRend-RCNN-FPN.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/Base-PointRend-RCNN-FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d3917188afe04c7626e539f7c0bc28df4118a290
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/Base-PointRend-RCNN-FPN.yaml
@@ -0,0 +1,21 @@
+_BASE_: "../../../../configs/Base-RCNN-FPN.yaml"
+MODEL:
+  ROI_HEADS:
+    NAME: "PointRendROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+  ROI_BOX_HEAD:
+    TRAIN_ON_PRED_BOXES: True
+  ROI_MASK_HEAD:
+    NAME: "CoarseMaskHead"
+    FC_DIM: 1024
+    NUM_FC: 2
+    OUTPUT_SIDE_RESOLUTION: 7
+    IN_FEATURES: ["p2"]
+    POINT_HEAD_ON: True
+  POINT_HEAD:
+    FC_DIM: 256
+    NUM_FC: 3
+    IN_FEATURES: ["p2"]
+INPUT:
+  # PointRend for instance segmenation does not work with "polygon" mask_format.
+  MASK_FORMAT: "bitmask"
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c23dbe1c8463d16f6be110ef49acd8c6142c3aa8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml
@@ -0,0 +1,23 @@
+_BASE_: Base-PointRend-RCNN-FPN.yaml
+MODEL:
+  WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
+  MASK_ON: true
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 8
+  POINT_HEAD:
+    NUM_CLASSES: 8
+DATASETS:
+  TEST: ("cityscapes_fine_instance_seg_val",)
+  TRAIN: ("cityscapes_fine_instance_seg_train",)
+SOLVER:
+  BASE_LR: 0.01
+  IMS_PER_BATCH: 8
+  MAX_ITER: 24000
+  STEPS: (18000,)
+INPUT:
+  MAX_SIZE_TEST: 2048
+  MAX_SIZE_TRAIN: 2048
+  MIN_SIZE_TEST: 1024
+  MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e9fc573bf544de8610a65a7cda2a0df57aec0abf
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml
@@ -0,0 +1,9 @@
+_BASE_: Base-PointRend-RCNN-FPN.yaml
+MODEL:
+  WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
+  MASK_ON: true
+  RESNETS:
+    DEPTH: 50
+# To add COCO AP evaluation against the higher-quality LVIS annotations.
+# DATASETS:
+#   TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied")
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2f013f32aeb4122f50c5c4030e9738d9d474ba34
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml
@@ -0,0 +1,13 @@
+_BASE_: Base-PointRend-RCNN-FPN.yaml
+MODEL:
+  WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
+  MASK_ON: true
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
+# To add COCO AP evaluation against the higher-quality LVIS annotations.
+# DATASETS:
+#   TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied")
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_parsing.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_parsing.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a4af81dab7b47371454a273ecf962ea47ac21d49
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_parsing.yaml
@@ -0,0 +1,20 @@
+_BASE_: Base-PointRend-RCNN-FPN.yaml
+MODEL:
+  WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
+  MASK_ON: true
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 1
+  POINT_HEAD:
+    NUM_CLASSES: 1
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
+  IMS_PER_BATCH: 1
+# To add COCO AP evaluation against the higher-quality LVIS annotations.
+# DATASETS:
+#   TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied")
+DATASETS:
+  TRAIN: ("CIHP_train",)
+  TEST: ("CIHP_val",)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_X_101_32x8d_FPN_3x_parsing.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_X_101_32x8d_FPN_3x_parsing.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8e52d82e39400f08f86a6e1a92e3e1c471403624
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_X_101_32x8d_FPN_3x_parsing.yaml
@@ -0,0 +1,28 @@
+_BASE_: Base-PointRend-RCNN-FPN.yaml
+MODEL:
+  WEIGHTS: "./X-101-32x8d.pkl"
+  PIXEL_STD: [57.375, 57.120, 58.395]
+  MASK_ON: true
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 101
+  ROI_HEADS:
+    NUM_CLASSES: 1
+  POINT_HEAD:
+    NUM_CLASSES: 1
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
+  IMS_PER_BATCH: 1
+# To add COCO AP evaluation against the higher-quality LVIS annotations.
+# DATASETS:
+#   TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied")
+INPUT:
+  MIN_SIZE_TRAIN: (640, 864)
+  MIN_SIZE_TRAIN_SAMPLING: "range"
+  MAX_SIZE_TRAIN: 1440
+DATASETS:
+  TRAIN: ("CIHP_train",)
+  TEST: ("CIHP_val",)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/Base-PointRend-Semantic-FPN.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/Base-PointRend-Semantic-FPN.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..00562a92363dc47c6ebe9ef8bebb89cd5e5b8502
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/Base-PointRend-Semantic-FPN.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../../../../configs/Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "SemanticSegmentor"
+  BACKBONE:
+    FREEZE_AT: 0
+  SEM_SEG_HEAD:
+    NAME: "PointRendSemSegHead"
+  POINT_HEAD:
+    NUM_CLASSES: 54
+    FC_DIM: 256
+    NUM_FC: 3
+    IN_FEATURES: ["p2"]
+    TRAIN_NUM_POINTS: 1024
+    SUBDIVISION_STEPS: 2
+    SUBDIVISION_NUM_POINTS: 8192
+    COARSE_SEM_SEG_HEAD_NAME: "SemSegFPNHead"
+DATASETS:
+  TRAIN: ("coco_2017_train_panoptic_stuffonly",)
+  TEST: ("coco_2017_val_panoptic_stuffonly",)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4965b068c11bc568317ea3cc8c83d8c44234b936
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml
@@ -0,0 +1,33 @@
+_BASE_: Base-PointRend-Semantic-FPN.yaml
+MODEL:
+  WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-101.pkl
+  RESNETS:
+    DEPTH: 101
+  SEM_SEG_HEAD:
+    NUM_CLASSES: 19
+  POINT_HEAD:
+    NUM_CLASSES: 19
+    TRAIN_NUM_POINTS: 2048
+    SUBDIVISION_NUM_POINTS: 8192
+DATASETS:
+  TRAIN: ("cityscapes_fine_sem_seg_train",)
+  TEST: ("cityscapes_fine_sem_seg_val",)
+SOLVER:
+  BASE_LR: 0.01
+  STEPS: (40000, 55000)
+  MAX_ITER: 65000
+  IMS_PER_BATCH: 32
+INPUT:
+  MIN_SIZE_TRAIN: (512, 768, 1024, 1280, 1536, 1792, 2048)
+  MIN_SIZE_TRAIN_SAMPLING: "choice"
+  MIN_SIZE_TEST: 1024
+  MAX_SIZE_TRAIN: 4096
+  MAX_SIZE_TEST: 2048
+  CROP:
+    ENABLED: True
+    TYPE: "absolute"
+    SIZE: (512, 1024)
+    SINGLE_CATEGORY_MAX_AREA: 0.75
+  COLOR_AUG_SSD: True
+DATALOADER:
+  NUM_WORKERS: 16
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_50_FPN_1x_coco.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_50_FPN_1x_coco.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7948bd808ea9888b20d1e118abf6bb630c485f39
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_50_FPN_1x_coco.yaml
@@ -0,0 +1,5 @@
+_BASE_: Base-PointRend-Semantic-FPN.yaml
+MODEL:
+  WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/finetune_net.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/finetune_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..b99baf939b3788a2ee9e339beaa503cfa4d6a14f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/finetune_net.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+"""
+PointRend Training Script.
+
+This script is a simplified version of the training script in detectron2/tools.
+"""
+
+import os
+import torch
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import MetadataCatalog, build_detection_train_loader
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
+from detectron2.evaluation import (
+    CityscapesInstanceEvaluator,
+    CityscapesSemSegEvaluator,
+    COCOEvaluator,
+    DatasetEvaluators,
+    LVISEvaluator,
+    SemSegEvaluator,
+    verify_results,
+)
+
+from point_rend import SemSegDatasetMapper, add_pointrend_config
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '4'
+# Register Custom Dataset
+from detectron2.data.datasets import register_coco_instances
+register_coco_instances("CIHP_train", {}, "/data03/v_xuyunqiu/multi_parsing/data/msrcnn_finetune_annotations/CIHP_train.json", "/data03/v_xuyunqiu/data/instance-level_human_parsing/Training/Images")
+register_coco_instances("CIHP_val", {}, "/data03/v_xuyunqiu/multi_parsing/data/msrcnn_finetune_annotations/CIHP_val.json", "/data03/v_xuyunqiu/data/instance-level_human_parsing/Validation/Images")
+
+
+class Trainer(DefaultTrainer):
+    """
+    We use the "DefaultTrainer" which contains a number pre-defined logic for
+    standard training workflow. They may not work for you, especially if you
+    are working on a new research project. In that case you can use the cleaner
+    "SimpleTrainer", or write your own training loop.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each builtin dataset.
+        For your own dataset, you can simply create an evaluator manually in your
+        script and do not have to worry about the hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        if evaluator_type == "lvis":
+            return LVISEvaluator(dataset_name, cfg, True, output_folder)
+        if evaluator_type == "coco":
+            return COCOEvaluator(dataset_name, cfg, True, output_folder)
+        if evaluator_type == "sem_seg":
+            return SemSegEvaluator(
+                dataset_name,
+                distributed=True,
+                num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+                ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+                output_dir=output_folder,
+            )
+        if evaluator_type == "cityscapes_instance":
+            assert (
+                torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesInstanceEvaluator(dataset_name)
+        if evaluator_type == "cityscapes_sem_seg":
+            assert (
+                torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesSemSegEvaluator(dataset_name)
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        if len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+    @classmethod
+    def build_train_loader(cls, cfg):
+        if "SemanticSegmentor" in cfg.MODEL.META_ARCHITECTURE:
+            mapper = SemSegDatasetMapper(cfg, True)
+        else:
+            mapper = None
+        return build_detection_train_loader(cfg, mapper=mapper)
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    add_pointrend_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/logs/hadoop.kylin.libdfs.log b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/logs/hadoop.kylin.libdfs.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4020fe0a287f87cb3bd2487b5b40b7e1e2647aa8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .config import add_pointrend_config
+from .coarse_mask_head import CoarseMaskHead
+from .roi_heads import PointRendROIHeads
+from .dataset_mapper import SemSegDatasetMapper
+from .semantic_seg import PointRendSemSegHead
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/coarse_mask_head.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/coarse_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f1cffb4c985dc3121a863eb7b378965b718a19d
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/coarse_mask_head.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ShapeSpec
+from detectron2.modeling import ROI_MASK_HEAD_REGISTRY
+
+
+@ROI_MASK_HEAD_REGISTRY.register()
+class CoarseMaskHead(nn.Module):
+    """
+    A mask head with fully connected layers. Given pooled features it first reduces channels and
+    spatial dimensions with conv layers and then uses FC layers to predict coarse masks analogously
+    to the standard box head.
+    """
+
+    def __init__(self, cfg, input_shape: ShapeSpec):
+        """
+        The following attributes are parsed from config:
+            conv_dim: the output dimension of the conv layers
+            fc_dim: the feature dimenstion of the FC layers
+            num_fc: the number of FC layers
+            output_side_resolution: side resolution of the output square mask prediction
+        """
+        super(CoarseMaskHead, self).__init__()
+
+        # fmt: off
+        self.num_classes            = cfg.MODEL.ROI_HEADS.NUM_CLASSES
+        conv_dim                    = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
+        self.fc_dim                 = cfg.MODEL.ROI_MASK_HEAD.FC_DIM
+        num_fc                      = cfg.MODEL.ROI_MASK_HEAD.NUM_FC
+        self.output_side_resolution = cfg.MODEL.ROI_MASK_HEAD.OUTPUT_SIDE_RESOLUTION
+        self.input_channels         = input_shape.channels
+        self.input_h                = input_shape.height
+        self.input_w                = input_shape.width
+        # fmt: on
+
+        self.conv_layers = []
+        if self.input_channels > conv_dim:
+            self.reduce_channel_dim_conv = Conv2d(
+                self.input_channels,
+                conv_dim,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=True,
+                activation=F.relu,
+            )
+            self.conv_layers.append(self.reduce_channel_dim_conv)
+
+        self.reduce_spatial_dim_conv = Conv2d(
+            conv_dim, conv_dim, kernel_size=2, stride=2, padding=0, bias=True, activation=F.relu
+        )
+        self.conv_layers.append(self.reduce_spatial_dim_conv)
+
+        input_dim = conv_dim * self.input_h * self.input_w
+        input_dim //= 4
+
+        self.fcs = []
+        for k in range(num_fc):
+            fc = nn.Linear(input_dim, self.fc_dim)
+            self.add_module("coarse_mask_fc{}".format(k + 1), fc)
+            self.fcs.append(fc)
+            input_dim = self.fc_dim
+
+        output_dim = self.num_classes * self.output_side_resolution * self.output_side_resolution
+
+        self.prediction = nn.Linear(self.fc_dim, output_dim)
+        # use normal distribution initialization for mask prediction layer
+        nn.init.normal_(self.prediction.weight, std=0.001)
+        nn.init.constant_(self.prediction.bias, 0)
+
+        for layer in self.conv_layers:
+            weight_init.c2_msra_fill(layer)
+        for layer in self.fcs:
+            weight_init.c2_xavier_fill(layer)
+
+    def forward(self, x):
+        # unlike BaseMaskRCNNHead, this head only outputs intermediate
+        # features, because the features will be used later by PointHead.
+        N = x.shape[0]
+        x = x.view(N, self.input_channels, self.input_h, self.input_w)
+        for layer in self.conv_layers:
+            x = layer(x)
+        x = torch.flatten(x, start_dim=1)
+        for layer in self.fcs:
+            x = F.relu(layer(x))
+        return self.prediction(x).view(
+            N, self.num_classes, self.output_side_resolution, self.output_side_resolution
+        )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/color_augmentation.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/color_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..27344c470adac143186e61c8a5b0f39900937634
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/color_augmentation.py
@@ -0,0 +1,98 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import random
+import cv2
+from fvcore.transforms.transform import Transform
+
+
+class ColorAugSSDTransform(Transform):
+    """
+    A color related data augmentation used in Single Shot Multibox Detector (SSD).
+
+    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy,
+       Scott Reed, Cheng-Yang Fu, Alexander C. Berg.
+       SSD: Single Shot MultiBox Detector. ECCV 2016.
+
+    Implementation based on:
+
+     https://github.com/weiliu89/caffe/blob
+       /4817bf8b4200b35ada8ed0dc378dceaf38c539e4
+       /src/caffe/util/im_transforms.cpp
+
+     https://github.com/chainer/chainercv/blob
+       /7159616642e0be7c5b3ef380b848e16b7e99355b/chainercv
+       /links/model/ssd/transforms.py
+    """
+
+    def __init__(
+        self,
+        img_format,
+        brightness_delta=32,
+        contrast_low=0.5,
+        contrast_high=1.5,
+        saturation_low=0.5,
+        saturation_high=1.5,
+        hue_delta=18,
+    ):
+        super().__init__()
+        assert img_format in ["BGR", "RGB"]
+        self.is_rgb = img_format == "RGB"
+        del img_format
+        self._set_attributes(locals())
+
+    def apply_coords(self, coords):
+        return coords
+
+    def apply_segmentation(self, segmentation):
+        return segmentation
+
+    def apply_image(self, img, interp=None):
+        if self.is_rgb:
+            img = img[:, :, [2, 1, 0]]
+        img = self.brightness(img)
+        if random.randrange(2):
+            img = self.contrast(img)
+            img = self.saturation(img)
+            img = self.hue(img)
+        else:
+            img = self.saturation(img)
+            img = self.hue(img)
+            img = self.contrast(img)
+        if self.is_rgb:
+            img = img[:, :, [2, 1, 0]]
+        return img
+
+    def convert(self, img, alpha=1, beta=0):
+        img = img.astype(np.float32) * alpha + beta
+        img = np.clip(img, 0, 255)
+        return img.astype(np.uint8)
+
+    def brightness(self, img):
+        if random.randrange(2):
+            return self.convert(
+                img, beta=random.uniform(-self.brightness_delta, self.brightness_delta)
+            )
+        return img
+
+    def contrast(self, img):
+        if random.randrange(2):
+            return self.convert(img, alpha=random.uniform(self.contrast_low, self.contrast_high))
+        return img
+
+    def saturation(self, img):
+        if random.randrange(2):
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+            img[:, :, 1] = self.convert(
+                img[:, :, 1], alpha=random.uniform(self.saturation_low, self.saturation_high)
+            )
+            return cv2.cvtColor(img, cv2.COLOR_HSV2BGR)
+        return img
+
+    def hue(self, img):
+        if random.randrange(2):
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+            img[:, :, 0] = (
+                img[:, :, 0].astype(int) + random.randint(-self.hue_delta, self.hue_delta)
+            ) % 180
+            return cv2.cvtColor(img, cv2.COLOR_HSV2BGR)
+        return img
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/config.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..74f63672bba7cd25679054b19ff87254a0e24974
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/config.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from detectron2.config import CfgNode as CN
+
+
+def add_pointrend_config(cfg):
+    """
+    Add config for PointRend.
+    """
+    # We retry random cropping until no single category in semantic segmentation GT occupies more
+    # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
+    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
+    # Color augmentatition from SSD paper for semantic segmentation model during training.
+    cfg.INPUT.COLOR_AUG_SSD = False
+
+    # Names of the input feature maps to be used by a coarse mask head.
+    cfg.MODEL.ROI_MASK_HEAD.IN_FEATURES = ("p2",)
+    cfg.MODEL.ROI_MASK_HEAD.FC_DIM = 1024
+    cfg.MODEL.ROI_MASK_HEAD.NUM_FC = 2
+    # The side size of a coarse mask head prediction.
+    cfg.MODEL.ROI_MASK_HEAD.OUTPUT_SIDE_RESOLUTION = 7
+    # True if point head is used.
+    cfg.MODEL.ROI_MASK_HEAD.POINT_HEAD_ON = False
+
+    cfg.MODEL.POINT_HEAD = CN()
+    cfg.MODEL.POINT_HEAD.NAME = "StandardPointHead"
+    cfg.MODEL.POINT_HEAD.NUM_CLASSES = 80
+    # Names of the input feature maps to be used by a mask point head.
+    cfg.MODEL.POINT_HEAD.IN_FEATURES = ("p2",)
+    # Number of points sampled during training for a mask point head.
+    cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS = 14 * 14
+    # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
+    # original paper.
+    cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO = 3
+    # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in
+    # the original paper.
+    cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO = 0.75
+    # Number of subdivision steps during inference.
+    cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS = 5
+    # Maximum number of points selected at each subdivision step (N).
+    cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS = 28 * 28
+    cfg.MODEL.POINT_HEAD.FC_DIM = 256
+    cfg.MODEL.POINT_HEAD.NUM_FC = 3
+    cfg.MODEL.POINT_HEAD.CLS_AGNOSTIC_MASK = False
+    # If True, then coarse prediction features are used as inout for each layer in PointRend's MLP.
+    cfg.MODEL.POINT_HEAD.COARSE_PRED_EACH_LAYER = True
+    cfg.MODEL.POINT_HEAD.COARSE_SEM_SEG_HEAD_NAME = "SemSegFPNHead"
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/dataset_mapper.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/dataset_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..76b64ee79b679741d547c5d1ffca55ac756051ae
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/dataset_mapper.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import numpy as np
+import torch
+from fvcore.common.file_io import PathManager
+from fvcore.transforms.transform import CropTransform
+from PIL import Image
+
+from detectron2.data import detection_utils as utils
+from detectron2.data import transforms as T
+
+from .color_augmentation import ColorAugSSDTransform
+
+"""
+This file contains the mapping that's applied to "dataset dicts" for semantic segmentation models.
+Unlike the default DatasetMapper this mapper uses cropping as the last transformation.
+"""
+
+__all__ = ["SemSegDatasetMapper"]
+
+
+class SemSegDatasetMapper:
+    """
+    A callable which takes a dataset dict in Detectron2 Dataset format,
+    and map it into a format used by semantic segmentation models.
+
+    The callable currently does the following:
+
+    1. Read the image from "file_name"
+    2. Applies geometric transforms to the image and annotation
+    3. Find and applies suitable cropping to the image and annotation
+    4. Prepare image and annotation to Tensors
+    """
+
+    def __init__(self, cfg, is_train=True):
+        if cfg.INPUT.CROP.ENABLED and is_train:
+            self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)
+            logging.getLogger(__name__).info("CropGen used in training: " + str(self.crop_gen))
+        else:
+            self.crop_gen = None
+
+        self.tfm_gens = utils.build_transform_gen(cfg, is_train)
+
+        if cfg.INPUT.COLOR_AUG_SSD:
+            self.tfm_gens.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
+            logging.getLogger(__name__).info(
+                "Color augmnetation used in training: " + str(self.tfm_gens[-1])
+            )
+
+        # fmt: off
+        self.img_format               = cfg.INPUT.FORMAT
+        self.single_category_max_area = cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA
+        self.ignore_value             = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
+        # fmt: on
+
+        self.is_train = is_train
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
+        utils.check_image_size(dataset_dict, image)
+        assert "sem_seg_file_name" in dataset_dict
+
+        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
+        if self.is_train:
+            with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f:
+                sem_seg_gt = Image.open(f)
+                sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8")
+            sem_seg_gt = transforms.apply_segmentation(sem_seg_gt)
+            if self.crop_gen:
+                image, sem_seg_gt = crop_transform(
+                    image,
+                    sem_seg_gt,
+                    self.crop_gen,
+                    self.single_category_max_area,
+                    self.ignore_value,
+                )
+            dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
+
+        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
+        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
+        # Therefore it's important to use torch.Tensor.
+        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
+
+        if not self.is_train:
+            dataset_dict.pop("sem_seg_file_name", None)
+            return dataset_dict
+
+        return dataset_dict
+
+
+def crop_transform(image, sem_seg, crop_gen, single_category_max_area, ignore_value):
+    """
+    Find a cropping window such that no single category occupies more than
+        `single_category_max_area` in `sem_seg`. The function retries random cropping 10 times max.
+    """
+    if single_category_max_area >= 1.0:
+        crop_tfm = crop_gen.get_transform(image)
+        sem_seg_temp = crop_tfm.apply_segmentation(sem_seg)
+    else:
+        h, w = sem_seg.shape
+        crop_size = crop_gen.get_crop_size((h, w))
+        for _ in range(10):
+            y0 = np.random.randint(h - crop_size[0] + 1)
+            x0 = np.random.randint(w - crop_size[1] + 1)
+            sem_seg_temp = sem_seg[y0 : y0 + crop_size[0], x0 : x0 + crop_size[1]]
+            labels, cnt = np.unique(sem_seg_temp, return_counts=True)
+            cnt = cnt[labels != ignore_value]
+            if len(cnt) > 1 and np.max(cnt) / np.sum(cnt) < single_category_max_area:
+                break
+        crop_tfm = CropTransform(x0, y0, crop_size[1], crop_size[0])
+    image = crop_tfm.apply_image(image)
+    return image, sem_seg_temp
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/point_features.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/point_features.py
new file mode 100644
index 0000000000000000000000000000000000000000..320a33de8505572eedcfa94d355bf2772ab75528
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/point_features.py
@@ -0,0 +1,216 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+from torch.nn import functional as F
+
+from detectron2.layers import cat
+from detectron2.structures import Boxes
+
+
+"""
+Shape shorthand in this module:
+
+    N: minibatch dimension size, i.e. the number of RoIs for instance segmenation or the
+        number of images for semantic segmenation.
+    R: number of ROIs, combined over all images, in the minibatch
+    P: number of points
+"""
+
+
+def point_sample(input, point_coords, **kwargs):
+    """
+    A wrapper around :function:`torch.nn.functional.grid_sample` to support 3D point_coords tensors.
+    Unlike :function:`torch.nn.functional.grid_sample` it assumes `point_coords` to lie inside
+    [0, 1] x [0, 1] square.
+
+    Args:
+        input (Tensor): A tensor of shape (N, C, H, W) that contains features map on a H x W grid.
+        point_coords (Tensor): A tensor of shape (N, P, 2) or (N, Hgrid, Wgrid, 2) that contains
+        [0, 1] x [0, 1] normalized point coordinates.
+
+    Returns:
+        output (Tensor): A tensor of shape (N, C, P) or (N, C, Hgrid, Wgrid) that contains
+            features for points in `point_coords`. The features are obtained via bilinear
+            interplation from `input` the same way as :function:`torch.nn.functional.grid_sample`.
+    """
+    add_dim = False
+    if point_coords.dim() == 3:
+        add_dim = True
+        point_coords = point_coords.unsqueeze(2)
+    output = F.grid_sample(input, 2.0 * point_coords - 1.0, **kwargs)
+    if add_dim:
+        output = output.squeeze(3)
+    return output
+
+
+def generate_regular_grid_point_coords(R, side_size, device):
+    """
+    Generate regular square grid of points in [0, 1] x [0, 1] coordinate space.
+
+    Args:
+        R (int): The number of grids to sample, one for each region.
+        side_size (int): The side size of the regular grid.
+        device (torch.device): Desired device of returned tensor.
+
+    Returns:
+        (Tensor): A tensor of shape (R, side_size^2, 2) that contains coordinates
+            for the regular grids.
+    """
+    aff = torch.tensor([[[0.5, 0, 0.5], [0, 0.5, 0.5]]], device=device)
+    r = F.affine_grid(aff, torch.Size((1, 1, side_size, side_size)), align_corners=False)
+    return r.view(1, -1, 2).expand(R, -1, -1)
+
+
+def get_uncertain_point_coords_with_randomness(
+    coarse_logits, uncertainty_func, num_points, oversample_ratio, importance_sample_ratio
+):
+    """
+    Sample points in [0, 1] x [0, 1] coordinate space based on their uncertainty. The unceratinties
+        are calculated for each point using 'uncertainty_func' function that takes point's logit
+        prediction as input.
+    See PointRend paper for details.
+
+    Args:
+        coarse_logits (Tensor): A tensor of shape (N, C, Hmask, Wmask) or (N, 1, Hmask, Wmask) for
+            class-specific or class-agnostic prediction.
+        uncertainty_func: A function that takes a Tensor of shape (N, C, P) or (N, 1, P) that
+            contains logit predictions for P points and returns their uncertainties as a Tensor of
+            shape (N, 1, P).
+        num_points (int): The number of points P to sample.
+        oversample_ratio (int): Oversampling parameter.
+        importance_sample_ratio (float): Ratio of points that are sampled via importnace sampling.
+
+    Returns:
+        point_coords (Tensor): A tensor of shape (N, P, 2) that contains the coordinates of P
+            sampled points.
+    """
+    assert oversample_ratio >= 1
+    assert importance_sample_ratio <= 1 and importance_sample_ratio >= 0
+    num_boxes = coarse_logits.shape[0]
+    num_sampled = int(num_points * oversample_ratio)
+    point_coords = torch.rand(num_boxes, num_sampled, 2, device=coarse_logits.device)
+    point_logits = point_sample(coarse_logits, point_coords, align_corners=False)
+    # It is crucial to calculate uncertainty based on the sampled prediction value for the points.
+    # Calculating uncertainties of the coarse predictions first and sampling them for points leads
+    # to incorrect results.
+    # To illustrate this: assume uncertainty_func(logits)=-abs(logits), a sampled point between
+    # two coarse predictions with -1 and 1 logits has 0 logits, and therefore 0 uncertainty value.
+    # However, if we calculate uncertainties for the coarse predictions first,
+    # both will have -1 uncertainty, and the sampled point will get -1 uncertainty.
+    point_uncertainties = uncertainty_func(point_logits)
+    num_uncertain_points = int(importance_sample_ratio * num_points)
+    num_random_points = num_points - num_uncertain_points
+    idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
+    shift = num_sampled * torch.arange(num_boxes, dtype=torch.long, device=coarse_logits.device)
+    idx += shift[:, None]
+    point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(
+        num_boxes, num_uncertain_points, 2
+    )
+    if num_random_points > 0:
+        point_coords = cat(
+            [
+                point_coords,
+                torch.rand(num_boxes, num_random_points, 2, device=coarse_logits.device),
+            ],
+            dim=1,
+        )
+    return point_coords
+
+
+def get_uncertain_point_coords_on_grid(uncertainty_map, num_points):
+    """
+    Find `num_points` most uncertain points from `uncertainty_map` grid.
+
+    Args:
+        uncertainty_map (Tensor): A tensor of shape (N, 1, H, W) that contains uncertainty
+            values for a set of points on a regular H x W grid.
+        num_points (int): The number of points P to select.
+
+    Returns:
+        point_indices (Tensor): A tensor of shape (N, P) that contains indices from
+            [0, H x W) of the most uncertain points.
+        point_coords (Tensor): A tensor of shape (N, P, 2) that contains [0, 1] x [0, 1] normalized
+            coordinates of the most uncertain points from the H x W grid.
+    """
+    R, _, H, W = uncertainty_map.shape
+    h_step = 1.0 / float(H)
+    w_step = 1.0 / float(W)
+
+    num_points = min(H * W, num_points)
+    point_indices = torch.topk(uncertainty_map.view(R, H * W), k=num_points, dim=1)[1]
+    point_coords = torch.zeros(R, num_points, 2, dtype=torch.float, device=uncertainty_map.device)
+    point_coords[:, :, 0] = w_step / 2.0 + (point_indices % W).to(torch.float) * w_step
+    point_coords[:, :, 1] = h_step / 2.0 + (point_indices // W).to(torch.float) * h_step
+    return point_indices, point_coords
+
+
+def point_sample_fine_grained_features(features_list, feature_scales, boxes, point_coords):
+    """
+    Get features from feature maps in `features_list` that correspond to specific point coordinates
+        inside each bounding box from `boxes`.
+
+    Args:
+        features_list (list[Tensor]): A list of feature map tensors to get features from.
+        feature_scales (list[float]): A list of scales for tensors in `features_list`.
+        boxes (list[Boxes]): A list of I Boxes  objects that contain R_1 + ... + R_I = R boxes all
+            together.
+        point_coords (Tensor): A tensor of shape (R, P, 2) that contains
+            [0, 1] x [0, 1] box-normalized coordinates of the P sampled points.
+
+    Returns:
+        point_features (Tensor): A tensor of shape (R, C, P) that contains features sampled
+            from all features maps in feature_list for P sampled points for all R boxes in `boxes`.
+        point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains image-level
+            coordinates of P points.
+    """
+    cat_boxes = Boxes.cat(boxes)
+    num_boxes = [len(b) for b in boxes]
+
+    point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
+    split_point_coords_wrt_image = torch.split(point_coords_wrt_image, num_boxes)
+
+    point_features = []
+    for idx_img, point_coords_wrt_image_per_image in enumerate(split_point_coords_wrt_image):
+        point_features_per_image = []
+        for idx_feature, feature_map in enumerate(features_list):
+            h, w = feature_map.shape[-2:]
+            scale = torch.tensor([w, h], device=feature_map.device) / feature_scales[idx_feature]
+            point_coords_scaled = point_coords_wrt_image_per_image / scale
+            point_features_per_image.append(
+                point_sample(
+                    feature_map[idx_img].unsqueeze(0),
+                    point_coords_scaled.unsqueeze(0),
+                    align_corners=False,
+                )
+                .squeeze(0)
+                .transpose(1, 0)
+            )
+        point_features.append(cat(point_features_per_image, dim=1))
+
+    return cat(point_features, dim=0), point_coords_wrt_image
+
+
+def get_point_coords_wrt_image(boxes_coords, point_coords):
+    """
+    Convert box-normalized [0, 1] x [0, 1] point cooordinates to image-level coordinates.
+
+    Args:
+        boxes_coords (Tensor): A tensor of shape (R, 4) that contains bounding boxes.
+            coordinates.
+        point_coords (Tensor): A tensor of shape (R, P, 2) that contains
+            [0, 1] x [0, 1] box-normalized coordinates of the P sampled points.
+
+    Returns:
+        point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains
+            image-normalized coordinates of P sampled points.
+    """
+    with torch.no_grad():
+        point_coords_wrt_image = point_coords.clone()
+        point_coords_wrt_image[:, :, 0] = point_coords_wrt_image[:, :, 0] * (
+            boxes_coords[:, None, 2] - boxes_coords[:, None, 0]
+        )
+        point_coords_wrt_image[:, :, 1] = point_coords_wrt_image[:, :, 1] * (
+            boxes_coords[:, None, 3] - boxes_coords[:, None, 1]
+        )
+        point_coords_wrt_image[:, :, 0] += boxes_coords[:, None, 0]
+        point_coords_wrt_image[:, :, 1] += boxes_coords[:, None, 1]
+    return point_coords_wrt_image
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/point_head.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/point_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f35baea064fbee14d9bcd0b57e354f82bf54a8c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/point_head.py
@@ -0,0 +1,154 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import ShapeSpec, cat
+from detectron2.structures import BitMasks
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+from .point_features import point_sample
+
+POINT_HEAD_REGISTRY = Registry("POINT_HEAD")
+POINT_HEAD_REGISTRY.__doc__ = """
+Registry for point heads, which makes prediction for a given set of per-point features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+def roi_mask_point_loss(mask_logits, instances, points_coord):
+    """
+    Compute the point-based loss for instance segmentation mask predictions.
+
+    Args:
+        mask_logits (Tensor): A tensor of shape (R, C, P) or (R, 1, P) for class-specific or
+            class-agnostic, where R is the total number of predicted masks in all images, C is the
+            number of foreground classes, and P is the number of points sampled for each mask.
+            The values are logits.
+        instances (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. These instances are in 1:1 correspondence with the `mask_logits`. So, i_th
+            elememt of the list contains R_i objects and R_1 + ... + R_N is equal to R.
+            The ground-truth labels (class, box, mask, ...) associated with each instance are stored
+            in fields.
+        points_coords (Tensor): A tensor of shape (R, P, 2), where R is the total number of
+            predicted masks and P is the number of points for each mask. The coordinates are in
+            the image pixel coordinate space, i.e. [0, H] x [0, W].
+    Returns:
+        point_loss (Tensor): A scalar tensor containing the loss.
+    """
+    assert len(instances) == 0 or isinstance(
+        instances[0].gt_masks, BitMasks
+    ), "Point head works with GT in 'bitmask' format only. Set INPUT.MASK_FORMAT to 'bitmask'."
+    with torch.no_grad():
+        cls_agnostic_mask = mask_logits.size(1) == 1
+        total_num_masks = mask_logits.size(0)
+
+        gt_classes = []
+        gt_mask_logits = []
+        idx = 0
+        for instances_per_image in instances:
+            if not cls_agnostic_mask:
+                gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
+                gt_classes.append(gt_classes_per_image)
+
+            gt_bit_masks = instances_per_image.gt_masks.tensor
+            h, w = instances_per_image.gt_masks.image_size
+            scale = torch.tensor([w, h], dtype=torch.float, device=gt_bit_masks.device)
+            points_coord_grid_sample_format = (
+                points_coord[idx : idx + len(instances_per_image)] / scale
+            )
+            idx += len(instances_per_image)
+            gt_mask_logits.append(
+                point_sample(
+                    gt_bit_masks.to(torch.float32).unsqueeze(1),
+                    points_coord_grid_sample_format,
+                    align_corners=False,
+                ).squeeze(1)
+            )
+        gt_mask_logits = cat(gt_mask_logits)
+
+    # torch.mean (in binary_cross_entropy_with_logits) doesn't
+    # accept empty tensors, so handle it separately
+    if gt_mask_logits.numel() == 0:
+        return mask_logits.sum() * 0
+
+    if cls_agnostic_mask:
+        mask_logits = mask_logits[:, 0]
+    else:
+        indices = torch.arange(total_num_masks)
+        gt_classes = cat(gt_classes, dim=0)
+        mask_logits = mask_logits[indices, gt_classes]
+
+    # Log the training accuracy (using gt classes and 0.0 threshold for the logits)
+    mask_accurate = (mask_logits > 0.0) == gt_mask_logits.to(dtype=torch.uint8)
+    mask_accuracy = mask_accurate.nonzero().size(0) / mask_accurate.numel()
+    get_event_storage().put_scalar("point_rend/accuracy", mask_accuracy)
+
+    point_loss = F.binary_cross_entropy_with_logits(
+        mask_logits, gt_mask_logits.to(dtype=torch.float32), reduction="mean"
+    )
+    return point_loss
+
+
+@POINT_HEAD_REGISTRY.register()
+class StandardPointHead(nn.Module):
+    """
+    A point head multi-layer perceptron which we model with conv1d layers with kernel 1. The head
+    takes both fine-grained and coarse prediction features as its input.
+    """
+
+    def __init__(self, cfg, input_shape: ShapeSpec):
+        """
+        The following attributes are parsed from config:
+            fc_dim: the output dimension of each FC layers
+            num_fc: the number of FC layers
+            coarse_pred_each_layer: if True, coarse prediction features are concatenated to each
+                layer's input
+        """
+        super(StandardPointHead, self).__init__()
+        # fmt: off
+        num_classes                 = cfg.MODEL.POINT_HEAD.NUM_CLASSES
+        fc_dim                      = cfg.MODEL.POINT_HEAD.FC_DIM
+        num_fc                      = cfg.MODEL.POINT_HEAD.NUM_FC
+        cls_agnostic_mask           = cfg.MODEL.POINT_HEAD.CLS_AGNOSTIC_MASK
+        self.coarse_pred_each_layer = cfg.MODEL.POINT_HEAD.COARSE_PRED_EACH_LAYER
+        input_channels              = input_shape.channels
+        # fmt: on
+
+        fc_dim_in = input_channels + num_classes
+        self.fc_layers = []
+        for k in range(num_fc):
+            fc = nn.Conv1d(fc_dim_in, fc_dim, kernel_size=1, stride=1, padding=0, bias=True)
+            self.add_module("fc{}".format(k + 1), fc)
+            self.fc_layers.append(fc)
+            fc_dim_in = fc_dim
+            fc_dim_in += num_classes if self.coarse_pred_each_layer else 0
+
+        num_mask_classes = 1 if cls_agnostic_mask else num_classes
+        self.predictor = nn.Conv1d(fc_dim_in, num_mask_classes, kernel_size=1, stride=1, padding=0)
+
+        for layer in self.fc_layers:
+            weight_init.c2_msra_fill(layer)
+        # use normal distribution initialization for mask prediction layer
+        nn.init.normal_(self.predictor.weight, std=0.001)
+        if self.predictor.bias is not None:
+            nn.init.constant_(self.predictor.bias, 0)
+
+    def forward(self, fine_grained_features, coarse_features):
+        x = torch.cat((fine_grained_features, coarse_features), dim=1)
+        for layer in self.fc_layers:
+            x = F.relu(layer(x))
+            if self.coarse_pred_each_layer:
+                x = cat((x, coarse_features), dim=1)
+        return self.predictor(x)
+
+
+def build_point_head(cfg, input_channels):
+    """
+    Build a point head defined by `cfg.MODEL.POINT_HEAD.NAME`.
+    """
+    head_name = cfg.MODEL.POINT_HEAD.NAME
+    return POINT_HEAD_REGISTRY.get(head_name)(cfg, input_channels)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/roi_heads.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/roi_heads.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f7225bf10544461bbe1e3c777863557f2ad5808
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/roi_heads.py
@@ -0,0 +1,227 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import torch
+
+from detectron2.layers import ShapeSpec, cat, interpolate
+from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
+from detectron2.modeling.roi_heads.mask_head import (
+    build_mask_head,
+    mask_rcnn_inference,
+    mask_rcnn_loss,
+)
+from detectron2.modeling.roi_heads.roi_heads import select_foreground_proposals
+
+from .point_features import (
+    generate_regular_grid_point_coords,
+    get_uncertain_point_coords_on_grid,
+    get_uncertain_point_coords_with_randomness,
+    point_sample,
+    point_sample_fine_grained_features,
+)
+from .point_head import build_point_head, roi_mask_point_loss
+
+
+def calculate_uncertainty(logits, classes):
+    """
+    We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the
+        foreground class in `classes`.
+
+    Args:
+        logits (Tensor): A tensor of shape (R, C, ...) or (R, 1, ...) for class-specific or
+            class-agnostic, where R is the total number of predicted masks in all images and C is
+            the number of foreground classes. The values are logits.
+        classes (list): A list of length R that contains either predicted of ground truth class
+            for eash predicted mask.
+
+    Returns:
+        scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with
+            the most uncertain locations having the highest uncertainty score.
+    """
+    if logits.shape[1] == 1:
+        gt_class_logits = logits.clone()
+    else:
+        gt_class_logits = logits[
+            torch.arange(logits.shape[0], device=logits.device), classes
+        ].unsqueeze(1)
+    return -(torch.abs(gt_class_logits))
+
+
+@ROI_HEADS_REGISTRY.register()
+class PointRendROIHeads(StandardROIHeads):
+    """
+    The RoI heads class for PointRend instance segmentation models.
+
+    In this class we redefine the mask head of `StandardROIHeads` leaving all other heads intact.
+    To avoid namespace conflict with other heads we use names starting from `mask_` for all
+    variables that correspond to the mask head in the class's namespace.
+    """
+
+    def __init__(self, cfg, input_shape):
+        # TODO use explicit args style
+        super().__init__(cfg, input_shape)
+        self._init_mask_head(cfg, input_shape)
+
+    def _init_mask_head(self, cfg, input_shape):
+        # fmt: off
+        self.mask_on                 = cfg.MODEL.MASK_ON
+        if not self.mask_on:
+            return
+        self.mask_coarse_in_features = cfg.MODEL.ROI_MASK_HEAD.IN_FEATURES
+        self.mask_coarse_side_size   = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
+        self._feature_scales         = {k: 1.0 / v.stride for k, v in input_shape.items()}
+        # fmt: on
+
+        in_channels = np.sum([input_shape[f].channels for f in self.mask_coarse_in_features])
+        self.mask_coarse_head = build_mask_head(
+            cfg,
+            ShapeSpec(
+                channels=in_channels,
+                width=self.mask_coarse_side_size,
+                height=self.mask_coarse_side_size,
+            ),
+        )
+        self._init_point_head(cfg, input_shape)
+
+    def _init_point_head(self, cfg, input_shape):
+        # fmt: off
+        self.mask_point_on                      = cfg.MODEL.ROI_MASK_HEAD.POINT_HEAD_ON
+        if not self.mask_point_on:
+            return
+        assert cfg.MODEL.ROI_HEADS.NUM_CLASSES == cfg.MODEL.POINT_HEAD.NUM_CLASSES
+        self.mask_point_in_features             = cfg.MODEL.POINT_HEAD.IN_FEATURES
+        self.mask_point_train_num_points        = cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS
+        self.mask_point_oversample_ratio        = cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO
+        self.mask_point_importance_sample_ratio = cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO
+        # next two parameters are use in the adaptive subdivions inference procedure
+        self.mask_point_subdivision_steps       = cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS
+        self.mask_point_subdivision_num_points  = cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS
+        # fmt: on
+
+        in_channels = np.sum([input_shape[f].channels for f in self.mask_point_in_features])
+        self.mask_point_head = build_point_head(
+            cfg, ShapeSpec(channels=in_channels, width=1, height=1)
+        )
+
+    def _forward_mask(self, features, instances):
+        """
+        Forward logic of the mask prediction branch.
+
+        Args:
+            features (dict[str, Tensor]): #level input features for mask prediction
+            instances (list[Instances]): the per-image instances to train/predict masks.
+                In training, they can be the proposals.
+                In inference, they can be the predicted boxes.
+
+        Returns:
+            In training, a dict of losses.
+            In inference, update `instances` with new fields "pred_masks" and return it.
+        """
+        if not self.mask_on:
+            return {} if self.training else instances
+
+        if self.training:
+            proposals, _ = select_foreground_proposals(instances, self.num_classes)
+            proposal_boxes = [x.proposal_boxes for x in proposals]
+            mask_coarse_logits = self._forward_mask_coarse(features, proposal_boxes)
+
+            losses = {"loss_mask": mask_rcnn_loss(mask_coarse_logits, proposals)}
+            losses.update(self._forward_mask_point(features, mask_coarse_logits, proposals))
+            return losses
+        else:
+            pred_boxes = [x.pred_boxes for x in instances]
+            mask_coarse_logits = self._forward_mask_coarse(features, pred_boxes)
+
+            mask_logits = self._forward_mask_point(features, mask_coarse_logits, instances)
+            mask_rcnn_inference(mask_logits, instances)
+            return instances
+
+    def _forward_mask_coarse(self, features, boxes):
+        """
+        Forward logic of the coarse mask head.
+        """
+        point_coords = generate_regular_grid_point_coords(
+            np.sum(len(x) for x in boxes), self.mask_coarse_side_size, boxes[0].device
+        )
+        mask_coarse_features_list = [features[k] for k in self.mask_coarse_in_features]
+        features_scales = [self._feature_scales[k] for k in self.mask_coarse_in_features]
+        # For regular grids of points, this function is equivalent to `len(features_list)' calls
+        # of `ROIAlign` (with `SAMPLING_RATIO=2`), and concat the results.
+        mask_features, _ = point_sample_fine_grained_features(
+            mask_coarse_features_list, features_scales, boxes, point_coords
+        )
+        return self.mask_coarse_head(mask_features)
+
+    def _forward_mask_point(self, features, mask_coarse_logits, instances):
+        """
+        Forward logic of the mask point head.
+        """
+        if not self.mask_point_on:
+            return {} if self.training else mask_coarse_logits
+
+        mask_features_list = [features[k] for k in self.mask_point_in_features]
+        features_scales = [self._feature_scales[k] for k in self.mask_point_in_features]
+
+        if self.training:
+            proposal_boxes = [x.proposal_boxes for x in instances]
+            gt_classes = cat([x.gt_classes for x in instances])
+            with torch.no_grad():
+                point_coords = get_uncertain_point_coords_with_randomness(
+                    mask_coarse_logits,
+                    lambda logits: calculate_uncertainty(logits, gt_classes),
+                    self.mask_point_train_num_points,
+                    self.mask_point_oversample_ratio,
+                    self.mask_point_importance_sample_ratio,
+                )
+
+            fine_grained_features, point_coords_wrt_image = point_sample_fine_grained_features(
+                mask_features_list, features_scales, proposal_boxes, point_coords
+            )
+            coarse_features = point_sample(mask_coarse_logits, point_coords, align_corners=False)
+            point_logits = self.mask_point_head(fine_grained_features, coarse_features)
+            return {
+                "loss_mask_point": roi_mask_point_loss(
+                    point_logits, instances, point_coords_wrt_image
+                )
+            }
+        else:
+            pred_boxes = [x.pred_boxes for x in instances]
+            pred_classes = cat([x.pred_classes for x in instances])
+            # The subdivision code will fail with the empty list of boxes
+            if len(pred_classes) == 0:
+                return mask_coarse_logits
+
+            mask_logits = mask_coarse_logits.clone()
+            for subdivions_step in range(self.mask_point_subdivision_steps):
+                mask_logits = interpolate(
+                    mask_logits, scale_factor=2, mode="bilinear", align_corners=False
+                )
+                # If `mask_point_subdivision_num_points` is larger or equal to the
+                # resolution of the next step, then we can skip this step
+                H, W = mask_logits.shape[-2:]
+                if (
+                    self.mask_point_subdivision_num_points >= 4 * H * W
+                    and subdivions_step < self.mask_point_subdivision_steps - 1
+                ):
+                    continue
+                uncertainty_map = calculate_uncertainty(mask_logits, pred_classes)
+                point_indices, point_coords = get_uncertain_point_coords_on_grid(
+                    uncertainty_map, self.mask_point_subdivision_num_points
+                )
+                fine_grained_features, _ = point_sample_fine_grained_features(
+                    mask_features_list, features_scales, pred_boxes, point_coords
+                )
+                coarse_features = point_sample(
+                    mask_coarse_logits, point_coords, align_corners=False
+                )
+                point_logits = self.mask_point_head(fine_grained_features, coarse_features)
+
+                # put mask point predictions to the right places on the upsampled grid.
+                R, C, H, W = mask_logits.shape
+                point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
+                mask_logits = (
+                    mask_logits.reshape(R, C, H * W)
+                    .scatter_(2, point_indices, point_logits)
+                    .view(R, C, H, W)
+                )
+            return mask_logits
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/semantic_seg.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/semantic_seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..670a0ea201a6de82f3126171e6320d56f65e1ba7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/point_rend/semantic_seg.py
@@ -0,0 +1,134 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+from typing import Dict
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import ShapeSpec, cat
+from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
+
+from .point_features import (
+    get_uncertain_point_coords_on_grid,
+    get_uncertain_point_coords_with_randomness,
+    point_sample,
+)
+from .point_head import build_point_head
+
+
+def calculate_uncertainty(sem_seg_logits):
+    """
+    For each location of the prediction `sem_seg_logits` we estimate uncerainty as the
+        difference between top first and top second predicted logits.
+
+    Args:
+        mask_logits (Tensor): A tensor of shape (N, C, ...), where N is the minibatch size and
+            C is the number of foreground classes. The values are logits.
+
+    Returns:
+        scores (Tensor): A tensor of shape (N, 1, ...) that contains uncertainty scores with
+            the most uncertain locations having the highest uncertainty score.
+    """
+    top2_scores = torch.topk(sem_seg_logits, k=2, dim=1)[0]
+    return (top2_scores[:, 1] - top2_scores[:, 0]).unsqueeze(1)
+
+
+@SEM_SEG_HEADS_REGISTRY.register()
+class PointRendSemSegHead(nn.Module):
+    """
+    A semantic segmentation head that combines a head set in `POINT_HEAD.COARSE_SEM_SEG_HEAD_NAME`
+        and a point head set in `MODEL.POINT_HEAD.NAME`.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        super().__init__()
+
+        self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
+
+        self.coarse_sem_seg_head = SEM_SEG_HEADS_REGISTRY.get(
+            cfg.MODEL.POINT_HEAD.COARSE_SEM_SEG_HEAD_NAME
+        )(cfg, input_shape)
+        self._init_point_head(cfg, input_shape)
+
+    def _init_point_head(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        # fmt: off
+        assert cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES == cfg.MODEL.POINT_HEAD.NUM_CLASSES
+        feature_channels             = {k: v.channels for k, v in input_shape.items()}
+        self.in_features             = cfg.MODEL.POINT_HEAD.IN_FEATURES
+        self.train_num_points        = cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS
+        self.oversample_ratio        = cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO
+        self.importance_sample_ratio = cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO
+        self.subdivision_steps       = cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS
+        self.subdivision_num_points  = cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS
+        # fmt: on
+
+        in_channels = np.sum([feature_channels[f] for f in self.in_features])
+        self.point_head = build_point_head(cfg, ShapeSpec(channels=in_channels, width=1, height=1))
+
+    def forward(self, features, targets=None):
+        coarse_sem_seg_logits = self.coarse_sem_seg_head.layers(features)
+
+        if self.training:
+            losses = self.coarse_sem_seg_head.losses(coarse_sem_seg_logits, targets)
+
+            with torch.no_grad():
+                point_coords = get_uncertain_point_coords_with_randomness(
+                    coarse_sem_seg_logits,
+                    calculate_uncertainty,
+                    self.train_num_points,
+                    self.oversample_ratio,
+                    self.importance_sample_ratio,
+                )
+            coarse_features = point_sample(coarse_sem_seg_logits, point_coords, align_corners=False)
+
+            fine_grained_features = cat(
+                [
+                    point_sample(features[in_feature], point_coords, align_corners=False)
+                    for in_feature in self.in_features
+                ]
+            )
+            point_logits = self.point_head(fine_grained_features, coarse_features)
+            point_targets = (
+                point_sample(
+                    targets.unsqueeze(1).to(torch.float),
+                    point_coords,
+                    mode="nearest",
+                    align_corners=False,
+                )
+                .squeeze(1)
+                .to(torch.long)
+            )
+            losses["loss_sem_seg_point"] = F.cross_entropy(
+                point_logits, point_targets, reduction="mean", ignore_index=self.ignore_value
+            )
+            return None, losses
+        else:
+            sem_seg_logits = coarse_sem_seg_logits.clone()
+            for _ in range(self.subdivision_steps):
+                sem_seg_logits = F.interpolate(
+                    sem_seg_logits, scale_factor=2, mode="bilinear", align_corners=False
+                )
+                uncertainty_map = calculate_uncertainty(sem_seg_logits)
+                point_indices, point_coords = get_uncertain_point_coords_on_grid(
+                    uncertainty_map, self.subdivision_num_points
+                )
+                fine_grained_features = cat(
+                    [
+                        point_sample(features[in_feature], point_coords, align_corners=False)
+                        for in_feature in self.in_features
+                    ]
+                )
+                coarse_features = point_sample(
+                    coarse_sem_seg_logits, point_coords, align_corners=False
+                )
+                point_logits = self.point_head(fine_grained_features, coarse_features)
+
+                # put sem seg point predictions to the right places on the upsampled grid.
+                N, C, H, W = sem_seg_logits.shape
+                point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
+                sem_seg_logits = (
+                    sem_seg_logits.reshape(N, C, H * W)
+                    .scatter_(2, point_indices, point_logits)
+                    .view(N, C, H, W)
+                )
+            return sem_seg_logits, {}
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/run.sh b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4ee1614b02f784cb46fa65243174ea3588eb1adc
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/run.sh
@@ -0,0 +1,2 @@
+python finetune_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_X_101_32x8d_FPN_3x_parsing.yaml --num-gpus 1
+#python finetune_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_parsing.yaml --num-gpus 1
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/train_net.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/train_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..7832867ec668c5715c4124c02b72909a318836e8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/PointRend/train_net.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+"""
+PointRend Training Script.
+
+This script is a simplified version of the training script in detectron2/tools.
+"""
+
+import os
+import torch
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import MetadataCatalog, build_detection_train_loader
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
+from detectron2.evaluation import (
+    CityscapesInstanceEvaluator,
+    CityscapesSemSegEvaluator,
+    COCOEvaluator,
+    DatasetEvaluators,
+    LVISEvaluator,
+    SemSegEvaluator,
+    verify_results,
+)
+
+from point_rend import SemSegDatasetMapper, add_pointrend_config
+
+
+class Trainer(DefaultTrainer):
+    """
+    We use the "DefaultTrainer" which contains a number pre-defined logic for
+    standard training workflow. They may not work for you, especially if you
+    are working on a new research project. In that case you can use the cleaner
+    "SimpleTrainer", or write your own training loop.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each builtin dataset.
+        For your own dataset, you can simply create an evaluator manually in your
+        script and do not have to worry about the hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        if evaluator_type == "lvis":
+            return LVISEvaluator(dataset_name, cfg, True, output_folder)
+        if evaluator_type == "coco":
+            return COCOEvaluator(dataset_name, cfg, True, output_folder)
+        if evaluator_type == "sem_seg":
+            return SemSegEvaluator(
+                dataset_name,
+                distributed=True,
+                num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+                ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+                output_dir=output_folder,
+            )
+        if evaluator_type == "cityscapes_instance":
+            assert (
+                torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesInstanceEvaluator(dataset_name)
+        if evaluator_type == "cityscapes_sem_seg":
+            assert (
+                torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesSemSegEvaluator(dataset_name)
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        if len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+    @classmethod
+    def build_train_loader(cls, cfg):
+        if "SemanticSegmentor" in cfg.MODEL.META_ARCHITECTURE:
+            mapper = SemSegDatasetMapper(cfg, True)
+        else:
+            mapper = None
+        return build_detection_train_loader(cfg, mapper=mapper)
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    add_pointrend_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/README.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..36263bd87401a98f273831f4ec98fcb5c65d3412
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/README.md
@@ -0,0 +1,31 @@
+
+Here are a few projects that are built on detectron2.
+They are examples of how to use detectron2 as a library, to make your projects more
+maintainable.
+
+## Projects by Facebook
+
+Note that these are research projects, and therefore may not have the same level
+of support or stability of detectron2.
+
++ [DensePose: Dense Human Pose Estimation In The Wild](DensePose)
++ [Scale-Aware Trident Networks for Object Detection](TridentNet)
++ [TensorMask: A Foundation for Dense Object Segmentation](TensorMask)
++ [Mesh R-CNN](https://github.com/facebookresearch/meshrcnn)
++ [PointRend: Image Segmentation as Rendering](PointRend)
++ [Momentum Contrast for Unsupervised Visual Representation Learning](https://github.com/facebookresearch/moco/tree/master/detection)
+
+
+## External Projects
+
+External projects in the community that use detectron2:
+
+<!--
+ - If you want to contribute, note that:
+ -  1. please add your project to the end of the list and try to use only one line
+ -  2. the project must provide models trained on standard data
+ -->
+
++ [VoVNet backbones](https://github.com/youngwanLEE/vovnet-detectron2).
++ [AdelaiDet](https://github.com/aim-uofa/adet), a detection toolbox from the Universtiy of Adelaide.
++ [CenterMask : Real-Time Anchor-Free Instance Segmentation](https://github.com/youngwanLEE/centermask2)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/README.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6831508b9aea37f0e88bec62c98f2bf2b64240ab
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/README.md
@@ -0,0 +1,64 @@
+
+# TensorMask in Detectron2
+**A Foundation for Dense Object Segmentation**
+
+Xinlei Chen, Ross Girshick, Kaiming He, Piotr Dollár
+
+[[`arXiv`](https://arxiv.org/abs/1903.12174)] [[`BibTeX`](#CitingTensorMask)]
+
+<div align="center">
+  <img src="http://xinleic.xyz/images/tmask.png" width="700px" />
+</div>
+
+In this repository, we release code for TensorMask in Detectron2.
+TensorMask is a dense sliding-window instance segmentation framework that, for the first time, achieves results close to the well-developed Mask R-CNN framework -- both qualitatively and quantitatively. It establishes a conceptually complementary direction for object instance segmentation research.
+
+## Installation
+First install Detectron2 following the [documentation](https://detectron2.readthedocs.io/tutorials/install.html) and
+[setup the dataset](../../datasets). Then compile the TensorMask-specific op (`swap_align2nat`):
+```bash
+cd /path/to/detectron2/projects/TensorMask
+python setup.py build develop
+```
+
+## Training
+
+To train a model, run:
+```bash
+python /path/to/detectron2/projects/TensorMask/train_net.py --config-file <config.yaml>
+```
+
+For example, to launch TensorMask BiPyramid training (1x schedule) with ResNet-50 backbone on 8 GPUs,
+one should execute:
+```bash
+python /path/to/detectron2/projects/TensorMask/train_net.py --config-file configs/tensormask_R_50_FPN_1x.yaml --num-gpus 8
+```
+
+## Evaluation
+
+Model evaluation can be done similarly (6x schedule with scale augmentation):
+```bash
+python /path/to/detectron2/projects/TensorMask/train_net.py --config-file configs/tensormask_R_50_FPN_6x.yaml --eval-only MODEL.WEIGHTS /path/to/model_checkpoint
+```
+
+# Pretrained Models
+
+| Backbone | lr sched | AP box | AP mask | download                                                                                                                                    |
+| -------- | -------- | --     | ---  | --------                                                                                                                                    |
+| R50      | 1x       | 37.6   | 32.4 | <a href="https://dl.fbaipublicfiles.com/detectron2/TensorMask/tensormask_R_50_FPN_1x/152549419/model_final_8f325c.pkl">model</a>&nbsp;\| &nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TensorMask/tensormask_R_50_FPN_1x/152549419/metrics.json">metrics</a> |
+| R50      | 6x       | 41.4   | 35.8 | <a href="https://dl.fbaipublicfiles.com/detectron2/TensorMask/tensormask_R_50_FPN_6x/153538791/model_final_e8df31.pkl">model</a>&nbsp;\| &nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TensorMask/tensormask_R_50_FPN_6x/153538791/metrics.json">metrics</a> |
+
+
+## <a name="CitingTensorMask"></a>Citing TensorMask
+
+If you use TensorMask, please use the following BibTeX entry.
+
+```
+@InProceedings{chen2019tensormask,
+  title={Tensormask: A Foundation for Dense Object Segmentation},
+  author={Chen, Xinlei and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr},
+  journal={The International Conference on Computer Vision (ICCV)},
+  year={2019}
+}
+```
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/configs/Base-TensorMask.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/configs/Base-TensorMask.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a7245349b4aa9cfa00f20074cc7cb5cdb02607f9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/configs/Base-TensorMask.yaml
@@ -0,0 +1,25 @@
+MODEL:
+  META_ARCHITECTURE: "TensorMask"
+  MASK_ON: True
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[44, 60], [88, 120], [176, 240], [352, 480], [704, 960], [1408, 1920]]
+    ASPECT_RATIOS: [[1.0]]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    FUSE_TYPE: "avg"
+  TENSOR_MASK:
+    ALIGNED_ON: True
+    BIPYRAMID_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+VERSION: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5d5eee135a93149a0c4b2148a47cee02e8aed8eb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml
@@ -0,0 +1,5 @@
+_BASE_: "Base-TensorMask.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..366a965c4adfdbba2482593c0c81f3e6af50dfd2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml
@@ -0,0 +1,11 @@
+_BASE_: "Base-TensorMask.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (480000, 520000)
+  MAX_ITER: 540000
+INPUT:
+  MIN_SIZE_TRAIN_SAMPLING: "range"
+  MIN_SIZE_TRAIN: (640, 800)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/setup.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..0194e76608966b528ab32879edc40a8e4ac3225f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/setup.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import glob
+import os
+from setuptools import find_packages, setup
+import torch
+from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
+
+
+def get_extensions():
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "tensormask", "layers", "csrc")
+
+    main_source = os.path.join(extensions_dir, "vision.cpp")
+    sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp"))
+    source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu")) + glob.glob(
+        os.path.join(extensions_dir, "*.cu")
+    )
+
+    sources = [main_source] + sources
+
+    extension = CppExtension
+
+    extra_compile_args = {"cxx": []}
+    define_macros = []
+
+    if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1":
+        extension = CUDAExtension
+        sources += source_cuda
+        define_macros += [("WITH_CUDA", None)]
+        extra_compile_args["nvcc"] = [
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+
+        # It's better if pytorch can do this by default ..
+        CC = os.environ.get("CC", None)
+        if CC is not None:
+            extra_compile_args["nvcc"].append("-ccbin={}".format(CC))
+
+    sources = [os.path.join(extensions_dir, s) for s in sources]
+
+    include_dirs = [extensions_dir]
+
+    ext_modules = [
+        extension(
+            "tensormask._C",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+
+    return ext_modules
+
+
+setup(
+    name="tensormask",
+    version="0.1",
+    author="FAIR",
+    packages=find_packages(exclude=("configs", "tests")),
+    python_requires=">=3.6",
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3b642a55519867dc52ccc57a36c32c72c3d34da
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .config import add_tensormask_config
+from .arch import TensorMask
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/arch.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/arch.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3e89c6b4283b28fe8028300e146d7b7543f0da1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/arch.py
@@ -0,0 +1,904 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import math
+from typing import List
+import torch
+import torch.nn.functional as F
+from fvcore.nn import sigmoid_focal_loss_star_jit, smooth_l1_loss
+from torch import nn
+
+from detectron2.layers import ShapeSpec, batched_nms, cat, paste_masks_in_image
+from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
+from detectron2.modeling.backbone import build_backbone
+from detectron2.modeling.box_regression import Box2BoxTransform
+from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
+from detectron2.modeling.meta_arch.retinanet import (
+    permute_all_cls_and_box_to_N_HWA_K_and_concat,
+    permute_to_N_HWA_K,
+)
+from detectron2.structures import Boxes, ImageList, Instances
+from detectron2.utils.logger import log_first_n
+
+from tensormask.layers import SwapAlign2Nat
+
+__all__ = ["TensorMask"]
+
+
+def _assignment_rule(
+    gt_boxes,
+    anchor_boxes,
+    unit_lengths,
+    min_anchor_size,
+    scale_thresh=2.0,
+    spatial_thresh=1.0,
+    uniqueness_on=True,
+):
+    """
+    Given two lists of boxes of N ground truth boxes and M anchor boxes,
+    compute the assignment between the two, following the assignment rules in
+    https://arxiv.org/abs/1903.12174.
+    The box order must be (xmin, ymin, xmax, ymax), so please make sure to convert
+    to BoxMode.XYXY_ABS before calling this function.
+
+    Args:
+        gt_boxes, anchor_boxes (Boxes): two Boxes. Contains N & M boxes/anchors, respectively.
+        unit_lengths (Tensor): Contains the unit lengths of M anchor boxes.
+        min_anchor_size (float): Minimum size of the anchor, in pixels
+        scale_thresh (float): The `scale` threshold: the maximum size of the anchor
+                              should not be greater than scale_thresh x max(h, w) of
+                              the ground truth box.
+        spatial_thresh (float): The `spatial` threshold: the l2 distance between the
+                              center of the anchor and the ground truth box should not
+                              be greater than spatial_thresh x u where u is the unit length.
+
+    Returns:
+        matches (Tensor[int64]): a vector of length M, where matches[i] is a matched
+                ground-truth index in [0, N)
+        match_labels (Tensor[int8]): a vector of length M, where pred_labels[i] indicates
+            whether a prediction is a true or false positive or ignored
+    """
+    gt_boxes, anchor_boxes = gt_boxes.tensor, anchor_boxes.tensor
+    N = gt_boxes.shape[0]
+    M = anchor_boxes.shape[0]
+    if N == 0 or M == 0:
+        return (
+            gt_boxes.new_full((N,), 0, dtype=torch.int64),
+            gt_boxes.new_full((N,), -1, dtype=torch.int8),
+        )
+
+    # Containment rule
+    lt = torch.min(gt_boxes[:, None, :2], anchor_boxes[:, :2])  # [N,M,2]
+    rb = torch.max(gt_boxes[:, None, 2:], anchor_boxes[:, 2:])  # [N,M,2]
+    union = cat([lt, rb], dim=2)  # [N,M,4]
+
+    dummy_gt_boxes = torch.zeros_like(gt_boxes)
+    anchor = dummy_gt_boxes[:, None, :] + anchor_boxes[:, :]  # [N,M,4]
+
+    contain_matrix = torch.all(union == anchor, dim=2)  # [N,M]
+
+    # Centrality rule, scale
+    gt_size_lower = torch.max(gt_boxes[:, 2:] - gt_boxes[:, :2], dim=1)[0]  # [N]
+    gt_size_upper = gt_size_lower * scale_thresh  # [N]
+    # Fall back for small objects
+    gt_size_upper[gt_size_upper < min_anchor_size] = min_anchor_size
+    # Due to sampling of locations, the anchor sizes are deducted with sampling strides
+    anchor_size = (
+        torch.max(anchor_boxes[:, 2:] - anchor_boxes[:, :2], dim=1)[0] - unit_lengths
+    )  # [M]
+
+    size_diff_upper = gt_size_upper[:, None] - anchor_size  # [N,M]
+    scale_matrix = size_diff_upper >= 0  # [N,M]
+
+    # Centrality rule, spatial
+    gt_center = (gt_boxes[:, 2:] + gt_boxes[:, :2]) / 2  # [N,2]
+    anchor_center = (anchor_boxes[:, 2:] + anchor_boxes[:, :2]) / 2  # [M,2]
+    offset_center = gt_center[:, None, :] - anchor_center[:, :]  # [N,M,2]
+    offset_center /= unit_lengths[:, None]  # [N,M,2]
+    spatial_square = spatial_thresh * spatial_thresh
+    spatial_matrix = torch.sum(offset_center * offset_center, dim=2) <= spatial_square
+
+    assign_matrix = (contain_matrix & scale_matrix & spatial_matrix).int()
+
+    # assign_matrix is N (gt) x M (predicted)
+    # Max over gt elements (dim 0) to find best gt candidate for each prediction
+    matched_vals, matches = assign_matrix.max(dim=0)
+    match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
+
+    match_labels[matched_vals == 0] = 0
+    match_labels[matched_vals == 1] = 1
+
+    # find all the elements that match to ground truths multiple times
+    not_unique_idxs = assign_matrix.sum(dim=0) > 1
+    if uniqueness_on:
+        match_labels[not_unique_idxs] = 0
+    else:
+        match_labels[not_unique_idxs] = -1
+
+    return matches, match_labels
+
+
+# TODO make the paste_mask function in d2 core support mask list
+def _paste_mask_lists_in_image(masks, boxes, image_shape, threshold=0.5):
+    """
+    Paste a list of masks that are of various resolutions (e.g., 28 x 28) into an image.
+    The location, height, and width for pasting each mask is determined by their
+    corresponding bounding boxes in boxes.
+
+    Args:
+        masks (list(Tensor)): A list of Tensor of shape (1, Hmask_i, Wmask_i).
+                            Values are in [0, 1]. The list length, Bimg, is the
+                            number of detected object instances in the image.
+        boxes (Boxes): A Boxes of length Bimg. boxes.tensor[i] and masks[i] correspond
+                            to the same object instance.
+        image_shape (tuple): height, width
+        threshold (float): A threshold in [0, 1] for converting the (soft) masks to
+            binary masks.
+
+    Returns:
+        img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
+        number of detected object instances and Himage, Wimage are the image width
+        and height. img_masks[i] is a binary mask for object instance i.
+    """
+    if len(masks) == 0:
+        return torch.empty((0, 1) + image_shape, dtype=torch.uint8)
+
+    # Loop over masks groups. Each group has the same mask prediction size.
+    img_masks = []
+    ind_masks = []
+    mask_sizes = torch.tensor([m.shape[-1] for m in masks])
+    unique_sizes = torch.unique(mask_sizes)
+    for msize in unique_sizes.tolist():
+        cur_ind = torch.where(mask_sizes == msize)[0]
+        ind_masks.append(cur_ind)
+
+        cur_masks = cat([masks[i] for i in cur_ind])
+        cur_boxes = boxes[cur_ind]
+        img_masks.append(paste_masks_in_image(cur_masks, cur_boxes, image_shape, threshold))
+
+    img_masks = cat(img_masks)
+    ind_masks = cat(ind_masks)
+
+    img_masks_out = torch.empty_like(img_masks)
+    img_masks_out[ind_masks, :, :] = img_masks
+
+    return img_masks_out
+
+
+def _postprocess(results, result_mask_info, output_height, output_width, mask_threshold=0.5):
+    """
+    Post-process the output boxes for TensorMask.
+    The input images are often resized when entering an object detector.
+    As a result, we often need the outputs of the detector in a different
+    resolution from its inputs.
+
+    This function will postprocess the raw outputs of TensorMask
+    to produce outputs according to the desired output resolution.
+
+    Args:
+        results (Instances): the raw outputs from the detector.
+            `results.image_size` contains the input image resolution the detector sees.
+            This object might be modified in-place. Note that it does not contain the field
+            `pred_masks`, which is provided by another input `result_masks`.
+        result_mask_info (list[Tensor], Boxes): a pair of two items for mask related results.
+                The first item is a list of #detection tensors, each is the predicted masks.
+                The second item is the anchors corresponding to the predicted masks.
+        output_height, output_width: the desired output resolution.
+
+    Returns:
+        Instances: the postprocessed output from the model, based on the output resolution
+    """
+    scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0])
+    results = Instances((output_height, output_width), **results.get_fields())
+
+    output_boxes = results.pred_boxes
+    output_boxes.tensor[:, 0::2] *= scale_x
+    output_boxes.tensor[:, 1::2] *= scale_y
+    output_boxes.clip(results.image_size)
+
+    inds_nonempty = output_boxes.nonempty()
+    results = results[inds_nonempty]
+    result_masks, result_anchors = result_mask_info
+    if result_masks:
+        result_anchors.tensor[:, 0::2] *= scale_x
+        result_anchors.tensor[:, 1::2] *= scale_y
+        result_masks = [x for (i, x) in zip(inds_nonempty.tolist(), result_masks) if i]
+        results.pred_masks = _paste_mask_lists_in_image(
+            result_masks,
+            result_anchors[inds_nonempty],
+            results.image_size,
+            threshold=mask_threshold,
+        )
+    return results
+
+
+class TensorMaskAnchorGenerator(DefaultAnchorGenerator):
+    """
+    For a set of image sizes and feature maps, computes a set of anchors for TensorMask.
+    It also computes the unit lengths and indexes for each anchor box.
+    """
+
+    def grid_anchors_with_unit_lengths_and_indexes(self, grid_sizes):
+        anchors = []
+        unit_lengths = []
+        indexes = []
+        for lvl, (size, stride, base_anchors) in enumerate(
+            zip(grid_sizes, self.strides, self.cell_anchors)
+        ):
+            grid_height, grid_width = size
+            device = base_anchors.device
+            shifts_x = torch.arange(
+                0, grid_width * stride, step=stride, dtype=torch.float32, device=device
+            )
+            shifts_y = torch.arange(
+                0, grid_height * stride, step=stride, dtype=torch.float32, device=device
+            )
+            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
+            shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=2)
+            # Stack anchors in shapes of (HWA, 4)
+            cur_anchor = (shifts[:, :, None, :] + base_anchors.view(1, 1, -1, 4)).view(-1, 4)
+            anchors.append(cur_anchor)
+            unit_lengths.append(
+                torch.full((cur_anchor.shape[0],), stride, dtype=torch.float32, device=device)
+            )
+            # create mask indexes using mesh grid
+            shifts_l = torch.full((1,), lvl, dtype=torch.int64, device=device)
+            shifts_i = torch.zeros((1,), dtype=torch.int64, device=device)
+            shifts_h = torch.arange(0, grid_height, dtype=torch.int64, device=device)
+            shifts_w = torch.arange(0, grid_width, dtype=torch.int64, device=device)
+            shifts_a = torch.arange(0, base_anchors.shape[0], dtype=torch.int64, device=device)
+            grids = torch.meshgrid(shifts_l, shifts_i, shifts_h, shifts_w, shifts_a)
+
+            indexes.append(torch.stack(grids, dim=5).view(-1, 5))
+
+        return anchors, unit_lengths, indexes
+
+    def forward(self, features):
+        """
+        Returns:
+            list[list[Boxes]]: a list of #image elements. Each is a list of #feature level Boxes.
+                The Boxes contains anchors of this image on the specific feature level.
+            list[list[Tensor]]: a list of #image elements. Each is a list of #feature level tensors.
+                The tensor contains strides, or unit lengths for the anchors.
+            list[list[Tensor]]: a list of #image elements. Each is a list of #feature level tensors.
+                The Tensor contains indexes for the anchors, with the last dimension meaning
+                (L, N, H, W, A), where L is level, I is image (not set yet), H is height,
+                W is width, and A is anchor.
+        """
+        num_images = len(features[0])
+        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
+        anchors_list, lengths_list, indexes_list = self.grid_anchors_with_unit_lengths_and_indexes(
+            grid_sizes
+        )
+
+        # Convert anchors from Tensor to Boxes
+        anchors_per_im = [Boxes(x) for x in anchors_list]
+
+        # TODO it can be simplified to not return duplicated information for
+        # each image, just like detectron2's own AnchorGenerator
+        anchors = [copy.deepcopy(anchors_per_im) for _ in range(num_images)]
+        unit_lengths = [copy.deepcopy(lengths_list) for _ in range(num_images)]
+        indexes = [copy.deepcopy(indexes_list) for _ in range(num_images)]
+
+        return anchors, unit_lengths, indexes
+
+
+@META_ARCH_REGISTRY.register()
+class TensorMask(nn.Module):
+    """
+    TensorMask model. Creates FPN backbone, anchors and a head for classification
+    and box regression. Calculates and applies proper losses to class, box, and
+    masks.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+
+        # fmt: off
+        self.num_classes              = cfg.MODEL.TENSOR_MASK.NUM_CLASSES
+        self.in_features              = cfg.MODEL.TENSOR_MASK.IN_FEATURES
+        self.anchor_sizes             = cfg.MODEL.ANCHOR_GENERATOR.SIZES
+        self.num_levels               = len(cfg.MODEL.ANCHOR_GENERATOR.SIZES)
+        # Loss parameters:
+        self.focal_loss_alpha         = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA
+        self.focal_loss_gamma         = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA
+        # Inference parameters:
+        self.score_threshold          = cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST
+        self.topk_candidates          = cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST
+        self.nms_threshold            = cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST
+        self.detections_im            = cfg.TEST.DETECTIONS_PER_IMAGE
+        # Mask parameters:
+        self.mask_on                  = cfg.MODEL.MASK_ON
+        self.mask_loss_weight         = cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT
+        self.mask_pos_weight          = torch.tensor(cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT,
+                                                     dtype=torch.float32)
+        self.bipyramid_on             = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON
+        # fmt: on
+
+        # build the backbone
+        self.backbone = build_backbone(cfg)
+
+        backbone_shape = self.backbone.output_shape()
+        feature_shapes = [backbone_shape[f] for f in self.in_features]
+        feature_strides = [x.stride for x in feature_shapes]
+        # build anchors
+        self.anchor_generator = TensorMaskAnchorGenerator(cfg, feature_shapes)
+        self.num_anchors = self.anchor_generator.num_cell_anchors[0]
+        anchors_min_level = cfg.MODEL.ANCHOR_GENERATOR.SIZES[0]
+        self.mask_sizes = [size // feature_strides[0] for size in anchors_min_level]
+        self.min_anchor_size = min(anchors_min_level) - feature_strides[0]
+
+        # head of the TensorMask
+        self.head = TensorMaskHead(
+            cfg, self.num_levels, self.num_anchors, self.mask_sizes, feature_shapes
+        )
+        # box transform
+        self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS)
+        self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
+        self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DetectionTransform` .
+                Each item in the list contains the inputs for one image.
+            For now, each item in the list is a dict that contains:
+                image: Tensor, image in (C, H, W) format.
+                instances: Instances
+                Other information that's included in the original dicts, such as:
+                    "height", "width" (int): the output resolution of the model, used in inference.
+                        See :meth:`postprocess` for details.
+         Returns:
+            losses (dict[str: Tensor]): mapping from a named loss to a tensor
+                storing the loss. Used during training only.
+        """
+        images = self.preprocess_image(batched_inputs)
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        elif "targets" in batched_inputs[0]:
+            log_first_n(
+                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
+            )
+            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        features = self.backbone(images.tensor)
+        features = [features[f] for f in self.in_features]
+        # apply the TensorMask head
+        pred_logits, pred_deltas, pred_masks = self.head(features)
+        # generate anchors based on features, is it image specific?
+        anchors, unit_lengths, indexes = self.anchor_generator(features)
+
+        if self.training:
+            # get ground truths for class labels and box targets, it will label each anchor
+            gt_class_info, gt_delta_info, gt_mask_info, num_fg = self.get_ground_truth(
+                anchors, unit_lengths, indexes, gt_instances
+            )
+            # compute the loss
+            return self.losses(
+                gt_class_info,
+                gt_delta_info,
+                gt_mask_info,
+                num_fg,
+                pred_logits,
+                pred_deltas,
+                pred_masks,
+            )
+        else:
+            # do inference to get the output
+            results = self.inference(pred_logits, pred_deltas, pred_masks, anchors, indexes, images)
+            processed_results = []
+            for results_im, input_im, image_size in zip(
+                results, batched_inputs, images.image_sizes
+            ):
+                height = input_im.get("height", image_size[0])
+                width = input_im.get("width", image_size[1])
+                # this is to do post-processing with the image size
+                result_box, result_mask = results_im
+                r = _postprocess(result_box, result_mask, height, width)
+                processed_results.append({"instances": r})
+            return processed_results
+
+    def losses(
+        self,
+        gt_class_info,
+        gt_delta_info,
+        gt_mask_info,
+        num_fg,
+        pred_logits,
+        pred_deltas,
+        pred_masks,
+    ):
+        """
+        Args:
+            For `gt_class_info`, `gt_delta_info`, `gt_mask_info` and `num_fg` parameters, see
+                :meth:`TensorMask.get_ground_truth`.
+            For `pred_logits`, `pred_deltas` and `pred_masks`, see
+                :meth:`TensorMaskHead.forward`.
+
+        Returns:
+            losses (dict[str: Tensor]): mapping from a named loss to a scalar tensor
+                storing the loss. Used during training only. The potential dict keys are:
+                "loss_cls", "loss_box_reg" and "loss_mask".
+        """
+        gt_classes_target, gt_valid_inds = gt_class_info
+        gt_deltas, gt_fg_inds = gt_delta_info
+        gt_masks, gt_mask_inds = gt_mask_info
+        loss_normalizer = torch.tensor(max(1, num_fg), dtype=torch.float32, device=self.device)
+
+        # classification and regression
+        pred_logits, pred_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
+            pred_logits, pred_deltas, self.num_classes
+        )
+        loss_cls = (
+            sigmoid_focal_loss_star_jit(
+                pred_logits[gt_valid_inds],
+                gt_classes_target[gt_valid_inds],
+                alpha=self.focal_loss_alpha,
+                gamma=self.focal_loss_gamma,
+                reduction="sum",
+            )
+            / loss_normalizer
+        )
+
+        if num_fg == 0:
+            loss_box_reg = pred_deltas.sum() * 0
+        else:
+            loss_box_reg = (
+                smooth_l1_loss(pred_deltas[gt_fg_inds], gt_deltas, beta=0.0, reduction="sum")
+                / loss_normalizer
+            )
+        losses = {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
+
+        # mask prediction
+        if self.mask_on:
+            loss_mask = 0
+            for lvl in range(self.num_levels):
+                cur_level_factor = 2 ** lvl if self.bipyramid_on else 1
+                for anc in range(self.num_anchors):
+                    cur_gt_mask_inds = gt_mask_inds[lvl][anc]
+                    if cur_gt_mask_inds is None:
+                        loss_mask += pred_masks[lvl][anc][0, 0, 0, 0] * 0
+                    else:
+                        cur_mask_size = self.mask_sizes[anc] * cur_level_factor
+                        # TODO maybe there are numerical issues when mask sizes are large
+                        cur_size_divider = torch.tensor(
+                            self.mask_loss_weight / (cur_mask_size ** 2),
+                            dtype=torch.float32,
+                            device=self.device,
+                        )
+
+                        cur_pred_masks = pred_masks[lvl][anc][
+                            cur_gt_mask_inds[:, 0],  # N
+                            :,  # V x U
+                            cur_gt_mask_inds[:, 1],  # H
+                            cur_gt_mask_inds[:, 2],  # W
+                        ]
+
+                        loss_mask += F.binary_cross_entropy_with_logits(
+                            cur_pred_masks.view(-1, cur_mask_size, cur_mask_size),  # V, U
+                            gt_masks[lvl][anc].to(dtype=torch.float32),
+                            reduction="sum",
+                            weight=cur_size_divider,
+                            pos_weight=self.mask_pos_weight,
+                        )
+            losses["loss_mask"] = loss_mask / loss_normalizer
+        return losses
+
+    @torch.no_grad()
+    def get_ground_truth(self, anchors, unit_lengths, indexes, targets):
+        """
+        Args:
+            anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
+                list of #feature level Boxes. The Boxes contains anchors of
+                this image on the specific feature level.
+            unit_lengths (list[list[Tensor]]): a list of N=#image elements. Each is a
+                list of #feature level Tensor. The tensor contains unit lengths for anchors of
+                this image on the specific feature level.
+            indexes (list[list[Tensor]]): a list of N=#image elements. Each is a
+                list of #feature level Tensor. The tensor contains the 5D index of
+                each anchor, the second dimension means (L, N, H, W, A), where L
+                is level, I is image, H is height, W is width, and A is anchor.
+            targets (list[Instances]): a list of N `Instances`s. The i-th
+                `Instances` contains the ground-truth per-instance annotations
+                for the i-th input image.  Specify `targets` during training only.
+
+        Returns:
+            gt_class_info (Tensor, Tensor): A pair of two tensors for classification.
+                The first one is an integer tensor of shape (R, #classes) storing ground-truth
+                labels for each anchor. R is the total number of anchors in the batch.
+                The second one is an integer tensor of shape (R,), to indicate which
+                anchors are valid for loss computation, which anchors are not.
+            gt_delta_info (Tensor, Tensor): A pair of two tensors for boxes.
+                The first one, of shape (F, 4). F=#foreground anchors.
+                The last dimension represents ground-truth box2box transform
+                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
+                Only foreground anchors have values in this tensor. Could be `None` if F=0.
+                The second one, of shape (R,), is an integer tensor indicating which anchors
+                are foreground ones used for box regression. Could be `None` if F=0.
+            gt_mask_info (list[list[Tensor]], list[list[Tensor]]): A pair of two lists for masks.
+                The first one is a list of P=#feature level elements. Each is a
+                list of A=#anchor tensors. Each tensor contains the ground truth
+                masks of the same size and for the same feature level. Could be `None`.
+                The second one is a list of P=#feature level elements. Each is a
+                list of A=#anchor tensors. Each tensor contains the location of the ground truth
+                masks of the same size and for the same feature level. The second dimension means
+                (N, H, W), where N is image, H is height, and W is width. Could be `None`.
+            num_fg (int): F=#foreground anchors, used later for loss normalization.
+        """
+        gt_classes = []
+        gt_deltas = []
+        gt_masks = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)]
+        gt_mask_inds = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)]
+
+        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
+        unit_lengths = [cat(unit_lengths_i) for unit_lengths_i in unit_lengths]
+        indexes = [cat(indexes_i) for indexes_i in indexes]
+
+        num_fg = 0
+        for i, (anchors_im, unit_lengths_im, indexes_im, targets_im) in enumerate(
+            zip(anchors, unit_lengths, indexes, targets)
+        ):
+            # Initialize all
+            gt_classes_i = torch.full_like(
+                unit_lengths_im, self.num_classes, dtype=torch.int64, device=self.device
+            )
+            # Ground truth classes
+            has_gt = len(targets_im) > 0
+            if has_gt:
+                # Compute the pairwise matrix
+                gt_matched_inds, anchor_labels = _assignment_rule(
+                    targets_im.gt_boxes, anchors_im, unit_lengths_im, self.min_anchor_size
+                )
+                # Find the foreground instances
+                fg_inds = anchor_labels == 1
+                fg_anchors = anchors_im[fg_inds]
+                num_fg += len(fg_anchors)
+                # Find the ground truths for foreground instances
+                gt_fg_matched_inds = gt_matched_inds[fg_inds]
+                # Assign labels for foreground instances
+                gt_classes_i[fg_inds] = targets_im.gt_classes[gt_fg_matched_inds]
+                # Anchors with label -1 are ignored, others are left as negative
+                gt_classes_i[anchor_labels == -1] = -1
+
+                # Boxes
+                # Ground truth box regression, only for foregrounds
+                matched_gt_boxes = targets_im[gt_fg_matched_inds].gt_boxes
+                # Compute box regression offsets for foregrounds only
+                gt_deltas_i = self.box2box_transform.get_deltas(
+                    fg_anchors.tensor, matched_gt_boxes.tensor
+                )
+                gt_deltas.append(gt_deltas_i)
+
+                # Masks
+                if self.mask_on:
+                    # Compute masks for each level and each anchor
+                    matched_indexes = indexes_im[fg_inds, :]
+                    for lvl in range(self.num_levels):
+                        ids_lvl = matched_indexes[:, 0] == lvl
+                        if torch.any(ids_lvl):
+                            cur_level_factor = 2 ** lvl if self.bipyramid_on else 1
+                            for anc in range(self.num_anchors):
+                                ids_lvl_anchor = ids_lvl & (matched_indexes[:, 4] == anc)
+                                if torch.any(ids_lvl_anchor):
+                                    gt_masks[lvl][anc].append(
+                                        targets_im[
+                                            gt_fg_matched_inds[ids_lvl_anchor]
+                                        ].gt_masks.crop_and_resize(
+                                            fg_anchors[ids_lvl_anchor].tensor,
+                                            self.mask_sizes[anc] * cur_level_factor,
+                                        )
+                                    )
+                                    # Select (N, H, W) dimensions
+                                    gt_mask_inds_lvl_anc = matched_indexes[ids_lvl_anchor, 1:4]
+                                    # Set the image index to the current image
+                                    gt_mask_inds_lvl_anc[:, 0] = i
+                                    gt_mask_inds[lvl][anc].append(gt_mask_inds_lvl_anc)
+            gt_classes.append(gt_classes_i)
+
+        # Classes and boxes
+        gt_classes = cat(gt_classes)
+        gt_valid_inds = gt_classes >= 0
+        gt_fg_inds = gt_valid_inds & (gt_classes < self.num_classes)
+        gt_classes_target = torch.zeros(
+            (gt_classes.shape[0], self.num_classes), dtype=torch.float32, device=self.device
+        )
+        gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1
+        gt_deltas = cat(gt_deltas) if gt_deltas else None
+
+        # Masks
+        gt_masks = [[cat(mla) if mla else None for mla in ml] for ml in gt_masks]
+        gt_mask_inds = [[cat(ila) if ila else None for ila in il] for il in gt_mask_inds]
+        return (
+            (gt_classes_target, gt_valid_inds),
+            (gt_deltas, gt_fg_inds),
+            (gt_masks, gt_mask_inds),
+            num_fg,
+        )
+
+    def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, images):
+        """
+        Arguments:
+            pred_logits, pred_deltas, pred_masks: Same as the output of:
+                meth:`TensorMaskHead.forward`
+            anchors, indexes: Same as the input of meth:`TensorMask.get_ground_truth`
+            images (ImageList): the input images
+
+        Returns:
+            results (List[Instances]): a list of #images elements.
+        """
+        assert len(anchors) == len(images)
+        results = []
+
+        pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
+        pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas]
+
+        pred_logits = cat(pred_logits, dim=1)
+        pred_deltas = cat(pred_deltas, dim=1)
+
+        for img_idx, (anchors_im, indexes_im) in enumerate(zip(anchors, indexes)):
+            # Get the size of the current image
+            image_size = images.image_sizes[img_idx]
+
+            logits_im = pred_logits[img_idx]
+            deltas_im = pred_deltas[img_idx]
+
+            if self.mask_on:
+                masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks]
+            else:
+                masks_im = [None] * self.num_levels
+            results_im = self.inference_single_image(
+                logits_im,
+                deltas_im,
+                masks_im,
+                Boxes.cat(anchors_im),
+                cat(indexes_im),
+                tuple(image_size),
+            )
+            results.append(results_im)
+        return results
+
+    def inference_single_image(
+        self, pred_logits, pred_deltas, pred_masks, anchors, indexes, image_size
+    ):
+        """
+        Single-image inference. Return bounding-box detection results by thresholding
+        on scores and applying non-maximum suppression (NMS).
+
+        Arguments:
+            pred_logits (list[Tensor]): list of #feature levels. Each entry contains
+                tensor of size (AxHxW, K)
+            pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that K becomes 4.
+            pred_masks (list[list[Tensor]]): List of #feature levels, each is a list of #anchors.
+                Each entry contains tensor of size (M_i*M_i, H, W). `None` if mask_on=False.
+            anchors (list[Boxes]): list of #feature levels. Each entry contains
+                a Boxes object, which contains all the anchors for that
+                image in that feature level.
+            image_size (tuple(H, W)): a tuple of the image height and width.
+
+        Returns:
+            Same as `inference`, but for only one image.
+        """
+        pred_logits = pred_logits.flatten().sigmoid_()
+        # We get top locations across all levels to accelerate the inference speed,
+        # which does not seem to affect the accuracy.
+        # First select values above the threshold
+        logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0]
+        # Then get the top values
+        num_topk = min(self.topk_candidates, logits_top_idxs.shape[0])
+        pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort(descending=True)
+        # Keep top k scoring values
+        pred_prob = pred_prob[:num_topk]
+        # Keep top k values
+        top_idxs = logits_top_idxs[topk_idxs[:num_topk]]
+
+        # class index
+        cls_idxs = top_idxs % self.num_classes
+        # HWA index
+        top_idxs //= self.num_classes
+        # predict boxes
+        pred_boxes = self.box2box_transform.apply_deltas(
+            pred_deltas[top_idxs], anchors[top_idxs].tensor
+        )
+        # apply nms
+        keep = batched_nms(pred_boxes, pred_prob, cls_idxs, self.nms_threshold)
+        # pick the top ones
+        keep = keep[: self.detections_im]
+
+        results = Instances(image_size)
+        results.pred_boxes = Boxes(pred_boxes[keep])
+        results.scores = pred_prob[keep]
+        results.pred_classes = cls_idxs[keep]
+
+        # deal with masks
+        result_masks, result_anchors = [], None
+        if self.mask_on:
+            # index and anchors, useful for masks
+            top_indexes = indexes[top_idxs]
+            top_anchors = anchors[top_idxs]
+            result_indexes = top_indexes[keep]
+            result_anchors = top_anchors[keep]
+            # Get masks and do sigmoid
+            for lvl, _, h, w, anc in result_indexes.tolist():
+                cur_size = self.mask_sizes[anc] * (2 ** lvl if self.bipyramid_on else 1)
+                result_masks.append(
+                    torch.sigmoid(pred_masks[lvl][anc][:, h, w].view(1, cur_size, cur_size))
+                )
+
+        return results, (result_masks, result_anchors)
+
+    def preprocess_image(self, batched_inputs):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        return images
+
+
+class TensorMaskHead(nn.Module):
+    def __init__(self, cfg, num_levels, num_anchors, mask_sizes, input_shape: List[ShapeSpec]):
+        """
+        TensorMask head.
+        """
+        super().__init__()
+        # fmt: off
+        self.in_features        = cfg.MODEL.TENSOR_MASK.IN_FEATURES
+        in_channels             = input_shape[0].channels
+        num_classes             = cfg.MODEL.TENSOR_MASK.NUM_CLASSES
+        cls_channels            = cfg.MODEL.TENSOR_MASK.CLS_CHANNELS
+        num_convs               = cfg.MODEL.TENSOR_MASK.NUM_CONVS
+        # box parameters
+        bbox_channels           = cfg.MODEL.TENSOR_MASK.BBOX_CHANNELS
+        # mask parameters
+        self.mask_on            = cfg.MODEL.MASK_ON
+        self.mask_sizes         = mask_sizes
+        mask_channels           = cfg.MODEL.TENSOR_MASK.MASK_CHANNELS
+        self.align_on           = cfg.MODEL.TENSOR_MASK.ALIGNED_ON
+        self.bipyramid_on       = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON
+        # fmt: on
+
+        # class subnet
+        cls_subnet = []
+        cur_channels = in_channels
+        for _ in range(num_convs):
+            cls_subnet.append(
+                nn.Conv2d(cur_channels, cls_channels, kernel_size=3, stride=1, padding=1)
+            )
+            cur_channels = cls_channels
+            cls_subnet.append(nn.ReLU())
+
+        self.cls_subnet = nn.Sequential(*cls_subnet)
+        self.cls_score = nn.Conv2d(
+            cur_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
+        )
+        modules_list = [self.cls_subnet, self.cls_score]
+
+        # box subnet
+        bbox_subnet = []
+        cur_channels = in_channels
+        for _ in range(num_convs):
+            bbox_subnet.append(
+                nn.Conv2d(cur_channels, bbox_channels, kernel_size=3, stride=1, padding=1)
+            )
+            cur_channels = bbox_channels
+            bbox_subnet.append(nn.ReLU())
+
+        self.bbox_subnet = nn.Sequential(*bbox_subnet)
+        self.bbox_pred = nn.Conv2d(
+            cur_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1
+        )
+        modules_list.extend([self.bbox_subnet, self.bbox_pred])
+
+        # mask subnet
+        if self.mask_on:
+            mask_subnet = []
+            cur_channels = in_channels
+            for _ in range(num_convs):
+                mask_subnet.append(
+                    nn.Conv2d(cur_channels, mask_channels, kernel_size=3, stride=1, padding=1)
+                )
+                cur_channels = mask_channels
+                mask_subnet.append(nn.ReLU())
+
+            self.mask_subnet = nn.Sequential(*mask_subnet)
+            modules_list.append(self.mask_subnet)
+            for mask_size in self.mask_sizes:
+                cur_mask_module = "mask_pred_%02d" % mask_size
+                self.add_module(
+                    cur_mask_module,
+                    nn.Conv2d(
+                        cur_channels, mask_size * mask_size, kernel_size=1, stride=1, padding=0
+                    ),
+                )
+                modules_list.append(getattr(self, cur_mask_module))
+            if self.align_on:
+                if self.bipyramid_on:
+                    for lvl in range(num_levels):
+                        cur_mask_module = "align2nat_%02d" % lvl
+                        lambda_val = 2 ** lvl
+                        setattr(self, cur_mask_module, SwapAlign2Nat(lambda_val))
+                    # Also the fusing layer, stay at the same channel size
+                    mask_fuse = [
+                        nn.Conv2d(cur_channels, cur_channels, kernel_size=3, stride=1, padding=1),
+                        nn.ReLU(),
+                    ]
+                    self.mask_fuse = nn.Sequential(*mask_fuse)
+                    modules_list.append(self.mask_fuse)
+                else:
+                    self.align2nat = SwapAlign2Nat(1)
+
+        # Initialization
+        for modules in modules_list:
+            for layer in modules.modules():
+                if isinstance(layer, nn.Conv2d):
+                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
+                    torch.nn.init.constant_(layer.bias, 0)
+
+        # Use prior in model initialization to improve stability
+        bias_value = -(math.log((1 - 0.01) / 0.01))
+        torch.nn.init.constant_(self.cls_score.bias, bias_value)
+
+    def forward(self, features):
+        """
+        Arguments:
+            features (list[Tensor]): FPN feature map tensors in high to low resolution.
+                Each tensor in the list correspond to different feature levels.
+
+        Returns:
+            pred_logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
+                The tensor predicts the classification probability
+                at each spatial position for each of the A anchors and K object
+                classes.
+            pred_deltas (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
+                The tensor predicts 4-vector (dx,dy,dw,dh) box
+                regression values for every anchor. These values are the
+                relative offset between the anchor and the ground truth box.
+            pred_masks (list(list[Tensor])): #lvl list of tensors, each is a list of
+                A tensors of shape (N, M_{i,a}, Hi, Wi).
+                The tensor predicts a dense set of M_ixM_i masks at every location.
+        """
+        pred_logits = [self.cls_score(self.cls_subnet(x)) for x in features]
+        pred_deltas = [self.bbox_pred(self.bbox_subnet(x)) for x in features]
+
+        pred_masks = None
+        if self.mask_on:
+            mask_feats = [self.mask_subnet(x) for x in features]
+
+            if self.bipyramid_on:
+                mask_feat_high_res = mask_feats[0]
+                H, W = mask_feat_high_res.shape[-2:]
+                mask_feats_up = []
+                for lvl, mask_feat in enumerate(mask_feats):
+                    lambda_val = 2.0 ** lvl
+                    mask_feat_up = mask_feat
+                    if lvl > 0:
+                        mask_feat_up = F.interpolate(
+                            mask_feat, scale_factor=lambda_val, mode="bilinear", align_corners=False
+                        )
+                    mask_feats_up.append(
+                        self.mask_fuse(mask_feat_up[:, :, :H, :W] + mask_feat_high_res)
+                    )
+                mask_feats = mask_feats_up
+
+            pred_masks = []
+            for lvl, mask_feat in enumerate(mask_feats):
+                cur_masks = []
+                for mask_size in self.mask_sizes:
+                    cur_mask_module = getattr(self, "mask_pred_%02d" % mask_size)
+                    cur_mask = cur_mask_module(mask_feat)
+                    if self.align_on:
+                        if self.bipyramid_on:
+                            cur_mask_module = getattr(self, "align2nat_%02d" % lvl)
+                            cur_mask = cur_mask_module(cur_mask)
+                        else:
+                            cur_mask = self.align2nat(cur_mask)
+                    cur_masks.append(cur_mask)
+                pred_masks.append(cur_masks)
+        return pred_logits, pred_deltas, pred_masks
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/config.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..44479f211811bd4060c6afef9ed86791b0dcd0d4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/config.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from detectron2.config import CfgNode as CN
+
+
+def add_tensormask_config(cfg):
+    """
+    Add config for TensorMask.
+    """
+    cfg.MODEL.TENSOR_MASK = CN()
+
+    # Anchor parameters
+    cfg.MODEL.TENSOR_MASK.IN_FEATURES = ["p2", "p3", "p4", "p5", "p6", "p7"]
+
+    # Convolutions to use in the towers
+    cfg.MODEL.TENSOR_MASK.NUM_CONVS = 4
+
+    # Number of foreground classes.
+    cfg.MODEL.TENSOR_MASK.NUM_CLASSES = 80
+    # Channel size for the classification tower
+    cfg.MODEL.TENSOR_MASK.CLS_CHANNELS = 256
+
+    cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST = 0.05
+    # Only the top (1000 * #levels) candidate boxes across all levels are
+    # considered jointly during test (to improve speed)
+    cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST = 6000
+    cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST = 0.5
+
+    # Box parameters
+    # Channel size for the box tower
+    cfg.MODEL.TENSOR_MASK.BBOX_CHANNELS = 128
+    # Weights on (dx, dy, dw, dh)
+    cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS = (1.5, 1.5, 0.75, 0.75)
+
+    # Loss parameters
+    cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA = 3.0
+    cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA = 0.3
+
+    # Mask parameters
+    # Channel size for the mask tower
+    cfg.MODEL.TENSOR_MASK.MASK_CHANNELS = 128
+    # Mask loss weight
+    cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT = 2.0
+    # weight on positive pixels within the mask
+    cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT = 1.5
+    # Whether to predict in the aligned representation
+    cfg.MODEL.TENSOR_MASK.ALIGNED_ON = False
+    # Whether to use the bipyramid architecture
+    cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON = False
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbbac429a69ce7cb17872e27b868f5603de5dc64
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .swap_align2nat import SwapAlign2Nat, swap_align2nat
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ec037391f1c5a40e69190bbdb50f71501d54825
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h
@@ -0,0 +1,54 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace tensormask {
+
+#ifdef WITH_CUDA
+at::Tensor SwapAlign2Nat_forward_cuda(
+    const at::Tensor& X,
+    const int lambda_val,
+    const float pad_val);
+
+at::Tensor SwapAlign2Nat_backward_cuda(
+    const at::Tensor& gY,
+    const int lambda_val,
+    const int batch_size,
+    const int channel,
+    const int height,
+    const int width);
+#endif
+
+inline at::Tensor SwapAlign2Nat_forward(
+    const at::Tensor& X,
+    const int lambda_val,
+    const float pad_val) {
+  if (X.type().is_cuda()) {
+#ifdef WITH_CUDA
+    return SwapAlign2Nat_forward_cuda(X, lambda_val, pad_val);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU");
+}
+
+inline at::Tensor SwapAlign2Nat_backward(
+    const at::Tensor& gY,
+    const int lambda_val,
+    const int batch_size,
+    const int channel,
+    const int height,
+    const int width) {
+  if (gY.type().is_cuda()) {
+#ifdef WITH_CUDA
+    return SwapAlign2Nat_backward_cuda(
+        gY, lambda_val, batch_size, channel, height, width);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU");
+}
+
+} // namespace tensormask
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..06de4a4d046523be9959dee73dfc1c2c20852ce1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu
@@ -0,0 +1,526 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__device__ inline T get_pixel_val(
+    const T* tensor,
+    const int idx,
+    const int H,
+    const int W,
+    const int y,
+    const int x,
+    const int V,
+    const int U,
+    const int v,
+    const int u,
+    const T pad_val) {
+  if ((y < 0) || (y >= H) || (x < 0) || (x >= W) || (v < 0) || (v >= V) ||
+      (u < 0) || (u >= U)) {
+    return pad_val;
+  } else {
+    return tensor[(((idx * V + v) * U + u) * H + y) * W + x];
+  }
+}
+
+template <typename T>
+__device__ inline void add_pixel_val(
+    T* tensor,
+    const T val,
+    const int idx,
+    const int H,
+    const int W,
+    const int y,
+    const int x,
+    const int V,
+    const int U,
+    const int v,
+    const int u) {
+  if ((val == 0.) || (y < 0) || (y >= H) || (x < 0) || (x >= W) || (v < 0) ||
+      (v >= V) || (u < 0) || (u >= U)) {
+    return;
+  } else {
+    atomicAdd(tensor + ((((idx * V + v) * U + u) * H + y) * W + x), val);
+  }
+}
+
+template <typename T>
+__global__ void SwapAlign2NatForwardFeat(
+    const int nthreads,
+    const T* bottom_data,
+    const int Vout,
+    const int Uout,
+    const float hVout,
+    const float hUout,
+    const int Vin,
+    const int Uin,
+    const float lambda,
+    const int Hin,
+    const int Win,
+    const int Hout,
+    const int Wout,
+    const T pad_val,
+    T* top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int idx = index;
+    const int x = idx % Wout;
+    idx /= Wout;
+    const int y = idx % Hout;
+    idx /= Hout;
+    const int u = idx % Uout;
+    idx /= Uout;
+    const int v = idx % Vout;
+    idx /= Vout;
+
+    const float ox = x * lambda + u - hUout + 0.5;
+    const int xf = static_cast<int>(floor(ox));
+    const int xc = static_cast<int>(ceil(ox));
+    const float xwc = ox - xf;
+    const float xwf = 1. - xwc;
+
+    const float oy = y * lambda + v - hVout + 0.5;
+    const int yf = static_cast<int>(floor(oy));
+    const int yc = static_cast<int>(ceil(oy));
+    const float ywc = oy - yf;
+    const float ywf = 1. - ywc;
+
+    const float ou = (u + 0.5) / lambda - 0.5;
+    const int uf = static_cast<int>(floor(ou));
+    const int uc = static_cast<int>(ceil(ou));
+    const float uwc = ou - uf;
+    const float uwf = 1. - uwc;
+
+    const float ov = (v + 0.5) / lambda - 0.5;
+    const int vf = static_cast<int>(floor(ov));
+    const int vc = static_cast<int>(ceil(ov));
+    const float vwc = ov - vf;
+    const float vwf = 1. - vwc;
+
+    T val = ywf * xwf * vwf * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vf, uf, pad_val) +
+        ywf * xwf * vwf * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vf, uc, pad_val) +
+        ywf * xwf * vwc * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vc, uf, pad_val) +
+        ywf * xwf * vwc * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vc, uc, pad_val) +
+        ywf * xwc * vwf * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vf, uf, pad_val) +
+        ywf * xwc * vwf * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vf, uc, pad_val) +
+        ywf * xwc * vwc * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vc, uf, pad_val) +
+        ywf * xwc * vwc * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vc, uc, pad_val) +
+        ywc * xwf * vwf * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vf, uf, pad_val) +
+        ywc * xwf * vwf * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vf, uc, pad_val) +
+        ywc * xwf * vwc * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vc, uf, pad_val) +
+        ywc * xwf * vwc * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vc, uc, pad_val) +
+        ywc * xwc * vwf * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vf, uf, pad_val) +
+        ywc * xwc * vwf * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vf, uc, pad_val) +
+        ywc * xwc * vwc * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vc, uf, pad_val) +
+        ywc * xwc * vwc * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vc, uc, pad_val);
+
+    top_data[index] = val;
+  }
+}
+
+template <typename T>
+__global__ void SwapAlign2NatBackwardFeat(
+    const int nthreads,
+    const T* top_diff,
+    const int Vout,
+    const int Uout,
+    const float hVout,
+    const float hUout,
+    const int Vin,
+    const int Uin,
+    const float lambda,
+    const int Hin,
+    const int Win,
+    const int Hout,
+    const int Wout,
+    T* bottom_diff) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int idx = index;
+    const int x = idx % Wout;
+    idx /= Wout;
+    const int y = idx % Hout;
+    idx /= Hout;
+    const int u = idx % Uout;
+    idx /= Uout;
+    const int v = idx % Vout;
+    idx /= Vout;
+
+    const float ox = x * lambda + u - hUout + 0.5;
+    const int xf = static_cast<int>(floor(ox));
+    const int xc = static_cast<int>(ceil(ox));
+    const float xwc = ox - xf;
+    const float xwf = 1. - xwc;
+
+    const float oy = y * lambda + v - hVout + 0.5;
+    const int yf = static_cast<int>(floor(oy));
+    const int yc = static_cast<int>(ceil(oy));
+    const float ywc = oy - yf;
+    const float ywf = 1. - ywc;
+
+    const float ou = (u + 0.5) / lambda - 0.5;
+    const int uf = static_cast<int>(floor(ou));
+    const int uc = static_cast<int>(ceil(ou));
+    const float uwc = ou - uf;
+    const float uwf = 1. - uwc;
+
+    const float ov = (v + 0.5) / lambda - 0.5;
+    const int vf = static_cast<int>(floor(ov));
+    const int vc = static_cast<int>(ceil(ov));
+    const float vwc = ov - vf;
+    const float vwf = 1. - vwc;
+
+    const T grad = top_diff[index];
+
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwf * vwf * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xf,
+        Vin,
+        Uin,
+        vf,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwf * vwf * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xf,
+        Vin,
+        Uin,
+        vf,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwf * vwc * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xf,
+        Vin,
+        Uin,
+        vc,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwf * vwc * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xf,
+        Vin,
+        Uin,
+        vc,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwc * vwf * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xc,
+        Vin,
+        Uin,
+        vf,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwc * vwf * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xc,
+        Vin,
+        Uin,
+        vf,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwc * vwc * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xc,
+        Vin,
+        Uin,
+        vc,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwc * vwc * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xc,
+        Vin,
+        Uin,
+        vc,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwf * vwf * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xf,
+        Vin,
+        Uin,
+        vf,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwf * vwf * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xf,
+        Vin,
+        Uin,
+        vf,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwf * vwc * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xf,
+        Vin,
+        Uin,
+        vc,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwf * vwc * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xf,
+        Vin,
+        Uin,
+        vc,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwc * vwf * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xc,
+        Vin,
+        Uin,
+        vf,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwc * vwf * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xc,
+        Vin,
+        Uin,
+        vf,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwc * vwc * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xc,
+        Vin,
+        Uin,
+        vc,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwc * vwc * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xc,
+        Vin,
+        Uin,
+        vc,
+        uc);
+  }
+}
+
+namespace tensormask {
+
+at::Tensor SwapAlign2Nat_forward_cuda(
+    const at::Tensor& X,
+    const int lambda_val,
+    const float pad_val) {
+  AT_ASSERTM(X.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(X.ndimension() == 4, "input must be a 4D tensor");
+  AT_ASSERTM(lambda_val >= 1, "lambda should be greater or equal to 1");
+  const int N = X.size(0);
+  const int C = X.size(1);
+  const int Vin = static_cast<int>(sqrt(static_cast<float>(C)));
+  const int Uin = C / Vin;
+  AT_ASSERTM(
+      C == Vin * Uin && Vin == Uin, "#channels should be a square number");
+  const int Vout = lambda_val * Vin;
+  const int Uout = lambda_val * Uin;
+  const int Hin = X.size(2);
+  const int Win = X.size(3);
+  const float lambda = static_cast<float>(lambda_val);
+  const int Hout = static_cast<int>(ceil(Hin / lambda));
+  const int Wout = static_cast<int>(ceil(Win / lambda));
+  const float hVout = Vout / 2.;
+  const float hUout = Uout / 2.;
+
+  at::cuda::CUDAGuard device_guard(X.device());
+
+  at::Tensor Y = at::empty({N, Vout * Uout, Hout, Wout}, X.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(at::cuda::ATenCeilDiv(Y.numel(), 512L), 4096L));
+  dim3 block(512);
+
+  if (Y.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return Y;
+  }
+
+  auto X_ = X.contiguous();
+  AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "SwapAlign2Nat_forward", [&] {
+    SwapAlign2NatForwardFeat<scalar_t><<<grid, block, 0, stream>>>(
+        Y.numel(),
+        X_.data_ptr<scalar_t>(),
+        Vout,
+        Uout,
+        hVout,
+        hUout,
+        Vin,
+        Uin,
+        lambda,
+        Hin,
+        Win,
+        Hout,
+        Wout,
+        pad_val,
+        Y.data_ptr<scalar_t>());
+  });
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+  return Y;
+}
+
+at::Tensor SwapAlign2Nat_backward_cuda(
+    const at::Tensor& gY,
+    const int lambda_val,
+    const int batch_size,
+    const int channel,
+    const int height,
+    const int width) {
+  AT_ASSERTM(gY.device().is_cuda(), "input gradient must be a CUDA tensor");
+  AT_ASSERTM(gY.ndimension() == 4, "input gradient must be a 4D tensor");
+  AT_ASSERTM(lambda_val >= 1, "lambda should be greater or equal to 1");
+  const int Vin = static_cast<int>(sqrt(static_cast<float>(channel)));
+  const int Uin = channel / Vin;
+  const int Vout = lambda_val * Vin;
+  const int Uout = lambda_val * Uin;
+  const float hVout = Vout / 2.;
+  const float hUout = Uout / 2.;
+  const int Hout = gY.size(2);
+  const int Wout = gY.size(3);
+
+  at::cuda::CUDAGuard device_guard(gY.device());
+
+  at::Tensor gX = at::zeros({batch_size, channel, height, width}, gY.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(at::cuda::ATenCeilDiv(gY.numel(), 512L), 4096L));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (gY.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return gX;
+  }
+
+  auto gY_ = gY.contiguous();
+  AT_DISPATCH_FLOATING_TYPES(gY.scalar_type(), "SwapAlign2Nat_backward", [&] {
+    SwapAlign2NatBackwardFeat<scalar_t><<<grid, block, 0, stream>>>(
+        gY.numel(),
+        gY_.data_ptr<scalar_t>(),
+        Vout,
+        Uout,
+        hVout,
+        hUout,
+        Vin,
+        Uin,
+        static_cast<float>(lambda_val),
+        height,
+        width,
+        Hout,
+        Wout,
+        gX.data_ptr<scalar_t>());
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return gX;
+}
+
+} // namespace tensormask
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/vision.cpp b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/vision.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad8e472c2cfc7c10e00cd6b00fc22c0dd9384dd1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/vision.cpp
@@ -0,0 +1,19 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+#include <torch/extension.h>
+#include "SwapAlign2Nat/SwapAlign2Nat.h"
+
+namespace tensormask {
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def(
+      "swap_align2nat_forward",
+      &SwapAlign2Nat_forward,
+      "SwapAlign2Nat_forward");
+  m.def(
+      "swap_align2nat_backward",
+      &SwapAlign2Nat_backward,
+      "SwapAlign2Nat_backward");
+}
+
+} // namespace tensormask
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/swap_align2nat.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/swap_align2nat.py
new file mode 100644
index 0000000000000000000000000000000000000000..a72c98a968577eff2302d75e4cb41620e4ecf582
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tensormask/layers/swap_align2nat.py
@@ -0,0 +1,61 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from tensormask import _C
+
+
+class _SwapAlign2Nat(Function):
+    @staticmethod
+    def forward(ctx, X, lambda_val, pad_val):
+        ctx.lambda_val = lambda_val
+        ctx.input_shape = X.size()
+
+        Y = _C.swap_align2nat_forward(X, lambda_val, pad_val)
+        return Y
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, gY):
+        lambda_val = ctx.lambda_val
+        bs, ch, h, w = ctx.input_shape
+
+        gX = _C.swap_align2nat_backward(gY, lambda_val, bs, ch, h, w)
+
+        return gX, None, None
+
+
+swap_align2nat = _SwapAlign2Nat.apply
+
+
+class SwapAlign2Nat(nn.Module):
+    """
+        The op `SwapAlign2Nat` described in https://arxiv.org/abs/1903.12174.
+        Given an input tensor that predicts masks of shape (N, C=VxU, H, W),
+        apply the op, it will return masks of shape (N, V'xU', H', W') where
+        the unit lengths of (V, U) and (H, W) are swapped, and the mask representation
+        is transformed from aligned to natural.
+        Args:
+            lambda_val (int): the relative unit length ratio between (V, U) and (H, W),
+                                as we always have larger unit lengths for (V, U) than (H, W),
+                                lambda_val is always >= 1.
+            pad_val (float):    padding value for the values falling outside of the input
+                                tensor, default set to -6 as sigmoid(-6) is ~0, indicating
+                                that is no masks outside of the tensor.
+    """
+
+    def __init__(self, lambda_val, pad_val=-6.0):
+        super(SwapAlign2Nat, self).__init__()
+        self.lambda_val = lambda_val
+        self.pad_val = pad_val
+
+    def forward(self, X):
+        return swap_align2nat(X, self.lambda_val, self.pad_val)
+
+    def __repr__(self):
+        tmpstr = self.__class__.__name__ + "("
+        tmpstr += "lambda_val=" + str(self.lambda_val)
+        tmpstr += ", pad_val=" + str(self.pad_val)
+        tmpstr += ")"
+        return tmpstr
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tests/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..168f9979a4623806934b0ff1102ac166704e7dec
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tests/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tests/test_swap_align2nat.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tests/test_swap_align2nat.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3d018ce199ddaa19af25e8304d969e8f59c747a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/tests/test_swap_align2nat.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import unittest
+import torch
+from torch.autograd import gradcheck
+
+from tensormask.layers.swap_align2nat import SwapAlign2Nat
+
+
+class SwapAlign2NatTest(unittest.TestCase):
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_swap_align2nat_gradcheck_cuda(self):
+        dtype = torch.float64
+        device = torch.device("cuda")
+        m = SwapAlign2Nat(2).to(dtype=dtype, device=device)
+        x = torch.rand(2, 4, 10, 10, dtype=dtype, device=device, requires_grad=True)
+
+        self.assertTrue(gradcheck(m, x), "gradcheck failed for SwapAlign2Nat CUDA")
+
+    def _swap_align2nat(self, tensor, lambda_val):
+        """
+        The basic setup for testing Swap_Align
+        """
+        op = SwapAlign2Nat(lambda_val, pad_val=0.0)
+        input = torch.from_numpy(tensor[None, :, :, :].astype("float32"))
+        output = op.forward(input.cuda()).cpu().numpy()
+        return output[0]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/train_net.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/train_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..b898fc77b7f52cae6ff398ac5aec73c59ab928ab
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TensorMask/train_net.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+"""
+TensorMask Training Script.
+
+This script is a simplified version of the training script in detectron2/tools.
+"""
+
+import os
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
+from detectron2.evaluation import COCOEvaluator, verify_results
+
+from tensormask import add_tensormask_config
+
+
+class Trainer(DefaultTrainer):
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        return COCOEvaluator(dataset_name, cfg, True, output_folder)
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    add_tensormask_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/README.md b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b7a90102d008a498e93dff595a09206be5269e7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/README.md
@@ -0,0 +1,60 @@
+
+# TridentNet in Detectron2
+**Scale-Aware Trident Networks for Object Detection**
+
+Yanghao Li\*, Yuntao Chen\*, Naiyan Wang, Zhaoxiang Zhang
+
+[[`TridentNet`](https://github.com/TuSimple/simpledet/tree/master/models/tridentnet)] [[`arXiv`](https://arxiv.org/abs/1901.01892)] [[`BibTeX`](#CitingTridentNet)]
+
+<div align="center">
+  <img src="https://drive.google.com/uc?export=view&id=10THEPdIPmf3ooMyNzrfZbpWihEBvixwt" width="700px" />
+</div>
+
+In this repository, we implement TridentNet-Fast in Detectron2.
+Trident Network (TridentNet) aims to generate scale-specific feature maps with a uniform representational power. We construct a parallel multi-branch architecture in which each branch shares the same transformation parameters but with different receptive fields. TridentNet-Fast is a fast approximation version of TridentNet that could achieve significant improvements without any additional parameters and computational cost.
+
+## Training
+
+To train a model, run
+```bash
+python /path/to/detectron2/projects/TridentNet/train_net.py --config-file <config.yaml>
+```
+
+For example, to launch end-to-end TridentNet training with ResNet-50 backbone on 8 GPUs,
+one should execute:
+```bash
+python /path/to/detectron2/projects/TridentNet/train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml --num-gpus 8
+```
+
+## Evaluation
+
+Model evaluation can be done similarly:
+```bash
+python /path/to/detectron2/projects/TridentNet/train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml --eval-only MODEL.WEIGHTS model.pth
+```
+
+## Results on MS-COCO in Detectron2
+
+|Model|Backbone|Head|lr sched|AP|AP50|AP75|APs|APm|APl|download|
+|-----|--------|----|--------|--|----|----|---|---|---|--------|
+|Faster|R50-C4|C5-512ROI|1X|35.7|56.1|38.0|19.2|40.9|48.7|<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/metrics.json">metrics</a>|
+|TridentFast|R50-C4|C5-128ROI|1X|38.0|58.1|40.8|19.5|42.2|54.6|<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_1x/148572687/model_final_756cda.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_1x/148572687/metrics.json">metrics</a>|
+|Faster|R50-C4|C5-512ROI|3X|38.4|58.7|41.3|20.7|42.7|53.1|<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/model_final_f97cb7.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/metrics.json">metrics</a>|
+|TridentFast|R50-C4|C5-128ROI|3X|40.6|60.8|43.6|23.4|44.7|57.1|<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_3x/148572287/model_final_e1027c.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_3x/148572287/metrics.json">metrics</a>|
+|Faster|R101-C4|C5-512ROI|3X|41.1|61.4|44.0|22.2|45.5|55.9|<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/metrics.json">metrics</a>|
+|TridentFast|R101-C4|C5-128ROI|3X|43.6|63.4|47.0|24.3|47.8|60.0|<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_101_C4_3x/148572198/model_final_164568.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_101_C4_3x/148572198/metrics.json">metrics</a>|
+
+
+## <a name="CitingTridentNet"></a>Citing TridentNet
+
+If you use TridentNet, please use the following BibTeX entry.
+
+```
+@InProceedings{li2019scale,
+  title={Scale-Aware Trident Networks for Object Detection},
+  author={Li, Yanghao and Chen, Yuntao and Wang, Naiyan and Zhang, Zhaoxiang},
+  journal={The International Conference on Computer Vision (ICCV)},
+  year={2019}
+}
+```
+
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8c3d80797ba9ae63a5669ccbd74a0d2006fee3b7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml
@@ -0,0 +1,29 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_trident_resnet_backbone"
+  ROI_HEADS:
+    NAME: "TridentRes5ROIHeads"
+    POSITIVE_FRACTION: 0.5
+    BATCH_SIZE_PER_IMAGE: 128
+    PROPOSAL_APPEND_GT: False
+  PROPOSAL_GENERATOR:
+    NAME: "TridentRPN"
+  RPN:
+    POST_NMS_TOPK_TRAIN: 500
+  TRIDENT:
+    NUM_BRANCH: 3
+    BRANCH_DILATIONS: [1, 2, 3]
+    TEST_BRANCH_IDX: 1
+    TRIDENT_STAGE: "res4"
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bc83c2f9e7b7653c8982e657b5f116abe6ad6e1f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "Base-TridentNet-Fast-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fda2cb6622d732c0f70d74d567c26182a9a41c44
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "Base-TridentNet-Fast-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ebf89d03ea043810b02e71ecc2c1711c250e161c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "Base-TridentNet-Fast-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/train_net.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/train_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..eac2ec5c39e4a3ce2221f354dcea288bffcb1fbb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/train_net.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+"""
+TridentNet Training Script.
+
+This script is a simplified version of the training script in detectron2/tools.
+"""
+
+import os
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
+from detectron2.evaluation import COCOEvaluator
+
+from tridentnet import add_tridentnet_config
+
+
+class Trainer(DefaultTrainer):
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        return COCOEvaluator(dataset_name, cfg, True, output_folder)
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    add_tridentnet_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fcdeb45a03d3835b3c2498ca8021a11d8cb4758
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .config import add_tridentnet_config
+from .trident_backbone import (
+    TridentBottleneckBlock,
+    build_trident_resnet_backbone,
+    make_trident_stage,
+)
+from .trident_rpn import TridentRPN
+from .trident_rcnn import TridentRes5ROIHeads, TridentStandardROIHeads
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/config.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..f33f473cb32633d9ba6582f0406ffe0a929d23c6
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/config.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from detectron2.config import CfgNode as CN
+
+
+def add_tridentnet_config(cfg):
+    """
+    Add config for tridentnet.
+    """
+    _C = cfg
+
+    _C.MODEL.TRIDENT = CN()
+
+    # Number of branches for TridentNet.
+    _C.MODEL.TRIDENT.NUM_BRANCH = 3
+    # Specify the dilations for each branch.
+    _C.MODEL.TRIDENT.BRANCH_DILATIONS = [1, 2, 3]
+    # Specify the stage for applying trident blocks. Default stage is Res4 according to the
+    # TridentNet paper.
+    _C.MODEL.TRIDENT.TRIDENT_STAGE = "res4"
+    # Specify the test branch index TridentNet Fast inference:
+    #   - use -1 to aggregate results of all branches during inference.
+    #   - otherwise, only using specified branch for fast inference. Recommended setting is
+    #     to use the middle branch.
+    _C.MODEL.TRIDENT.TEST_BRANCH_IDX = 1
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_backbone.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..232dfaf1ca01c0395c0ceea544bfbdee0d45ce1a
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_backbone.py
@@ -0,0 +1,223 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn.functional as F
+
+from detectron2.layers import Conv2d, FrozenBatchNorm2d, get_norm
+from detectron2.modeling import BACKBONE_REGISTRY, ResNet, ResNetBlockBase, make_stage
+from detectron2.modeling.backbone.resnet import BasicStem, BottleneckBlock, DeformBottleneckBlock
+
+from .trident_conv import TridentConv
+
+__all__ = ["TridentBottleneckBlock", "make_trident_stage", "build_trident_resnet_backbone"]
+
+
+class TridentBottleneckBlock(ResNetBlockBase):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        *,
+        bottleneck_channels,
+        stride=1,
+        num_groups=1,
+        norm="BN",
+        stride_in_1x1=False,
+        num_branch=3,
+        dilations=(1, 2, 3),
+        concat_output=False,
+        test_branch_idx=-1,
+    ):
+        """
+        Args:
+            num_branch (int): the number of branches in TridentNet.
+            dilations (tuple): the dilations of multiple branches in TridentNet.
+            concat_output (bool): if concatenate outputs of multiple branches in TridentNet.
+                Use 'True' for the last trident block.
+        """
+        super().__init__(in_channels, out_channels, stride)
+
+        assert num_branch == len(dilations)
+
+        self.num_branch = num_branch
+        self.concat_output = concat_output
+        self.test_branch_idx = test_branch_idx
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
+
+        self.conv1 = Conv2d(
+            in_channels,
+            bottleneck_channels,
+            kernel_size=1,
+            stride=stride_1x1,
+            bias=False,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv2 = TridentConv(
+            bottleneck_channels,
+            bottleneck_channels,
+            kernel_size=3,
+            stride=stride_3x3,
+            paddings=dilations,
+            bias=False,
+            groups=num_groups,
+            dilations=dilations,
+            num_branch=num_branch,
+            test_branch_idx=test_branch_idx,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv3 = Conv2d(
+            bottleneck_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+    def forward(self, x):
+        num_branch = self.num_branch if self.training or self.test_branch_idx == -1 else 1
+        if not isinstance(x, list):
+            x = [x] * num_branch
+        out = [self.conv1(b) for b in x]
+        out = [F.relu_(b) for b in out]
+
+        out = self.conv2(out)
+        out = [F.relu_(b) for b in out]
+
+        out = [self.conv3(b) for b in out]
+
+        if self.shortcut is not None:
+            shortcut = [self.shortcut(b) for b in x]
+        else:
+            shortcut = x
+
+        out = [out_b + shortcut_b for out_b, shortcut_b in zip(out, shortcut)]
+        out = [F.relu_(b) for b in out]
+        if self.concat_output:
+            out = torch.cat(out)
+        return out
+
+
+def make_trident_stage(block_class, num_blocks, first_stride, **kwargs):
+    """
+    Create a resnet stage by creating many blocks for TridentNet.
+    """
+    blocks = []
+    for i in range(num_blocks - 1):
+        blocks.append(block_class(stride=first_stride if i == 0 else 1, **kwargs))
+        kwargs["in_channels"] = kwargs["out_channels"]
+    blocks.append(block_class(stride=1, concat_output=True, **kwargs))
+    return blocks
+
+
+@BACKBONE_REGISTRY.register()
+def build_trident_resnet_backbone(cfg, input_shape):
+    """
+    Create a ResNet instance from config for TridentNet.
+
+    Returns:
+        ResNet: a :class:`ResNet` instance.
+    """
+    # need registration of new blocks/stems?
+    norm = cfg.MODEL.RESNETS.NORM
+    stem = BasicStem(
+        in_channels=input_shape.channels,
+        out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
+        norm=norm,
+    )
+    freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
+
+    if freeze_at >= 1:
+        for p in stem.parameters():
+            p.requires_grad = False
+        stem = FrozenBatchNorm2d.convert_frozen_batchnorm(stem)
+
+    # fmt: off
+    out_features         = cfg.MODEL.RESNETS.OUT_FEATURES
+    depth                = cfg.MODEL.RESNETS.DEPTH
+    num_groups           = cfg.MODEL.RESNETS.NUM_GROUPS
+    width_per_group      = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
+    bottleneck_channels  = num_groups * width_per_group
+    in_channels          = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
+    out_channels         = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
+    stride_in_1x1        = cfg.MODEL.RESNETS.STRIDE_IN_1X1
+    res5_dilation        = cfg.MODEL.RESNETS.RES5_DILATION
+    deform_on_per_stage  = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
+    deform_modulated     = cfg.MODEL.RESNETS.DEFORM_MODULATED
+    deform_num_groups    = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
+    num_branch           = cfg.MODEL.TRIDENT.NUM_BRANCH
+    branch_dilations     = cfg.MODEL.TRIDENT.BRANCH_DILATIONS
+    trident_stage        = cfg.MODEL.TRIDENT.TRIDENT_STAGE
+    test_branch_idx      = cfg.MODEL.TRIDENT.TEST_BRANCH_IDX
+    # fmt: on
+    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
+
+    num_blocks_per_stage = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}[depth]
+
+    stages = []
+
+    res_stage_idx = {"res2": 2, "res3": 3, "res4": 4, "res5": 5}
+    out_stage_idx = [res_stage_idx[f] for f in out_features]
+    trident_stage_idx = res_stage_idx[trident_stage]
+    max_stage_idx = max(out_stage_idx)
+    for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
+        dilation = res5_dilation if stage_idx == 5 else 1
+        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
+        stage_kargs = {
+            "num_blocks": num_blocks_per_stage[idx],
+            "first_stride": first_stride,
+            "in_channels": in_channels,
+            "bottleneck_channels": bottleneck_channels,
+            "out_channels": out_channels,
+            "num_groups": num_groups,
+            "norm": norm,
+            "stride_in_1x1": stride_in_1x1,
+            "dilation": dilation,
+        }
+        if stage_idx == trident_stage_idx:
+            assert not deform_on_per_stage[
+                idx
+            ], "Not support deformable conv in Trident blocks yet."
+            stage_kargs["block_class"] = TridentBottleneckBlock
+            stage_kargs["num_branch"] = num_branch
+            stage_kargs["dilations"] = branch_dilations
+            stage_kargs["test_branch_idx"] = test_branch_idx
+            stage_kargs.pop("dilation")
+        elif deform_on_per_stage[idx]:
+            stage_kargs["block_class"] = DeformBottleneckBlock
+            stage_kargs["deform_modulated"] = deform_modulated
+            stage_kargs["deform_num_groups"] = deform_num_groups
+        else:
+            stage_kargs["block_class"] = BottleneckBlock
+        blocks = (
+            make_trident_stage(**stage_kargs)
+            if stage_idx == trident_stage_idx
+            else make_stage(**stage_kargs)
+        )
+        in_channels = out_channels
+        out_channels *= 2
+        bottleneck_channels *= 2
+
+        if freeze_at >= stage_idx:
+            for block in blocks:
+                block.freeze()
+        stages.append(blocks)
+    return ResNet(stem, stages, out_features=out_features)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_conv.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e2d5252bda5ebb2e9eee10af9c9a14fc72bb8fe
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_conv.py
@@ -0,0 +1,107 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.modules.utils import _pair
+
+from detectron2.layers.wrappers import _NewEmptyTensorOp
+
+
+class TridentConv(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        paddings=0,
+        dilations=1,
+        groups=1,
+        num_branch=1,
+        test_branch_idx=-1,
+        bias=False,
+        norm=None,
+        activation=None,
+    ):
+        super(TridentConv, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.num_branch = num_branch
+        self.stride = _pair(stride)
+        self.groups = groups
+        self.with_bias = bias
+        if isinstance(paddings, int):
+            paddings = [paddings] * self.num_branch
+        if isinstance(dilations, int):
+            dilations = [dilations] * self.num_branch
+        self.paddings = [_pair(padding) for padding in paddings]
+        self.dilations = [_pair(dilation) for dilation in dilations]
+        self.test_branch_idx = test_branch_idx
+        self.norm = norm
+        self.activation = activation
+
+        assert len({self.num_branch, len(self.paddings), len(self.dilations)}) == 1
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+        )
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.bias = None
+
+        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0)
+
+    def forward(self, inputs):
+        num_branch = self.num_branch if self.training or self.test_branch_idx == -1 else 1
+        assert len(inputs) == num_branch
+
+        if inputs[0].numel() == 0:
+            output_shape = [
+                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                for i, p, di, k, s in zip(
+                    inputs[0].shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                )
+            ]
+            output_shape = [input[0].shape[0], self.weight.shape[0]] + output_shape
+            return [_NewEmptyTensorOp.apply(input, output_shape) for input in inputs]
+
+        if self.training or self.test_branch_idx == -1:
+            outputs = [
+                F.conv2d(input, self.weight, self.bias, self.stride, padding, dilation, self.groups)
+                for input, dilation, padding in zip(inputs, self.dilations, self.paddings)
+            ]
+        else:
+            outputs = [
+                F.conv2d(
+                    inputs[0],
+                    self.weight,
+                    self.bias,
+                    self.stride,
+                    self.paddings[self.test_branch_idx],
+                    self.dilations[self.test_branch_idx],
+                    self.groups,
+                )
+            ]
+
+        if self.norm is not None:
+            outputs = [self.norm(x) for x in outputs]
+        if self.activation is not None:
+            outputs = [self.activation(x) for x in outputs]
+        return outputs
+
+    def extra_repr(self):
+        tmpstr = "in_channels=" + str(self.in_channels)
+        tmpstr += ", out_channels=" + str(self.out_channels)
+        tmpstr += ", kernel_size=" + str(self.kernel_size)
+        tmpstr += ", num_branch=" + str(self.num_branch)
+        tmpstr += ", test_branch_idx=" + str(self.test_branch_idx)
+        tmpstr += ", stride=" + str(self.stride)
+        tmpstr += ", paddings=" + str(self.paddings)
+        tmpstr += ", dilations=" + str(self.dilations)
+        tmpstr += ", groups=" + str(self.groups)
+        tmpstr += ", bias=" + str(self.with_bias)
+        return tmpstr
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_rcnn.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..65deb90977c525f9e42ea9b2581944832a9af47e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_rcnn.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from detectron2.layers import batched_nms
+from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
+from detectron2.modeling.roi_heads.roi_heads import Res5ROIHeads
+from detectron2.structures import Instances
+
+
+def merge_branch_instances(instances, num_branch, nms_thresh, topk_per_image):
+    """
+    Merge detection results from different branches of TridentNet.
+    Return detection results by applying non-maximum suppression (NMS) on bounding boxes
+    and keep the unsuppressed boxes and other instances (e.g mask) if any.
+
+    Args:
+        instances (list[Instances]): A list of N * num_branch instances that store detection
+            results. Contain N images and each image has num_branch instances.
+        num_branch (int): Number of branches used for merging detection results for each image.
+        nms_thresh (float):  The threshold to use for box non-maximum suppression. Value in [0, 1].
+        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
+            all detections.
+
+    Returns:
+        results: (list[Instances]): A list of N instances, one for each image in the batch,
+            that stores the topk most confidence detections after merging results from multiple
+            branches.
+    """
+    if num_branch == 1:
+        return instances
+
+    batch_size = len(instances) // num_branch
+    results = []
+    for i in range(batch_size):
+        instance = Instances.cat([instances[i + batch_size * j] for j in range(num_branch)])
+
+        # Apply per-class NMS
+        keep = batched_nms(
+            instance.pred_boxes.tensor, instance.scores, instance.pred_classes, nms_thresh
+        )
+        keep = keep[:topk_per_image]
+        result = instance[keep]
+
+        results.append(result)
+
+    return results
+
+
+@ROI_HEADS_REGISTRY.register()
+class TridentRes5ROIHeads(Res5ROIHeads):
+    """
+    The TridentNet ROIHeads in a typical "C4" R-CNN model.
+    See :class:`Res5ROIHeads`.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super().__init__(cfg, input_shape)
+
+        self.num_branch = cfg.MODEL.TRIDENT.NUM_BRANCH
+        self.trident_fast = cfg.MODEL.TRIDENT.TEST_BRANCH_IDX != -1
+
+    def forward(self, images, features, proposals, targets=None):
+        """
+        See :class:`Res5ROIHeads.forward`.
+        """
+        num_branch = self.num_branch if self.training or not self.trident_fast else 1
+        all_targets = targets * num_branch if targets is not None else None
+        pred_instances, losses = super().forward(images, features, proposals, all_targets)
+        del images, all_targets, targets
+
+        if self.training:
+            return pred_instances, losses
+        else:
+            pred_instances = merge_branch_instances(
+                pred_instances,
+                num_branch,
+                self.box_predictor.test_nms_thresh,
+                self.box_predictor.test_topk_per_image,
+            )
+
+            return pred_instances, {}
+
+
+@ROI_HEADS_REGISTRY.register()
+class TridentStandardROIHeads(StandardROIHeads):
+    """
+    The `StandardROIHeads` for TridentNet.
+    See :class:`StandardROIHeads`.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super(TridentStandardROIHeads, self).__init__(cfg, input_shape)
+
+        self.num_branch = cfg.MODEL.TRIDENT.NUM_BRANCH
+        self.trident_fast = cfg.MODEL.TRIDENT.TEST_BRANCH_IDX != -1
+
+    def forward(self, images, features, proposals, targets=None):
+        """
+        See :class:`Res5ROIHeads.forward`.
+        """
+        # Use 1 branch if using trident_fast during inference.
+        num_branch = self.num_branch if self.training or not self.trident_fast else 1
+        # Duplicate targets for all branches in TridentNet.
+        all_targets = targets * num_branch if targets is not None else None
+        pred_instances, losses = super().forward(images, features, proposals, all_targets)
+        del images, all_targets, targets
+
+        if self.training:
+            return pred_instances, losses
+        else:
+            pred_instances = merge_branch_instances(
+                pred_instances,
+                num_branch,
+                self.box_predictor.test_nms_thresh,
+                self.box_predictor.test_topk_per_image,
+            )
+
+            return pred_instances, {}
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_rpn.py b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_rpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c30137f312232ccccd86182108949fbe34b97231
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_rpn.py
@@ -0,0 +1,32 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+
+from detectron2.modeling import PROPOSAL_GENERATOR_REGISTRY
+from detectron2.modeling.proposal_generator.rpn import RPN
+from detectron2.structures import ImageList
+
+
+@PROPOSAL_GENERATOR_REGISTRY.register()
+class TridentRPN(RPN):
+    """
+    Trident RPN subnetwork.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super(TridentRPN, self).__init__(cfg, input_shape)
+
+        self.num_branch = cfg.MODEL.TRIDENT.NUM_BRANCH
+        self.trident_fast = cfg.MODEL.TRIDENT.TEST_BRANCH_IDX != -1
+
+    def forward(self, images, features, gt_instances=None):
+        """
+        See :class:`RPN.forward`.
+        """
+        num_branch = self.num_branch if self.training or not self.trident_fast else 1
+        # Duplicate images and gt_instances for all branches in TridentNet.
+        all_images = ImageList(
+            torch.cat([images.tensor] * num_branch), images.image_sizes * num_branch
+        )
+        all_gt_instances = gt_instances * num_branch if gt_instances is not None else None
+
+        return super(TridentRPN, self).forward(all_images, features, all_gt_instances)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/setup.cfg b/preprocess/humanparsing/mhp_extension/detectron2/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..b09bba99ca88d5cc900d1cc7fb0947d0443522be
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/setup.cfg
@@ -0,0 +1,26 @@
+[isort]
+line_length=100
+multi_line_output=3
+include_trailing_comma=True
+known_standard_library=numpy,setuptools,mock
+skip=./datasets,docs
+skip_glob=*/__init__.py
+known_myself=detectron2
+known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil,pkg_resources,caffe2,onnx
+no_lines_before=STDLIB,THIRDPARTY
+sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER
+default_section=FIRSTPARTY
+
+[mypy]
+python_version=3.6
+ignore_missing_imports = True
+warn_unused_configs = True
+disallow_untyped_defs = True
+check_untyped_defs = True
+warn_unused_ignores = True
+warn_redundant_casts = True
+show_column_numbers = True
+follow_imports = silent
+allow_redefinition = True
+; Require all functions to be annotated
+disallow_incomplete_defs = True
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/setup.py b/preprocess/humanparsing/mhp_extension/detectron2/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..a863fab1b7658a888df8623b57fe53673698cf60
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/setup.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import glob
+import os
+import shutil
+from os import path
+from setuptools import find_packages, setup
+from typing import List
+import torch
+from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
+
+torch_ver = [int(x) for x in torch.__version__.split(".")[:2]]
+assert torch_ver >= [1, 4], "Requires PyTorch >= 1.4"
+
+
+def get_version():
+    init_py_path = path.join(path.abspath(path.dirname(__file__)), "detectron2", "__init__.py")
+    init_py = open(init_py_path, "r").readlines()
+    version_line = [l.strip() for l in init_py if l.startswith("__version__")][0]
+    version = version_line.split("=")[-1].strip().strip("'\"")
+
+    # The following is used to build release packages.
+    # Users should never use it.
+    suffix = os.getenv("D2_VERSION_SUFFIX", "")
+    version = version + suffix
+    if os.getenv("BUILD_NIGHTLY", "0") == "1":
+        from datetime import datetime
+
+        date_str = datetime.today().strftime("%y%m%d")
+        version = version + ".dev" + date_str
+
+        new_init_py = [l for l in init_py if not l.startswith("__version__")]
+        new_init_py.append('__version__ = "{}"\n'.format(version))
+        with open(init_py_path, "w") as f:
+            f.write("".join(new_init_py))
+    return version
+
+
+def get_extensions():
+    this_dir = path.dirname(path.abspath(__file__))
+    extensions_dir = path.join(this_dir, "detectron2", "layers", "csrc")
+
+    main_source = path.join(extensions_dir, "vision.cpp")
+    sources = glob.glob(path.join(extensions_dir, "**", "*.cpp"))
+    source_cuda = glob.glob(path.join(extensions_dir, "**", "*.cu")) + glob.glob(
+        path.join(extensions_dir, "*.cu")
+    )
+
+    sources = [main_source] + sources
+    extension = CppExtension
+
+    extra_compile_args = {"cxx": []}
+    define_macros = []
+
+    if (
+        torch.cuda.is_available() and CUDA_HOME is not None and os.path.isdir(CUDA_HOME)
+    ) or os.getenv("FORCE_CUDA", "0") == "1":
+        extension = CUDAExtension
+        sources += source_cuda
+        define_macros += [("WITH_CUDA", None)]
+        extra_compile_args["nvcc"] = [
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+
+        # It's better if pytorch can do this by default ..
+        CC = os.environ.get("CC", None)
+        if CC is not None:
+            extra_compile_args["nvcc"].append("-ccbin={}".format(CC))
+
+    include_dirs = [extensions_dir]
+
+    ext_modules = [
+        extension(
+            "detectron2._C",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+
+    return ext_modules
+
+
+def get_model_zoo_configs() -> List[str]:
+    """
+    Return a list of configs to include in package for model zoo. Copy over these configs inside
+    detectron2/model_zoo.
+    """
+
+    # Use absolute paths while symlinking.
+    source_configs_dir = path.join(path.dirname(path.realpath(__file__)), "configs")
+    destination = path.join(
+        path.dirname(path.realpath(__file__)), "detectron2", "model_zoo", "configs"
+    )
+    # Symlink the config directory inside package to have a cleaner pip install.
+
+    # Remove stale symlink/directory from a previous build.
+    if path.exists(source_configs_dir):
+        if path.islink(destination):
+            os.unlink(destination)
+        elif path.isdir(destination):
+            shutil.rmtree(destination)
+
+    if not path.exists(destination):
+        try:
+            os.symlink(source_configs_dir, destination)
+        except OSError:
+            # Fall back to copying if symlink fails: ex. on Windows.
+            shutil.copytree(source_configs_dir, destination)
+
+    config_paths = glob.glob("configs/**/*.yaml", recursive=True)
+    return config_paths
+
+
+setup(
+    name="detectron2",
+    version=get_version(),
+    author="FAIR",
+    url="https://github.com/facebookresearch/detectron2",
+    description="Detectron2 is FAIR's next-generation research "
+    "platform for object detection and segmentation.",
+    packages=find_packages(exclude=("configs", "tests*")),
+    package_data={"detectron2.model_zoo": get_model_zoo_configs()},
+    python_requires=">=3.6",
+    install_requires=[
+        "termcolor>=1.1",
+        "Pillow",  # you can also use pillow-simd for better performance
+        "yacs>=0.1.6",
+        "tabulate",
+        "cloudpickle",
+        "matplotlib",
+        "mock",
+        "tqdm>4.29.0",
+        "tensorboard",
+        "fvcore>=0.1.1",
+        "future",  # used by caffe2
+        "pydot",  # used to save caffe2 SVGs
+    ],
+    extras_require={
+        "all": ["shapely", "psutil"],
+        "dev": [
+            "flake8==3.7.9",
+            "isort",
+            "black @ git+https://github.com/psf/black@673327449f86fce558adde153bb6cbe54bfebad2",
+            "flake8-bugbear",
+            "flake8-comprehensions",
+        ],
+    },
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/README.md b/preprocess/humanparsing/mhp_extension/detectron2/tests/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f560384045ab4f6bc2beabef1170308fca117eb3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/README.md
@@ -0,0 +1,9 @@
+## Unit Tests
+
+To run the unittests, do:
+```
+cd detectron2
+python -m unittest discover -v -s ./tests
+```
+
+There are also end-to-end inference & training tests, in [dev/run_*_tests.sh](../dev).
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..168f9979a4623806934b0ff1102ac166704e7dec
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/data/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_coco.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cd807d0ae465ad2e060a373f2e75db2483771c7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_coco.py
@@ -0,0 +1,77 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import json
+import numpy as np
+import os
+import tempfile
+import unittest
+import pycocotools
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.data.datasets.coco import convert_to_coco_dict, load_coco_json
+from detectron2.structures import BoxMode
+
+
+def make_mask():
+    """
+    Makes a donut shaped binary mask.
+    """
+    H = 100
+    W = 100
+    mask = np.zeros([H, W], dtype=np.uint8)
+    for x in range(W):
+        for y in range(H):
+            d = np.linalg.norm(np.array([W, H]) / 2 - np.array([x, y]))
+            if d > 10 and d < 20:
+                mask[y, x] = 1
+    return mask
+
+
+def make_dataset_dicts(mask):
+    """
+    Returns a list of dicts that represents a single COCO data point for
+    object detection. The single instance given by `mask` is represented by
+    RLE.
+    """
+    record = {}
+    record["file_name"] = "test"
+    record["image_id"] = 0
+    record["height"] = mask.shape[0]
+    record["width"] = mask.shape[1]
+
+    y, x = np.nonzero(mask)
+    segmentation = pycocotools.mask.encode(np.asarray(mask, order="F"))
+    min_x = np.min(x)
+    max_x = np.max(x)
+    min_y = np.min(y)
+    max_y = np.max(y)
+    obj = {
+        "bbox": [min_x, min_y, max_x, max_y],
+        "bbox_mode": BoxMode.XYXY_ABS,
+        "category_id": 0,
+        "iscrowd": 0,
+        "segmentation": segmentation,
+    }
+    record["annotations"] = [obj]
+    return [record]
+
+
+class TestRLEToJson(unittest.TestCase):
+    def test(self):
+        # Make a dummy dataset.
+        mask = make_mask()
+        DatasetCatalog.register("test_dataset", lambda: make_dataset_dicts(mask))
+        MetadataCatalog.get("test_dataset").set(thing_classes=["test_label"])
+
+        # Dump to json.
+        json_dict = convert_to_coco_dict("test_dataset")
+        with tempfile.TemporaryDirectory() as tmpdir:
+            json_file_name = os.path.join(tmpdir, "test.json")
+            with open(json_file_name, "w") as f:
+                json.dump(json_dict, f)
+            # Load from json.
+            dicts = load_coco_json(json_file_name, "")
+
+        # Check the loaded mask matches the original.
+        anno = dicts[0]["annotations"][0]
+        loaded_mask = pycocotools.mask.decode(anno["segmentation"])
+        self.assertTrue(np.array_equal(loaded_mask, mask))
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_detection_utils.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_detection_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdd94dd92366418347cc74a58e807240fd795111
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_detection_utils.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import copy
+import numpy as np
+import unittest
+import pycocotools.mask as mask_util
+
+from detectron2.data import detection_utils
+from detectron2.data import transforms as T
+from detectron2.structures import BitMasks, BoxMode
+
+
+class TestTransformAnnotations(unittest.TestCase):
+    def test_transform_simple_annotation(self):
+        transforms = T.TransformList([T.HFlipTransform(400)])
+        anno = {
+            "bbox": np.asarray([10, 10, 200, 300]),
+            "bbox_mode": BoxMode.XYXY_ABS,
+            "category_id": 3,
+            "segmentation": [[10, 10, 100, 100, 100, 10], [150, 150, 200, 150, 200, 200]],
+        }
+
+        output = detection_utils.transform_instance_annotations(anno, transforms, (400, 400))
+        self.assertTrue(np.allclose(output["bbox"], [200, 10, 390, 300]))
+        self.assertEqual(len(output["segmentation"]), len(anno["segmentation"]))
+        self.assertTrue(np.allclose(output["segmentation"][0], [390, 10, 300, 100, 300, 10]))
+
+        detection_utils.annotations_to_instances([output, output], (400, 400))
+
+    def test_flip_keypoints(self):
+        transforms = T.TransformList([T.HFlipTransform(400)])
+        anno = {
+            "bbox": np.asarray([10, 10, 200, 300]),
+            "bbox_mode": BoxMode.XYXY_ABS,
+            "keypoints": np.random.rand(17, 3) * 50 + 15,
+        }
+
+        output = detection_utils.transform_instance_annotations(
+            copy.deepcopy(anno),
+            transforms,
+            (400, 400),
+            keypoint_hflip_indices=detection_utils.create_keypoint_hflip_indices(
+                ["keypoints_coco_2017_train"]
+            ),
+        )
+        # The first keypoint is nose
+        self.assertTrue(np.allclose(output["keypoints"][0, 0], 400 - anno["keypoints"][0, 0]))
+        # The last 16 keypoints are 8 left-right pairs
+        self.assertTrue(
+            np.allclose(
+                output["keypoints"][1:, 0].reshape(-1, 2)[:, ::-1],
+                400 - anno["keypoints"][1:, 0].reshape(-1, 2),
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                output["keypoints"][1:, 1:].reshape(-1, 2, 2)[:, ::-1, :],
+                anno["keypoints"][1:, 1:].reshape(-1, 2, 2),
+            )
+        )
+
+    def test_transform_RLE(self):
+        transforms = T.TransformList([T.HFlipTransform(400)])
+        mask = np.zeros((300, 400), order="F").astype("uint8")
+        mask[:, :200] = 1
+
+        anno = {
+            "bbox": np.asarray([10, 10, 200, 300]),
+            "bbox_mode": BoxMode.XYXY_ABS,
+            "segmentation": mask_util.encode(mask[:, :, None])[0],
+            "category_id": 3,
+        }
+        output = detection_utils.transform_instance_annotations(
+            copy.deepcopy(anno), transforms, (300, 400)
+        )
+        mask = output["segmentation"]
+        self.assertTrue((mask[:, 200:] == 1).all())
+        self.assertTrue((mask[:, :200] == 0).all())
+
+        inst = detection_utils.annotations_to_instances(
+            [output, output], (400, 400), mask_format="bitmask"
+        )
+        self.assertTrue(isinstance(inst.gt_masks, BitMasks))
+
+    def test_transform_RLE_resize(self):
+        transforms = T.TransformList(
+            [T.HFlipTransform(400), T.ScaleTransform(300, 400, 400, 400, "bilinear")]
+        )
+        mask = np.zeros((300, 400), order="F").astype("uint8")
+        mask[:, :200] = 1
+
+        anno = {
+            "bbox": np.asarray([10, 10, 200, 300]),
+            "bbox_mode": BoxMode.XYXY_ABS,
+            "segmentation": mask_util.encode(mask[:, :, None])[0],
+            "category_id": 3,
+        }
+        output = detection_utils.transform_instance_annotations(
+            copy.deepcopy(anno), transforms, (400, 400)
+        )
+
+        inst = detection_utils.annotations_to_instances(
+            [output, output], (400, 400), mask_format="bitmask"
+        )
+        self.assertTrue(isinstance(inst.gt_masks, BitMasks))
+
+    def test_gen_crop(self):
+        instance = {"bbox": [10, 10, 100, 100], "bbox_mode": BoxMode.XYXY_ABS}
+        t = detection_utils.gen_crop_transform_with_instance((10, 10), (150, 150), instance)
+        # the box center must fall into the cropped region
+        self.assertTrue(t.x0 <= 55 <= t.x0 + t.w)
+
+    def test_gen_crop_outside_boxes(self):
+        instance = {"bbox": [10, 10, 100, 100], "bbox_mode": BoxMode.XYXY_ABS}
+        with self.assertRaises(AssertionError):
+            detection_utils.gen_crop_transform_with_instance((10, 10), (15, 15), instance)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_rotation_transform.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_rotation_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..45faf7e25eb08d70e92e5f6be326083ed0d23c76
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_rotation_transform.py
@@ -0,0 +1,62 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import unittest
+
+from detectron2.data.transforms.transform import RotationTransform
+
+
+class TestRotationTransform(unittest.TestCase):
+    def assertEqualsArrays(self, a1, a2):
+        self.assertTrue(np.allclose(a1, a2))
+
+    def randomData(self, h=5, w=5):
+        image = np.random.rand(h, w)
+        coords = np.array([[i, j] for j in range(h + 1) for i in range(w + 1)], dtype=float)
+        return image, coords, h, w
+
+    def test180(self):
+        image, coords, h, w = self.randomData(6, 6)
+        rot = RotationTransform(h, w, 180, expand=False, center=None)
+        self.assertEqualsArrays(rot.apply_image(image), image[::-1, ::-1])
+        rotated_coords = [[w - c[0], h - c[1]] for c in coords]
+        self.assertEqualsArrays(rot.apply_coords(coords), rotated_coords)
+
+    def test45_coords(self):
+        _, coords, h, w = self.randomData(4, 6)
+        rot = RotationTransform(h, w, 45, expand=False, center=None)
+        rotated_coords = [
+            [(x + y - (h + w) / 2) / np.sqrt(2) + w / 2, h / 2 + (y + (w - h) / 2 - x) / np.sqrt(2)]
+            for (x, y) in coords
+        ]
+        self.assertEqualsArrays(rot.apply_coords(coords), rotated_coords)
+
+    def test90(self):
+        image, coords, h, w = self.randomData()
+        rot = RotationTransform(h, w, 90, expand=False, center=None)
+        self.assertEqualsArrays(rot.apply_image(image), image.T[::-1])
+        rotated_coords = [[c[1], w - c[0]] for c in coords]
+        self.assertEqualsArrays(rot.apply_coords(coords), rotated_coords)
+
+    def test90_expand(self):  # non-square image
+        image, coords, h, w = self.randomData(h=5, w=8)
+        rot = RotationTransform(h, w, 90, expand=True, center=None)
+        self.assertEqualsArrays(rot.apply_image(image), image.T[::-1])
+        rotated_coords = [[c[1], w - c[0]] for c in coords]
+        self.assertEqualsArrays(rot.apply_coords(coords), rotated_coords)
+
+    def test_center_expand(self):
+        # center has no effect if expand=True because it only affects shifting
+        image, coords, h, w = self.randomData(h=5, w=8)
+        angle = np.random.randint(360)
+        rot1 = RotationTransform(h, w, angle, expand=True, center=None)
+        rot2 = RotationTransform(h, w, angle, expand=True, center=(0, 0))
+        rot3 = RotationTransform(h, w, angle, expand=True, center=(h, w))
+        rot4 = RotationTransform(h, w, angle, expand=True, center=(2, 5))
+        for r1 in [rot1, rot2, rot3, rot4]:
+            for r2 in [rot1, rot2, rot3, rot4]:
+                self.assertEqualsArrays(r1.apply_image(image), r2.apply_image(image))
+                self.assertEqualsArrays(r1.apply_coords(coords), r2.apply_coords(coords))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_sampler.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1256a87a9cc3405ac20bb6b2cf1ee0b22b8f180f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_sampler.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import unittest
+from torch.utils.data.sampler import SequentialSampler
+
+from detectron2.data.samplers import GroupedBatchSampler
+
+
+class TestGroupedBatchSampler(unittest.TestCase):
+    def test_missing_group_id(self):
+        sampler = SequentialSampler(list(range(100)))
+        group_ids = [1] * 100
+        samples = GroupedBatchSampler(sampler, group_ids, 2)
+
+        for mini_batch in samples:
+            self.assertEqual(len(mini_batch), 2)
+
+    def test_groups(self):
+        sampler = SequentialSampler(list(range(100)))
+        group_ids = [1, 0] * 50
+        samples = GroupedBatchSampler(sampler, group_ids, 2)
+
+        for mini_batch in samples:
+            self.assertEqual((mini_batch[0] + mini_batch[1]) % 2, 0)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_transforms.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d8551887aca5d5fa773d33227cb1685f4e2a8c8
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/data/test_transforms.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import numpy as np
+import unittest
+from unittest import mock
+
+from detectron2.config import get_cfg
+from detectron2.data import detection_utils
+from detectron2.data import transforms as T
+from detectron2.utils.logger import setup_logger
+
+logger = logging.getLogger(__name__)
+
+
+class TestTransforms(unittest.TestCase):
+    def setUp(self):
+        setup_logger()
+
+    def test_apply_rotated_boxes(self):
+        np.random.seed(125)
+        cfg = get_cfg()
+        is_train = True
+        transform_gen = detection_utils.build_transform_gen(cfg, is_train)
+        image = np.random.rand(200, 300)
+        image, transforms = T.apply_transform_gens(transform_gen, image)
+        image_shape = image.shape[:2]  # h, w
+        assert image_shape == (800, 1200)
+        annotation = {"bbox": [179, 97, 62, 40, -56]}
+
+        boxes = np.array([annotation["bbox"]], dtype=np.float64)  # boxes.shape = (1, 5)
+        transformed_bbox = transforms.apply_rotated_box(boxes)[0]
+
+        expected_bbox = np.array([484, 388, 248, 160, 56], dtype=np.float64)
+        err_msg = "transformed_bbox = {}, expected {}".format(transformed_bbox, expected_bbox)
+        assert np.allclose(transformed_bbox, expected_bbox), err_msg
+
+    def test_apply_rotated_boxes_unequal_scaling_factor(self):
+        np.random.seed(125)
+        h, w = 400, 200
+        newh, neww = 800, 800
+        image = np.random.rand(h, w)
+        transform_gen = []
+        transform_gen.append(T.Resize(shape=(newh, neww)))
+        image, transforms = T.apply_transform_gens(transform_gen, image)
+        image_shape = image.shape[:2]  # h, w
+        assert image_shape == (newh, neww)
+
+        boxes = np.array(
+            [
+                [150, 100, 40, 20, 0],
+                [150, 100, 40, 20, 30],
+                [150, 100, 40, 20, 90],
+                [150, 100, 40, 20, -90],
+            ],
+            dtype=np.float64,
+        )
+        transformed_boxes = transforms.apply_rotated_box(boxes)
+
+        expected_bboxes = np.array(
+            [
+                [600, 200, 160, 40, 0],
+                [600, 200, 144.22205102, 52.91502622, 49.10660535],
+                [600, 200, 80, 80, 90],
+                [600, 200, 80, 80, -90],
+            ],
+            dtype=np.float64,
+        )
+        err_msg = "transformed_boxes = {}, expected {}".format(transformed_boxes, expected_bboxes)
+        assert np.allclose(transformed_boxes, expected_bboxes), err_msg
+
+    def test_print_transform_gen(self):
+        t = T.RandomCrop("relative", (100, 100))
+        self.assertTrue(str(t) == "RandomCrop(crop_type='relative', crop_size=(100, 100))")
+
+        t = T.RandomFlip(prob=0.5)
+        self.assertTrue(str(t) == "RandomFlip(prob=0.5)")
+
+        t = T.RandomFlip()
+        self.assertTrue(str(t) == "RandomFlip()")
+
+    def test_random_apply_prob_out_of_range_check(self):
+        # GIVEN
+        test_probabilities = {0.0: True, 0.5: True, 1.0: True, -0.01: False, 1.01: False}
+
+        # WHEN
+        for given_probability, is_valid in test_probabilities.items():
+            # THEN
+            if not is_valid:
+                self.assertRaises(AssertionError, T.RandomApply, None, prob=given_probability)
+            else:
+                T.RandomApply(T.NoOpTransform(), prob=given_probability)
+
+    def test_random_apply_wrapping_transform_gen_probability_occured_evaluation(self):
+        # GIVEN
+        transform_mock = mock.MagicMock(name="MockTransform", spec=T.TransformGen)
+        image_mock = mock.MagicMock(name="MockImage")
+        random_apply = T.RandomApply(transform_mock, prob=0.001)
+
+        # WHEN
+        with mock.patch.object(random_apply, "_rand_range", return_value=0.0001):
+            transform = random_apply.get_transform(image_mock)
+
+        # THEN
+        transform_mock.get_transform.assert_called_once_with(image_mock)
+        self.assertIsNot(transform, transform_mock)
+
+    def test_random_apply_wrapping_std_transform_probability_occured_evaluation(self):
+        # GIVEN
+        transform_mock = mock.MagicMock(name="MockTransform", spec=T.Transform)
+        image_mock = mock.MagicMock(name="MockImage")
+        random_apply = T.RandomApply(transform_mock, prob=0.001)
+
+        # WHEN
+        with mock.patch.object(random_apply, "_rand_range", return_value=0.0001):
+            transform = random_apply.get_transform(image_mock)
+
+        # THEN
+        self.assertIs(transform, transform_mock)
+
+    def test_random_apply_probability_not_occured_evaluation(self):
+        # GIVEN
+        transform_mock = mock.MagicMock(name="MockTransform", spec=T.TransformGen)
+        image_mock = mock.MagicMock(name="MockImage")
+        random_apply = T.RandomApply(transform_mock, prob=0.001)
+
+        # WHEN
+        with mock.patch.object(random_apply, "_rand_range", return_value=0.9):
+            transform = random_apply.get_transform(image_mock)
+
+        # THEN
+        transform_mock.get_transform.assert_not_called()
+        self.assertIsInstance(transform, T.NoOpTransform)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_mask_ops.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_mask_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d180627354b6b9d8e0776d70f78e91ee5e530210
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_mask_ops.py
@@ -0,0 +1,190 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import contextlib
+import io
+import numpy as np
+import unittest
+from collections import defaultdict
+import torch
+import tqdm
+from fvcore.common.benchmark import benchmark
+from fvcore.common.file_io import PathManager
+from pycocotools.coco import COCO
+from tabulate import tabulate
+from torch.nn import functional as F
+
+from detectron2.data import MetadataCatalog
+from detectron2.layers.mask_ops import (
+    pad_masks,
+    paste_mask_in_image_old,
+    paste_masks_in_image,
+    scale_boxes,
+)
+from detectron2.structures import BitMasks, Boxes, BoxMode, PolygonMasks
+from detectron2.structures.masks import polygons_to_bitmask
+
+
+def iou_between_full_image_bit_masks(a, b):
+    intersect = (a & b).sum()
+    union = (a | b).sum()
+    return intersect / union
+
+
+def rasterize_polygons_with_grid_sample(full_image_bit_mask, box, mask_size, threshold=0.5):
+    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
+
+    img_h, img_w = full_image_bit_mask.shape
+
+    mask_y = np.arange(0.0, mask_size) + 0.5  # mask y sample coords in [0.5, mask_size - 0.5]
+    mask_x = np.arange(0.0, mask_size) + 0.5  # mask x sample coords in [0.5, mask_size - 0.5]
+    mask_y = mask_y / mask_size * (y1 - y0) + y0
+    mask_x = mask_x / mask_size * (x1 - x0) + x0
+
+    mask_x = (mask_x - 0.5) / (img_w - 1) * 2 + -1
+    mask_y = (mask_y - 0.5) / (img_h - 1) * 2 + -1
+    gy, gx = torch.meshgrid(torch.from_numpy(mask_y), torch.from_numpy(mask_x))
+    ind = torch.stack([gx, gy], dim=-1).to(dtype=torch.float32)
+
+    full_image_bit_mask = torch.from_numpy(full_image_bit_mask)
+    mask = F.grid_sample(
+        full_image_bit_mask[None, None, :, :].to(dtype=torch.float32),
+        ind[None, :, :, :],
+        align_corners=True,
+    )
+
+    return mask[0, 0] >= threshold
+
+
+class TestMaskCropPaste(unittest.TestCase):
+    def setUp(self):
+        json_file = MetadataCatalog.get("coco_2017_val_100").json_file
+        if not PathManager.isfile(json_file):
+            raise unittest.SkipTest("{} not found".format(json_file))
+        with contextlib.redirect_stdout(io.StringIO()):
+            json_file = PathManager.get_local_path(json_file)
+            self.coco = COCO(json_file)
+
+    def test_crop_paste_consistency(self):
+        """
+        rasterize_polygons_within_box (used in training)
+        and
+        paste_masks_in_image (used in inference)
+        should be inverse operations to each other.
+
+        This function runs several implementation of the above two operations and prints
+        the reconstruction error.
+        """
+
+        anns = self.coco.loadAnns(self.coco.getAnnIds(iscrowd=False))  # avoid crowd annotations
+
+        selected_anns = anns[:100]
+
+        ious = []
+        for ann in tqdm.tqdm(selected_anns):
+            results = self.process_annotation(ann)
+            ious.append([k[2] for k in results])
+
+        ious = np.array(ious)
+        mean_ious = ious.mean(axis=0)
+        table = []
+        res_dic = defaultdict(dict)
+        for row, iou in zip(results, mean_ious):
+            table.append((row[0], row[1], iou))
+            res_dic[row[0]][row[1]] = iou
+        print(tabulate(table, headers=["rasterize", "paste", "iou"], tablefmt="simple"))
+        # assert that the reconstruction is good:
+        self.assertTrue(res_dic["polygon"]["aligned"] > 0.94)
+        self.assertTrue(res_dic["roialign"]["aligned"] > 0.95)
+
+    def process_annotation(self, ann, mask_side_len=28):
+        # Parse annotation data
+        img_info = self.coco.loadImgs(ids=[ann["image_id"]])[0]
+        height, width = img_info["height"], img_info["width"]
+        gt_polygons = [np.array(p, dtype=np.float64) for p in ann["segmentation"]]
+        gt_bbox = BoxMode.convert(ann["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+        gt_bit_mask = polygons_to_bitmask(gt_polygons, height, width)
+
+        # Run rasterize ..
+        torch_gt_bbox = torch.tensor(gt_bbox).to(dtype=torch.float32).reshape(-1, 4)
+        box_bitmasks = {
+            "polygon": PolygonMasks([gt_polygons]).crop_and_resize(torch_gt_bbox, mask_side_len)[0],
+            "gridsample": rasterize_polygons_with_grid_sample(gt_bit_mask, gt_bbox, mask_side_len),
+            "roialign": BitMasks(torch.from_numpy(gt_bit_mask[None, :, :])).crop_and_resize(
+                torch_gt_bbox, mask_side_len
+            )[0],
+        }
+
+        # Run paste ..
+        results = defaultdict(dict)
+        for k, box_bitmask in box_bitmasks.items():
+            padded_bitmask, scale = pad_masks(box_bitmask[None, :, :], 1)
+            scaled_boxes = scale_boxes(torch_gt_bbox, scale)
+
+            r = results[k]
+            r["old"] = paste_mask_in_image_old(
+                padded_bitmask[0], scaled_boxes[0], height, width, threshold=0.5
+            )
+            r["aligned"] = paste_masks_in_image(
+                box_bitmask[None, :, :], Boxes(torch_gt_bbox), (height, width)
+            )[0]
+
+        table = []
+        for rasterize_method, r in results.items():
+            for paste_method, mask in r.items():
+                mask = np.asarray(mask)
+                iou = iou_between_full_image_bit_masks(gt_bit_mask.astype("uint8"), mask)
+                table.append((rasterize_method, paste_method, iou))
+        return table
+
+    def test_polygon_area(self):
+        # Draw polygon boxes
+        for d in [5.0, 10.0, 1000.0]:
+            polygon = PolygonMasks([[[0, 0, 0, d, d, d, d, 0]]])
+            area = polygon.area()[0]
+            target = d ** 2
+            self.assertEqual(area, target)
+
+        # Draw polygon triangles
+        for d in [5.0, 10.0, 1000.0]:
+            polygon = PolygonMasks([[[0, 0, 0, d, d, d]]])
+            area = polygon.area()[0]
+            target = d ** 2 / 2
+            self.assertEqual(area, target)
+
+
+def benchmark_paste():
+    S = 800
+    H, W = image_shape = (S, S)
+    N = 64
+    torch.manual_seed(42)
+    masks = torch.rand(N, 28, 28)
+
+    center = torch.rand(N, 2) * 600 + 100
+    wh = torch.clamp(torch.randn(N, 2) * 40 + 200, min=50)
+    x0y0 = torch.clamp(center - wh * 0.5, min=0.0)
+    x1y1 = torch.clamp(center + wh * 0.5, max=S)
+    boxes = Boxes(torch.cat([x0y0, x1y1], axis=1))
+
+    def func(device, n=3):
+        m = masks.to(device=device)
+        b = boxes.to(device=device)
+
+        def bench():
+            for _ in range(n):
+                paste_masks_in_image(m, b, image_shape)
+            if device.type == "cuda":
+                torch.cuda.synchronize()
+
+        return bench
+
+    specs = [{"device": torch.device("cpu"), "n": 3}]
+    if torch.cuda.is_available():
+        specs.append({"device": torch.device("cuda"), "n": 3})
+
+    benchmark(func, "paste_masks", specs, num_iters=10, warmup_iters=2)
+
+
+if __name__ == "__main__":
+    benchmark_paste()
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_nms_rotated.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_nms_rotated.py
new file mode 100644
index 0000000000000000000000000000000000000000..94b346c524d2c372273dfe992df045962b9605cd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_nms_rotated.py
@@ -0,0 +1,188 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from __future__ import absolute_import, division, print_function, unicode_literals
+import numpy as np
+import unittest
+import torch
+from torchvision import ops
+
+from detectron2.layers import batched_nms, batched_nms_rotated, nms_rotated
+
+
+def nms_edit_distance(keep1, keep2):
+    """
+    Compare the "keep" result of two nms call.
+    They are allowed to be different in terms of edit distance
+    due to floating point precision issues, e.g.,
+    if a box happen to have an IoU of 0.5 with another box,
+    one implentation may choose to keep it while another may discard it.
+    """
+    if torch.equal(keep1, keep2):
+        # they should be equal most of the time
+        return 0
+    keep1, keep2 = tuple(keep1.cpu()), tuple(keep2.cpu())
+    m, n = len(keep1), len(keep2)
+
+    # edit distance with DP
+    f = [np.arange(n + 1), np.arange(n + 1)]
+    for i in range(m):
+        cur_row = i % 2
+        other_row = (i + 1) % 2
+        f[other_row][0] = i + 1
+        for j in range(n):
+            f[other_row][j + 1] = (
+                f[cur_row][j]
+                if keep1[i] == keep2[j]
+                else min(min(f[cur_row][j], f[cur_row][j + 1]), f[other_row][j]) + 1
+            )
+    return f[m % 2][n]
+
+
+class TestNMSRotated(unittest.TestCase):
+    def reference_horizontal_nms(self, boxes, scores, iou_threshold):
+        """
+        Args:
+            box_scores (N, 5): boxes in corner-form and probabilities.
+                (Note here 5 == 4 + 1, i.e., 4-dim horizontal box + 1-dim prob)
+            iou_threshold: intersection over union threshold.
+        Returns:
+             picked: a list of indexes of the kept boxes
+        """
+        picked = []
+        _, indexes = scores.sort(descending=True)
+        while len(indexes) > 0:
+            current = indexes[0]
+            picked.append(current.item())
+            if len(indexes) == 1:
+                break
+            current_box = boxes[current, :]
+            indexes = indexes[1:]
+            rest_boxes = boxes[indexes, :]
+            iou = ops.box_iou(rest_boxes, current_box.unsqueeze(0)).squeeze(1)
+            indexes = indexes[iou <= iou_threshold]
+
+        return torch.as_tensor(picked)
+
+    def _create_tensors(self, N):
+        boxes = torch.rand(N, 4) * 100
+        # Note: the implementation of this function in torchvision is:
+        # boxes[:, 2:] += torch.rand(N, 2) * 100
+        # but it does not guarantee non-negative widths/heights constraints:
+        # boxes[:, 2] >= boxes[:, 0] and boxes[:, 3] >= boxes[:, 1]:
+        boxes[:, 2:] += boxes[:, :2]
+        scores = torch.rand(N)
+        return boxes, scores
+
+    def test_batched_nms_rotated_0_degree_cpu(self):
+        N = 2000
+        num_classes = 50
+        boxes, scores = self._create_tensors(N)
+        idxs = torch.randint(0, num_classes, (N,))
+        rotated_boxes = torch.zeros(N, 5)
+        rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
+        rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
+        rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
+        rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
+        err_msg = "Rotated NMS with 0 degree is incompatible with horizontal NMS for IoU={}"
+        for iou in [0.2, 0.5, 0.8]:
+            backup = boxes.clone()
+            keep_ref = batched_nms(boxes, scores, idxs, iou)
+            assert torch.allclose(boxes, backup), "boxes modified by batched_nms"
+            backup = rotated_boxes.clone()
+            keep = batched_nms_rotated(rotated_boxes, scores, idxs, iou)
+            assert torch.allclose(
+                rotated_boxes, backup
+            ), "rotated_boxes modified by batched_nms_rotated"
+            self.assertLessEqual(nms_edit_distance(keep, keep_ref), 1, err_msg.format(iou))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_batched_nms_rotated_0_degree_cuda(self):
+        N = 2000
+        num_classes = 50
+        boxes, scores = self._create_tensors(N)
+        idxs = torch.randint(0, num_classes, (N,))
+        rotated_boxes = torch.zeros(N, 5)
+        rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
+        rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
+        rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
+        rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
+        err_msg = "Rotated NMS with 0 degree is incompatible with horizontal NMS for IoU={}"
+        for iou in [0.2, 0.5, 0.8]:
+            backup = boxes.clone()
+            keep_ref = batched_nms(boxes.cuda(), scores.cuda(), idxs, iou)
+            self.assertTrue(torch.allclose(boxes, backup), "boxes modified by batched_nms")
+            backup = rotated_boxes.clone()
+            keep = batched_nms_rotated(rotated_boxes.cuda(), scores.cuda(), idxs, iou)
+            self.assertTrue(
+                torch.allclose(rotated_boxes, backup),
+                "rotated_boxes modified by batched_nms_rotated",
+            )
+            self.assertLessEqual(nms_edit_distance(keep, keep_ref), 1, err_msg.format(iou))
+
+    def test_nms_rotated_0_degree_cpu(self):
+        N = 1000
+        boxes, scores = self._create_tensors(N)
+        rotated_boxes = torch.zeros(N, 5)
+        rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
+        rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
+        rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
+        rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
+        err_msg = "Rotated NMS incompatible between CPU and reference implementation for IoU={}"
+        for iou in [0.5]:
+            keep_ref = self.reference_horizontal_nms(boxes, scores, iou)
+            keep = nms_rotated(rotated_boxes, scores, iou)
+            self.assertLessEqual(nms_edit_distance(keep, keep_ref), 1, err_msg.format(iou))
+
+    def test_nms_rotated_90_degrees_cpu(self):
+        N = 1000
+        boxes, scores = self._create_tensors(N)
+        rotated_boxes = torch.zeros(N, 5)
+        rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
+        rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
+        # Note for rotated_boxes[:, 2] and rotated_boxes[:, 3]:
+        # widths and heights are intentionally swapped here for 90 degrees case
+        # so that the reference horizontal nms could be used
+        rotated_boxes[:, 2] = boxes[:, 3] - boxes[:, 1]
+        rotated_boxes[:, 3] = boxes[:, 2] - boxes[:, 0]
+
+        rotated_boxes[:, 4] = torch.ones(N) * 90
+        err_msg = "Rotated NMS incompatible between CPU and reference implementation for IoU={}"
+        for iou in [0.2, 0.5, 0.8]:
+            keep_ref = self.reference_horizontal_nms(boxes, scores, iou)
+            keep = nms_rotated(rotated_boxes, scores, iou)
+            assert torch.equal(keep, keep_ref), err_msg.format(iou)
+
+    def test_nms_rotated_180_degrees_cpu(self):
+        N = 1000
+        boxes, scores = self._create_tensors(N)
+        rotated_boxes = torch.zeros(N, 5)
+        rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
+        rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
+        rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
+        rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
+        rotated_boxes[:, 4] = torch.ones(N) * 180
+        err_msg = "Rotated NMS incompatible between CPU and reference implementation for IoU={}"
+        for iou in [0.2, 0.5, 0.8]:
+            keep_ref = self.reference_horizontal_nms(boxes, scores, iou)
+            keep = nms_rotated(rotated_boxes, scores, iou)
+            assert torch.equal(keep, keep_ref), err_msg.format(iou)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_nms_rotated_0_degree_cuda(self):
+        N = 1000
+        boxes, scores = self._create_tensors(N)
+        rotated_boxes = torch.zeros(N, 5)
+        rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
+        rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
+        rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
+        rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
+        err_msg = "Rotated NMS incompatible between CPU and CUDA for IoU={}"
+
+        for iou in [0.2, 0.5, 0.8]:
+            r_cpu = nms_rotated(rotated_boxes, scores, iou)
+            r_cuda = nms_rotated(rotated_boxes.cuda(), scores.cuda(), iou)
+
+            assert torch.equal(r_cpu, r_cuda.cpu()), err_msg.format(iou)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_roi_align.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_roi_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..633d7c29c41b94b8a57c15aff728f23a71b535d1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_roi_align.py
@@ -0,0 +1,152 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import unittest
+import cv2
+import torch
+from fvcore.common.benchmark import benchmark
+
+from detectron2.layers.roi_align import ROIAlign
+
+
+class ROIAlignTest(unittest.TestCase):
+    def test_forward_output(self):
+        input = np.arange(25).reshape(5, 5).astype("float32")
+        """
+        0  1  2   3 4
+        5  6  7   8 9
+        10 11 12 13 14
+        15 16 17 18 19
+        20 21 22 23 24
+        """
+
+        output = self._simple_roialign(input, [1, 1, 3, 3], (4, 4), aligned=False)
+        output_correct = self._simple_roialign(input, [1, 1, 3, 3], (4, 4), aligned=True)
+
+        # without correction:
+        old_results = [
+            [7.5, 8, 8.5, 9],
+            [10, 10.5, 11, 11.5],
+            [12.5, 13, 13.5, 14],
+            [15, 15.5, 16, 16.5],
+        ]
+
+        # with 0.5 correction:
+        correct_results = [
+            [4.5, 5.0, 5.5, 6.0],
+            [7.0, 7.5, 8.0, 8.5],
+            [9.5, 10.0, 10.5, 11.0],
+            [12.0, 12.5, 13.0, 13.5],
+        ]
+        # This is an upsampled version of [[6, 7], [11, 12]]
+
+        self.assertTrue(np.allclose(output.flatten(), np.asarray(old_results).flatten()))
+        self.assertTrue(
+            np.allclose(output_correct.flatten(), np.asarray(correct_results).flatten())
+        )
+
+        # Also see similar issues in tensorflow at
+        # https://github.com/tensorflow/tensorflow/issues/26278
+
+    def test_resize(self):
+        H, W = 30, 30
+        input = np.random.rand(H, W).astype("float32") * 100
+        box = [10, 10, 20, 20]
+        output = self._simple_roialign(input, box, (5, 5), aligned=True)
+
+        input2x = cv2.resize(input, (W // 2, H // 2), interpolation=cv2.INTER_LINEAR)
+        box2x = [x / 2 for x in box]
+        output2x = self._simple_roialign(input2x, box2x, (5, 5), aligned=True)
+        diff = np.abs(output2x - output)
+        self.assertTrue(diff.max() < 1e-4)
+
+    def _simple_roialign(self, img, box, resolution, aligned=True):
+        """
+        RoiAlign with scale 1.0 and 0 sample ratio.
+        """
+        if isinstance(resolution, int):
+            resolution = (resolution, resolution)
+        op = ROIAlign(resolution, 1.0, 0, aligned=aligned)
+        input = torch.from_numpy(img[None, None, :, :].astype("float32"))
+
+        rois = [0] + list(box)
+        rois = torch.from_numpy(np.asarray(rois)[None, :].astype("float32"))
+        output = op.forward(input, rois)
+        if torch.cuda.is_available():
+            output_cuda = op.forward(input.cuda(), rois.cuda()).cpu()
+            self.assertTrue(torch.allclose(output, output_cuda))
+        return output[0, 0]
+
+    def _simple_roialign_with_grad(self, img, box, resolution, device):
+        if isinstance(resolution, int):
+            resolution = (resolution, resolution)
+
+        op = ROIAlign(resolution, 1.0, 0, aligned=True)
+        input = torch.from_numpy(img[None, None, :, :].astype("float32"))
+
+        rois = [0] + list(box)
+        rois = torch.from_numpy(np.asarray(rois)[None, :].astype("float32"))
+        input = input.to(device=device)
+        rois = rois.to(device=device)
+        input.requires_grad = True
+        output = op.forward(input, rois)
+        return input, output
+
+    def test_empty_box(self):
+        img = np.random.rand(5, 5)
+        box = [3, 4, 5, 4]
+        o = self._simple_roialign(img, box, 7)
+        self.assertTrue(o.shape == (7, 7))
+        self.assertTrue((o == 0).all())
+
+        for dev in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            input, output = self._simple_roialign_with_grad(img, box, 7, torch.device(dev))
+            output.sum().backward()
+            self.assertTrue(torch.allclose(input.grad, torch.zeros_like(input)))
+
+    def test_empty_batch(self):
+        input = torch.zeros(0, 3, 10, 10, dtype=torch.float32)
+        rois = torch.zeros(0, 5, dtype=torch.float32)
+        op = ROIAlign((7, 7), 1.0, 0, aligned=True)
+        output = op.forward(input, rois)
+        self.assertTrue(output.shape == (0, 3, 7, 7))
+
+
+def benchmark_roi_align():
+    from detectron2 import _C
+
+    def random_boxes(mean_box, stdev, N, maxsize):
+        ret = torch.rand(N, 4) * stdev + torch.tensor(mean_box, dtype=torch.float)
+        ret.clamp_(min=0, max=maxsize)
+        return ret
+
+    def func(N, C, H, W, nboxes_per_img):
+        input = torch.rand(N, C, H, W)
+        boxes = []
+        batch_idx = []
+        for k in range(N):
+            b = random_boxes([80, 80, 130, 130], 24, nboxes_per_img, H)
+            # try smaller boxes:
+            # b = random_boxes([100, 100, 110, 110], 4, nboxes_per_img, H)
+            boxes.append(b)
+            batch_idx.append(torch.zeros(nboxes_per_img, 1, dtype=torch.float32) + k)
+        boxes = torch.cat(boxes, axis=0)
+        batch_idx = torch.cat(batch_idx, axis=0)
+        boxes = torch.cat([batch_idx, boxes], axis=1)
+
+        input = input.cuda()
+        boxes = boxes.cuda()
+
+        def bench():
+            _C.roi_align_forward(input, boxes, 1.0, 7, 7, 0, True)
+            torch.cuda.synchronize()
+
+        return bench
+
+    args = [dict(N=2, C=512, H=256, W=256, nboxes_per_img=500)]
+    benchmark(func, "cuda_roialign", args, num_iters=20, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        benchmark_roi_align()
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_roi_align_rotated.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_roi_align_rotated.py
new file mode 100644
index 0000000000000000000000000000000000000000..1915b59ff6774a54ee0e5dbfdbe0ecf89f2e2235
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/layers/test_roi_align_rotated.py
@@ -0,0 +1,176 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import cv2
+import torch
+from torch.autograd import Variable, gradcheck
+
+from detectron2.layers.roi_align import ROIAlign
+from detectron2.layers.roi_align_rotated import ROIAlignRotated
+
+logger = logging.getLogger(__name__)
+
+
+class ROIAlignRotatedTest(unittest.TestCase):
+    def _box_to_rotated_box(self, box, angle):
+        return [
+            (box[0] + box[2]) / 2.0,
+            (box[1] + box[3]) / 2.0,
+            box[2] - box[0],
+            box[3] - box[1],
+            angle,
+        ]
+
+    def _rot90(self, img, num):
+        num = num % 4  # note: -1 % 4 == 3
+        for _ in range(num):
+            img = img.transpose(0, 1).flip(0)
+        return img
+
+    def test_forward_output_0_90_180_270(self):
+        for i in range(4):
+            # i = 0, 1, 2, 3 corresponding to 0, 90, 180, 270 degrees
+            img = torch.arange(25, dtype=torch.float32).reshape(5, 5)
+            """
+            0  1  2   3 4
+            5  6  7   8 9
+            10 11 12 13 14
+            15 16 17 18 19
+            20 21 22 23 24
+            """
+            box = [1, 1, 3, 3]
+            rotated_box = self._box_to_rotated_box(box=box, angle=90 * i)
+
+            result = self._simple_roi_align_rotated(img=img, box=rotated_box, resolution=(4, 4))
+
+            # Here's an explanation for 0 degree case:
+            # point 0 in the original input lies at [0.5, 0.5]
+            # (the center of bin [0, 1] x [0, 1])
+            # point 1 in the original input lies at [1.5, 0.5], etc.
+            # since the resolution is (4, 4) that divides [1, 3] x [1, 3]
+            # into 4 x 4 equal bins,
+            # the top-left bin is [1, 1.5] x [1, 1.5], and its center
+            # (1.25, 1.25) lies at the 3/4 position
+            # between point 0 and point 1, point 5 and point 6,
+            # point 0 and point 5, point 1 and point 6, so it can be calculated as
+            # 0.25*(0*0.25+1*0.75)+(5*0.25+6*0.75)*0.75 = 4.5
+            result_expected = torch.tensor(
+                [
+                    [4.5, 5.0, 5.5, 6.0],
+                    [7.0, 7.5, 8.0, 8.5],
+                    [9.5, 10.0, 10.5, 11.0],
+                    [12.0, 12.5, 13.0, 13.5],
+                ]
+            )
+            # This is also an upsampled version of [[6, 7], [11, 12]]
+
+            # When the box is rotated by 90 degrees CCW,
+            # the result would be rotated by 90 degrees CW, thus it's -i here
+            result_expected = self._rot90(result_expected, -i)
+
+            assert torch.allclose(result, result_expected)
+
+    def test_resize(self):
+        H, W = 30, 30
+        input = torch.rand(H, W) * 100
+        box = [10, 10, 20, 20]
+        rotated_box = self._box_to_rotated_box(box, angle=0)
+        output = self._simple_roi_align_rotated(img=input, box=rotated_box, resolution=(5, 5))
+
+        input2x = cv2.resize(input.numpy(), (W // 2, H // 2), interpolation=cv2.INTER_LINEAR)
+        input2x = torch.from_numpy(input2x)
+        box2x = [x / 2 for x in box]
+        rotated_box2x = self._box_to_rotated_box(box2x, angle=0)
+        output2x = self._simple_roi_align_rotated(img=input2x, box=rotated_box2x, resolution=(5, 5))
+        assert torch.allclose(output2x, output)
+
+    def _simple_roi_align_rotated(self, img, box, resolution):
+        """
+        RoiAlignRotated with scale 1.0 and 0 sample ratio.
+        """
+        op = ROIAlignRotated(output_size=resolution, spatial_scale=1.0, sampling_ratio=0)
+        input = img[None, None, :, :]
+
+        rois = [0] + list(box)
+        rois = torch.tensor(rois, dtype=torch.float32)[None, :]
+        result_cpu = op.forward(input, rois)
+        if torch.cuda.is_available():
+            result_cuda = op.forward(input.cuda(), rois.cuda())
+            assert torch.allclose(result_cpu, result_cuda.cpu())
+        return result_cpu[0, 0]
+
+    def test_empty_box(self):
+        img = torch.rand(5, 5)
+        out = self._simple_roi_align_rotated(img, [2, 3, 0, 0, 0], (7, 7))
+        self.assertTrue((out == 0).all())
+
+    def test_roi_align_rotated_gradcheck_cpu(self):
+        dtype = torch.float64
+        device = torch.device("cpu")
+        roi_align_rotated_op = ROIAlignRotated(
+            output_size=(5, 5), spatial_scale=0.5, sampling_ratio=1
+        ).to(dtype=dtype, device=device)
+        x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True)
+        # roi format is (batch index, x_center, y_center, width, height, angle)
+        rois = torch.tensor(
+            [[0, 4.5, 4.5, 9, 9, 0], [0, 2, 7, 4, 4, 0], [0, 7, 7, 4, 4, 0]],
+            dtype=dtype,
+            device=device,
+        )
+
+        def func(input):
+            return roi_align_rotated_op(input, rois)
+
+        assert gradcheck(func, (x,)), "gradcheck failed for RoIAlignRotated CPU"
+        assert gradcheck(func, (x.transpose(2, 3),)), "gradcheck failed for RoIAlignRotated CPU"
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_roi_align_rotated_gradient_cuda(self):
+        """
+        Compute gradients for ROIAlignRotated with multiple bounding boxes on the GPU,
+        and compare the result with ROIAlign
+        """
+        # torch.manual_seed(123)
+        dtype = torch.float64
+        device = torch.device("cuda")
+        pool_h, pool_w = (5, 5)
+
+        roi_align = ROIAlign(output_size=(pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(
+            device=device
+        )
+
+        roi_align_rotated = ROIAlignRotated(
+            output_size=(pool_h, pool_w), spatial_scale=1, sampling_ratio=2
+        ).to(device=device)
+
+        x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True)
+        # x_rotated = x.clone() won't work (will lead to grad_fun=CloneBackward)!
+        x_rotated = Variable(x.data.clone(), requires_grad=True)
+
+        # roi_rotated format is (batch index, x_center, y_center, width, height, angle)
+        rois_rotated = torch.tensor(
+            [[0, 4.5, 4.5, 9, 9, 0], [0, 2, 7, 4, 4, 0], [0, 7, 7, 4, 4, 0]],
+            dtype=dtype,
+            device=device,
+        )
+
+        y_rotated = roi_align_rotated(x_rotated, rois_rotated)
+        s_rotated = y_rotated.sum()
+        s_rotated.backward()
+
+        # roi format is (batch index, x1, y1, x2, y2)
+        rois = torch.tensor(
+            [[0, 0, 0, 9, 9], [0, 0, 5, 4, 9], [0, 5, 5, 9, 9]], dtype=dtype, device=device
+        )
+
+        y = roi_align(x, rois)
+        s = y.sum()
+        s.backward()
+
+        assert torch.allclose(
+            x.grad, x_rotated.grad
+        ), "gradients for ROIAlign and ROIAlignRotated mismatch on CUDA"
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_anchor_generator.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_anchor_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc14f0279ee682040082e51f96a41a267269d6ce
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_anchor_generator.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.anchor_generator import DefaultAnchorGenerator, RotatedAnchorGenerator
+
+logger = logging.getLogger(__name__)
+
+
+class TestAnchorGenerator(unittest.TestCase):
+    def test_default_anchor_generator(self):
+        cfg = get_cfg()
+        cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
+        cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]]
+
+        anchor_generator = DefaultAnchorGenerator(cfg, [ShapeSpec(stride=4)])
+
+        # only the last two dimensions of features matter here
+        num_images = 2
+        features = {"stage3": torch.rand(num_images, 96, 1, 2)}
+        anchors = anchor_generator([features["stage3"]])
+        expected_anchor_tensor = torch.tensor(
+            [
+                [-32.0, -8.0, 32.0, 8.0],
+                [-16.0, -16.0, 16.0, 16.0],
+                [-8.0, -32.0, 8.0, 32.0],
+                [-64.0, -16.0, 64.0, 16.0],
+                [-32.0, -32.0, 32.0, 32.0],
+                [-16.0, -64.0, 16.0, 64.0],
+                [-28.0, -8.0, 36.0, 8.0],  # -28.0 == -32.0 + STRIDE (4)
+                [-12.0, -16.0, 20.0, 16.0],
+                [-4.0, -32.0, 12.0, 32.0],
+                [-60.0, -16.0, 68.0, 16.0],
+                [-28.0, -32.0, 36.0, 32.0],
+                [-12.0, -64.0, 20.0, 64.0],
+            ]
+        )
+
+        assert torch.allclose(anchors[0].tensor, expected_anchor_tensor)
+
+    def test_default_anchor_generator_centered(self):
+        # test explicit args
+        anchor_generator = DefaultAnchorGenerator(
+            sizes=[32, 64], aspect_ratios=[0.25, 1, 4], strides=[4]
+        )
+
+        # only the last two dimensions of features matter here
+        num_images = 2
+        features = {"stage3": torch.rand(num_images, 96, 1, 2)}
+        expected_anchor_tensor = torch.tensor(
+            [
+                [-30.0, -6.0, 34.0, 10.0],
+                [-14.0, -14.0, 18.0, 18.0],
+                [-6.0, -30.0, 10.0, 34.0],
+                [-62.0, -14.0, 66.0, 18.0],
+                [-30.0, -30.0, 34.0, 34.0],
+                [-14.0, -62.0, 18.0, 66.0],
+                [-26.0, -6.0, 38.0, 10.0],
+                [-10.0, -14.0, 22.0, 18.0],
+                [-2.0, -30.0, 14.0, 34.0],
+                [-58.0, -14.0, 70.0, 18.0],
+                [-26.0, -30.0, 38.0, 34.0],
+                [-10.0, -62.0, 22.0, 66.0],
+            ]
+        )
+
+        anchors = anchor_generator([features["stage3"]])
+        assert torch.allclose(anchors[0].tensor, expected_anchor_tensor)
+
+        # doesn't work yet
+        # anchors = torch.jit.script(anchor_generator)([features["stage3"]])
+        # assert torch.allclose(anchors[0].tensor, expected_anchor_tensor)
+
+    def test_rrpn_anchor_generator(self):
+        cfg = get_cfg()
+        cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
+        cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]]
+        cfg.MODEL.ANCHOR_GENERATOR.ANGLES = [0, 45]  # test single list[float]
+        anchor_generator = RotatedAnchorGenerator(cfg, [ShapeSpec(stride=4)])
+
+        # only the last two dimensions of features matter here
+        num_images = 2
+        features = {"stage3": torch.rand(num_images, 96, 1, 2)}
+        anchors = anchor_generator([features["stage3"]])
+        expected_anchor_tensor = torch.tensor(
+            [
+                [0.0, 0.0, 64.0, 16.0, 0.0],
+                [0.0, 0.0, 64.0, 16.0, 45.0],
+                [0.0, 0.0, 32.0, 32.0, 0.0],
+                [0.0, 0.0, 32.0, 32.0, 45.0],
+                [0.0, 0.0, 16.0, 64.0, 0.0],
+                [0.0, 0.0, 16.0, 64.0, 45.0],
+                [0.0, 0.0, 128.0, 32.0, 0.0],
+                [0.0, 0.0, 128.0, 32.0, 45.0],
+                [0.0, 0.0, 64.0, 64.0, 0.0],
+                [0.0, 0.0, 64.0, 64.0, 45.0],
+                [0.0, 0.0, 32.0, 128.0, 0.0],
+                [0.0, 0.0, 32.0, 128.0, 45.0],
+                [4.0, 0.0, 64.0, 16.0, 0.0],  # 4.0 == 0.0 + STRIDE (4)
+                [4.0, 0.0, 64.0, 16.0, 45.0],
+                [4.0, 0.0, 32.0, 32.0, 0.0],
+                [4.0, 0.0, 32.0, 32.0, 45.0],
+                [4.0, 0.0, 16.0, 64.0, 0.0],
+                [4.0, 0.0, 16.0, 64.0, 45.0],
+                [4.0, 0.0, 128.0, 32.0, 0.0],
+                [4.0, 0.0, 128.0, 32.0, 45.0],
+                [4.0, 0.0, 64.0, 64.0, 0.0],
+                [4.0, 0.0, 64.0, 64.0, 45.0],
+                [4.0, 0.0, 32.0, 128.0, 0.0],
+                [4.0, 0.0, 32.0, 128.0, 45.0],
+            ]
+        )
+
+        assert torch.allclose(anchors[0].tensor, expected_anchor_tensor)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_box2box_transform.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_box2box_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d124d79fc0e17f268f6b5b50fcb8f8dfad59368
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_box2box_transform.py
@@ -0,0 +1,64 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.modeling.box_regression import Box2BoxTransform, Box2BoxTransformRotated
+
+logger = logging.getLogger(__name__)
+
+
+def random_boxes(mean_box, stdev, N):
+    return torch.rand(N, 4) * stdev + torch.tensor(mean_box, dtype=torch.float)
+
+
+class TestBox2BoxTransform(unittest.TestCase):
+    def test_reconstruction(self):
+        weights = (5, 5, 10, 10)
+        b2b_tfm = Box2BoxTransform(weights=weights)
+        src_boxes = random_boxes([10, 10, 20, 20], 1, 10)
+        dst_boxes = random_boxes([10, 10, 20, 20], 1, 10)
+
+        devices = [torch.device("cpu")]
+        if torch.cuda.is_available():
+            devices.append(torch.device("cuda"))
+        for device in devices:
+            src_boxes = src_boxes.to(device=device)
+            dst_boxes = dst_boxes.to(device=device)
+            deltas = b2b_tfm.get_deltas(src_boxes, dst_boxes)
+            dst_boxes_reconstructed = b2b_tfm.apply_deltas(deltas, src_boxes)
+            assert torch.allclose(dst_boxes, dst_boxes_reconstructed)
+
+
+def random_rotated_boxes(mean_box, std_length, std_angle, N):
+    return torch.cat(
+        [torch.rand(N, 4) * std_length, torch.rand(N, 1) * std_angle], dim=1
+    ) + torch.tensor(mean_box, dtype=torch.float)
+
+
+class TestBox2BoxTransformRotated(unittest.TestCase):
+    def test_reconstruction(self):
+        weights = (5, 5, 10, 10, 1)
+        b2b_transform = Box2BoxTransformRotated(weights=weights)
+        src_boxes = random_rotated_boxes([10, 10, 20, 20, -30], 5, 60.0, 10)
+        dst_boxes = random_rotated_boxes([10, 10, 20, 20, -30], 5, 60.0, 10)
+
+        devices = [torch.device("cpu")]
+        if torch.cuda.is_available():
+            devices.append(torch.device("cuda"))
+        for device in devices:
+            src_boxes = src_boxes.to(device=device)
+            dst_boxes = dst_boxes.to(device=device)
+            deltas = b2b_transform.get_deltas(src_boxes, dst_boxes)
+            dst_boxes_reconstructed = b2b_transform.apply_deltas(deltas, src_boxes)
+            assert torch.allclose(dst_boxes[:, :4], dst_boxes_reconstructed[:, :4], atol=1e-5)
+            # angle difference has to be normalized
+            assert torch.allclose(
+                (dst_boxes[:, 4] - dst_boxes_reconstructed[:, 4] + 180.0) % 360.0 - 180.0,
+                torch.zeros_like(dst_boxes[:, 4]),
+                atol=1e-4,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_fast_rcnn.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_fast_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..70b64d3db497bac52e127d02a543b14d2e37e8eb
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_fast_rcnn.py
@@ -0,0 +1,106 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.box_regression import Box2BoxTransform, Box2BoxTransformRotated
+from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
+from detectron2.modeling.roi_heads.rotated_fast_rcnn import RotatedFastRCNNOutputLayers
+from detectron2.structures import Boxes, Instances, RotatedBoxes
+from detectron2.utils.events import EventStorage
+
+logger = logging.getLogger(__name__)
+
+
+class FastRCNNTest(unittest.TestCase):
+    def test_fast_rcnn(self):
+        torch.manual_seed(132)
+
+        box_head_output_size = 8
+
+        box_predictor = FastRCNNOutputLayers(
+            ShapeSpec(channels=box_head_output_size),
+            box2box_transform=Box2BoxTransform(weights=(10, 10, 5, 5)),
+            num_classes=5,
+        )
+        feature_pooled = torch.rand(2, box_head_output_size)
+        predictions = box_predictor(feature_pooled)
+
+        proposal_boxes = torch.tensor([[0.8, 1.1, 3.2, 2.8], [2.3, 2.5, 7, 8]], dtype=torch.float32)
+        gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
+        proposal = Instances((10, 10))
+        proposal.proposal_boxes = Boxes(proposal_boxes)
+        proposal.gt_boxes = Boxes(gt_boxes)
+        proposal.gt_classes = torch.tensor([1, 2])
+
+        with EventStorage():  # capture events in a new storage to discard them
+            losses = box_predictor.losses(predictions, [proposal])
+
+        expected_losses = {
+            "loss_cls": torch.tensor(1.7951188087),
+            "loss_box_reg": torch.tensor(4.0357131958),
+        }
+        for name in expected_losses.keys():
+            assert torch.allclose(losses[name], expected_losses[name])
+
+    def test_fast_rcnn_empty_batch(self, device="cpu"):
+        box_predictor = FastRCNNOutputLayers(
+            ShapeSpec(channels=10),
+            box2box_transform=Box2BoxTransform(weights=(10, 10, 5, 5)),
+            num_classes=8,
+        ).to(device=device)
+
+        logits = torch.randn(0, 100, requires_grad=True, device=device)
+        deltas = torch.randn(0, 4, requires_grad=True, device=device)
+        losses = box_predictor.losses([logits, deltas], [])
+        for value in losses.values():
+            self.assertTrue(torch.allclose(value, torch.zeros_like(value)))
+        sum(losses.values()).backward()
+        self.assertTrue(logits.grad is not None)
+        self.assertTrue(deltas.grad is not None)
+
+        predictions, _ = box_predictor.inference([logits, deltas], [])
+        self.assertEqual(len(predictions), 0)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_fast_rcnn_empty_batch_cuda(self):
+        self.test_fast_rcnn_empty_batch(device=torch.device("cuda"))
+
+    def test_fast_rcnn_rotated(self):
+        torch.manual_seed(132)
+        box_head_output_size = 8
+
+        box_predictor = RotatedFastRCNNOutputLayers(
+            ShapeSpec(channels=box_head_output_size),
+            box2box_transform=Box2BoxTransformRotated(weights=(10, 10, 5, 5, 1)),
+            num_classes=5,
+        )
+        feature_pooled = torch.rand(2, box_head_output_size)
+        predictions = box_predictor(feature_pooled)
+        proposal_boxes = torch.tensor(
+            [[2, 1.95, 2.4, 1.7, 0], [4.65, 5.25, 4.7, 5.5, 0]], dtype=torch.float32
+        )
+        gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]], dtype=torch.float32)
+        proposal = Instances((10, 10))
+        proposal.proposal_boxes = RotatedBoxes(proposal_boxes)
+        proposal.gt_boxes = RotatedBoxes(gt_boxes)
+        proposal.gt_classes = torch.tensor([1, 2])
+
+        with EventStorage():  # capture events in a new storage to discard them
+            losses = box_predictor.losses(predictions, [proposal])
+
+        # Note: the expected losses are slightly different even if
+        # the boxes are essentially the same as in the FastRCNNOutput test, because
+        # bbox_pred in FastRCNNOutputLayers have different Linear layers/initialization
+        # between the two cases.
+        expected_losses = {
+            "loss_cls": torch.tensor(1.7920907736),
+            "loss_box_reg": torch.tensor(4.0410838127),
+        }
+        for name in expected_losses.keys():
+            assert torch.allclose(losses[name], expected_losses[name])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_model_e2e.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_model_e2e.py
new file mode 100644
index 0000000000000000000000000000000000000000..95fe6a09fd15f877544392ddeccd9906025b0fdd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_model_e2e.py
@@ -0,0 +1,154 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+
+import unittest
+import torch
+
+import detectron2.model_zoo as model_zoo
+from detectron2.config import get_cfg
+from detectron2.modeling import build_model
+from detectron2.structures import BitMasks, Boxes, ImageList, Instances
+from detectron2.utils.events import EventStorage
+
+
+def get_model_zoo(config_path):
+    """
+    Like model_zoo.get, but do not load any weights (even pretrained)
+    """
+    cfg_file = model_zoo.get_config_file(config_path)
+    cfg = get_cfg()
+    cfg.merge_from_file(cfg_file)
+    if not torch.cuda.is_available():
+        cfg.MODEL.DEVICE = "cpu"
+    return build_model(cfg)
+
+
+def create_model_input(img, inst=None):
+    if inst is not None:
+        return {"image": img, "instances": inst}
+    else:
+        return {"image": img}
+
+
+def get_empty_instance(h, w):
+    inst = Instances((h, w))
+    inst.gt_boxes = Boxes(torch.rand(0, 4))
+    inst.gt_classes = torch.tensor([]).to(dtype=torch.int64)
+    inst.gt_masks = BitMasks(torch.rand(0, h, w))
+    return inst
+
+
+def get_regular_bitmask_instances(h, w):
+    inst = Instances((h, w))
+    inst.gt_boxes = Boxes(torch.rand(3, 4))
+    inst.gt_boxes.tensor[:, 2:] += inst.gt_boxes.tensor[:, :2]
+    inst.gt_classes = torch.tensor([3, 4, 5]).to(dtype=torch.int64)
+    inst.gt_masks = BitMasks((torch.rand(3, h, w) > 0.5))
+    return inst
+
+
+class ModelE2ETest:
+    def setUp(self):
+        torch.manual_seed(43)
+        self.model = get_model_zoo(self.CONFIG_PATH)
+
+    def _test_eval(self, input_sizes):
+        inputs = [create_model_input(torch.rand(3, s[0], s[1])) for s in input_sizes]
+        self.model.eval()
+        self.model(inputs)
+
+    def _test_train(self, input_sizes, instances):
+        assert len(input_sizes) == len(instances)
+        inputs = [
+            create_model_input(torch.rand(3, s[0], s[1]), inst)
+            for s, inst in zip(input_sizes, instances)
+        ]
+        self.model.train()
+        with EventStorage():
+            losses = self.model(inputs)
+            sum(losses.values()).backward()
+            del losses
+
+    def _inf_tensor(self, *shape):
+        return 1.0 / torch.zeros(*shape, device=self.model.device)
+
+    def _nan_tensor(self, *shape):
+        return torch.zeros(*shape, device=self.model.device).fill_(float("nan"))
+
+    def test_empty_data(self):
+        instances = [get_empty_instance(200, 250), get_empty_instance(200, 249)]
+        self._test_eval([(200, 250), (200, 249)])
+        self._test_train([(200, 250), (200, 249)], instances)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
+    def test_eval_tocpu(self):
+        model = get_model_zoo(self.CONFIG_PATH).cpu()
+        model.eval()
+        input_sizes = [(200, 250), (200, 249)]
+        inputs = [create_model_input(torch.rand(3, s[0], s[1])) for s in input_sizes]
+        model(inputs)
+
+
+class MaskRCNNE2ETest(ModelE2ETest, unittest.TestCase):
+    CONFIG_PATH = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
+
+    def test_half_empty_data(self):
+        instances = [get_empty_instance(200, 250), get_regular_bitmask_instances(200, 249)]
+        self._test_train([(200, 250), (200, 249)], instances)
+
+    # This test is flaky because in some environment the output features are zero due to relu
+    # def test_rpn_inf_nan_data(self):
+    #     self.model.eval()
+    #     for tensor in [self._inf_tensor, self._nan_tensor]:
+    #         images = ImageList(tensor(1, 3, 512, 512), [(510, 510)])
+    #         features = {
+    #             "p2": tensor(1, 256, 256, 256),
+    #             "p3": tensor(1, 256, 128, 128),
+    #             "p4": tensor(1, 256, 64, 64),
+    #             "p5": tensor(1, 256, 32, 32),
+    #             "p6": tensor(1, 256, 16, 16),
+    #         }
+    #         props, _ = self.model.proposal_generator(images, features)
+    #         self.assertEqual(len(props[0]), 0)
+
+    def test_roiheads_inf_nan_data(self):
+        self.model.eval()
+        for tensor in [self._inf_tensor, self._nan_tensor]:
+            images = ImageList(tensor(1, 3, 512, 512), [(510, 510)])
+            features = {
+                "p2": tensor(1, 256, 256, 256),
+                "p3": tensor(1, 256, 128, 128),
+                "p4": tensor(1, 256, 64, 64),
+                "p5": tensor(1, 256, 32, 32),
+                "p6": tensor(1, 256, 16, 16),
+            }
+            props = [Instances((510, 510))]
+            props[0].proposal_boxes = Boxes([[10, 10, 20, 20]]).to(device=self.model.device)
+            props[0].objectness_logits = torch.tensor([1.0]).reshape(1, 1)
+            det, _ = self.model.roi_heads(images, features, props)
+            self.assertEqual(len(det[0]), 0)
+
+
+class RetinaNetE2ETest(ModelE2ETest, unittest.TestCase):
+    CONFIG_PATH = "COCO-Detection/retinanet_R_50_FPN_1x.yaml"
+
+    def test_inf_nan_data(self):
+        self.model.eval()
+        self.model.score_threshold = -999999999
+        for tensor in [self._inf_tensor, self._nan_tensor]:
+            images = ImageList(tensor(1, 3, 512, 512), [(510, 510)])
+            features = [
+                tensor(1, 256, 128, 128),
+                tensor(1, 256, 64, 64),
+                tensor(1, 256, 32, 32),
+                tensor(1, 256, 16, 16),
+                tensor(1, 256, 8, 8),
+            ]
+            anchors = self.model.anchor_generator(features)
+            box_cls, box_delta = self.model.head(features)
+            box_cls = [tensor(*k.shape) for k in box_cls]
+            box_delta = [tensor(*k.shape) for k in box_delta]
+            det = self.model.inference(box_cls, box_delta, anchors, images.image_sizes)
+            # all predictions (if any) are infinite or nan
+            if len(det[0]):
+                self.assertTrue(torch.isfinite(det[0].pred_boxes.tensor).sum() == 0)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_roi_heads.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_roi_heads.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a0630353ca1c2fbb33d2dee7ddb922d57cad3cd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_roi_heads.py
@@ -0,0 +1,108 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.modeling.backbone import build_backbone
+from detectron2.modeling.proposal_generator.build import build_proposal_generator
+from detectron2.modeling.roi_heads import build_roi_heads
+from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
+from detectron2.utils.events import EventStorage
+
+logger = logging.getLogger(__name__)
+
+
+class ROIHeadsTest(unittest.TestCase):
+    def test_roi_heads(self):
+        torch.manual_seed(121)
+        cfg = get_cfg()
+        cfg.MODEL.ROI_HEADS.NAME = "StandardROIHeads"
+        cfg.MODEL.ROI_BOX_HEAD.NAME = "FastRCNNConvFCHead"
+        cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2
+        cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2"
+        cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5)
+        backbone = build_backbone(cfg)
+        num_images = 2
+        images_tensor = torch.rand(num_images, 20, 30)
+        image_sizes = [(10, 10), (20, 30)]
+        images = ImageList(images_tensor, image_sizes)
+        num_channels = 1024
+        features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
+
+        image_shape = (15, 15)
+        gt_boxes0 = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
+        gt_instance0 = Instances(image_shape)
+        gt_instance0.gt_boxes = Boxes(gt_boxes0)
+        gt_instance0.gt_classes = torch.tensor([2, 1])
+        gt_boxes1 = torch.tensor([[1, 5, 2, 8], [7, 3, 10, 5]], dtype=torch.float32)
+        gt_instance1 = Instances(image_shape)
+        gt_instance1.gt_boxes = Boxes(gt_boxes1)
+        gt_instance1.gt_classes = torch.tensor([1, 2])
+        gt_instances = [gt_instance0, gt_instance1]
+
+        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
+        roi_heads = build_roi_heads(cfg, backbone.output_shape())
+
+        with EventStorage():  # capture events in a new storage to discard them
+            proposals, proposal_losses = proposal_generator(images, features, gt_instances)
+            _, detector_losses = roi_heads(images, features, proposals, gt_instances)
+
+        expected_losses = {
+            "loss_cls": torch.tensor(4.4236516953),
+            "loss_box_reg": torch.tensor(0.0091214813),
+        }
+        for name in expected_losses.keys():
+            self.assertTrue(torch.allclose(detector_losses[name], expected_losses[name]))
+
+    def test_rroi_heads(self):
+        torch.manual_seed(121)
+        cfg = get_cfg()
+        cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN"
+        cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator"
+        cfg.MODEL.ROI_HEADS.NAME = "RROIHeads"
+        cfg.MODEL.ROI_BOX_HEAD.NAME = "FastRCNNConvFCHead"
+        cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2
+        cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1)
+        cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead"
+        cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignRotated"
+        cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5, 1)
+        backbone = build_backbone(cfg)
+        num_images = 2
+        images_tensor = torch.rand(num_images, 20, 30)
+        image_sizes = [(10, 10), (20, 30)]
+        images = ImageList(images_tensor, image_sizes)
+        num_channels = 1024
+        features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
+
+        image_shape = (15, 15)
+        gt_boxes0 = torch.tensor([[2, 2, 2, 2, 30], [4, 4, 4, 4, 0]], dtype=torch.float32)
+        gt_instance0 = Instances(image_shape)
+        gt_instance0.gt_boxes = RotatedBoxes(gt_boxes0)
+        gt_instance0.gt_classes = torch.tensor([2, 1])
+        gt_boxes1 = torch.tensor([[1.5, 5.5, 1, 3, 0], [8.5, 4, 3, 2, -50]], dtype=torch.float32)
+        gt_instance1 = Instances(image_shape)
+        gt_instance1.gt_boxes = RotatedBoxes(gt_boxes1)
+        gt_instance1.gt_classes = torch.tensor([1, 2])
+        gt_instances = [gt_instance0, gt_instance1]
+
+        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
+        roi_heads = build_roi_heads(cfg, backbone.output_shape())
+
+        with EventStorage():  # capture events in a new storage to discard them
+            proposals, proposal_losses = proposal_generator(images, features, gt_instances)
+            _, detector_losses = roi_heads(images, features, proposals, gt_instances)
+
+        expected_losses = {
+            "loss_cls": torch.tensor(4.381618499755859),
+            "loss_box_reg": torch.tensor(0.0011829272843897343),
+        }
+        for name in expected_losses.keys():
+            err_msg = "detector_losses[{}] = {}, expected losses = {}".format(
+                name, detector_losses[name], expected_losses[name]
+            )
+            self.assertTrue(torch.allclose(detector_losses[name], expected_losses[name]), err_msg)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_roi_pooler.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_roi_pooler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aa3825c0196e4a6d89162e3d7c797e3d77b23bd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_roi_pooler.py
@@ -0,0 +1,85 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.structures import Boxes, RotatedBoxes
+
+logger = logging.getLogger(__name__)
+
+
+class TestROIPooler(unittest.TestCase):
+    def _rand_boxes(self, num_boxes, x_max, y_max):
+        coords = torch.rand(num_boxes, 4)
+        coords[:, 0] *= x_max
+        coords[:, 1] *= y_max
+        coords[:, 2] *= x_max
+        coords[:, 3] *= y_max
+        boxes = torch.zeros(num_boxes, 4)
+        boxes[:, 0] = torch.min(coords[:, 0], coords[:, 2])
+        boxes[:, 1] = torch.min(coords[:, 1], coords[:, 3])
+        boxes[:, 2] = torch.max(coords[:, 0], coords[:, 2])
+        boxes[:, 3] = torch.max(coords[:, 1], coords[:, 3])
+        return boxes
+
+    def _test_roialignv2_roialignrotated_match(self, device):
+        pooler_resolution = 14
+        canonical_level = 4
+        canonical_scale_factor = 2 ** canonical_level
+        pooler_scales = (1.0 / canonical_scale_factor,)
+        sampling_ratio = 0
+
+        N, C, H, W = 2, 4, 10, 8
+        N_rois = 10
+        std = 11
+        mean = 0
+        feature = (torch.rand(N, C, H, W) - 0.5) * 2 * std + mean
+
+        features = [feature.to(device)]
+
+        rois = []
+        rois_rotated = []
+        for _ in range(N):
+            boxes = self._rand_boxes(
+                num_boxes=N_rois, x_max=W * canonical_scale_factor, y_max=H * canonical_scale_factor
+            )
+
+            rotated_boxes = torch.zeros(N_rois, 5)
+            rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
+            rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
+            rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
+            rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
+            rois.append(Boxes(boxes).to(device))
+            rois_rotated.append(RotatedBoxes(rotated_boxes).to(device))
+
+        roialignv2_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type="ROIAlignV2",
+        )
+
+        roialignv2_out = roialignv2_pooler(features, rois)
+
+        roialignrotated_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type="ROIAlignRotated",
+        )
+
+        roialignrotated_out = roialignrotated_pooler(features, rois_rotated)
+
+        self.assertTrue(torch.allclose(roialignv2_out, roialignrotated_out, atol=1e-4))
+
+    def test_roialignv2_roialignrotated_match_cpu(self):
+        self._test_roialignv2_roialignrotated_match(device="cpu")
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_roialignv2_roialignrotated_match_cuda(self):
+        self._test_roialignv2_roialignrotated_match(device="cuda")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_rpn.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_rpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..967d2102b85f2d66e3f0b32b31805c4ac01afa0c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/modeling/test_rpn.py
@@ -0,0 +1,234 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.modeling.backbone import build_backbone
+from detectron2.modeling.proposal_generator.build import build_proposal_generator
+from detectron2.modeling.proposal_generator.rpn_outputs import find_top_rpn_proposals
+from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
+from detectron2.utils.events import EventStorage
+
+logger = logging.getLogger(__name__)
+
+
+class RPNTest(unittest.TestCase):
+    def test_rpn(self):
+        torch.manual_seed(121)
+        cfg = get_cfg()
+        cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
+        cfg.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
+        cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1)
+        backbone = build_backbone(cfg)
+        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
+        num_images = 2
+        images_tensor = torch.rand(num_images, 20, 30)
+        image_sizes = [(10, 10), (20, 30)]
+        images = ImageList(images_tensor, image_sizes)
+        image_shape = (15, 15)
+        num_channels = 1024
+        features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
+        gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
+        gt_instances = Instances(image_shape)
+        gt_instances.gt_boxes = Boxes(gt_boxes)
+        with EventStorage():  # capture events in a new storage to discard them
+            proposals, proposal_losses = proposal_generator(
+                images, features, [gt_instances[0], gt_instances[1]]
+            )
+
+        expected_losses = {
+            "loss_rpn_cls": torch.tensor(0.0804563984),
+            "loss_rpn_loc": torch.tensor(0.0990132466),
+        }
+        for name in expected_losses.keys():
+            err_msg = "proposal_losses[{}] = {}, expected losses = {}".format(
+                name, proposal_losses[name], expected_losses[name]
+            )
+            self.assertTrue(torch.allclose(proposal_losses[name], expected_losses[name]), err_msg)
+
+        expected_proposal_boxes = [
+            Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])),
+            Boxes(
+                torch.tensor(
+                    [
+                        [0, 0, 30, 20],
+                        [0, 0, 16.7862777710, 13.1362524033],
+                        [0, 0, 30, 13.3173446655],
+                        [0, 0, 10.8602609634, 20],
+                        [7.7165775299, 0, 27.3875980377, 20],
+                    ]
+                )
+            ),
+        ]
+
+        expected_objectness_logits = [
+            torch.tensor([0.1225359365, -0.0133192837]),
+            torch.tensor([0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783, -0.0428492837]),
+        ]
+
+        for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip(
+            proposals, expected_proposal_boxes, image_sizes, expected_objectness_logits
+        ):
+            self.assertEqual(len(proposal), len(expected_proposal_box))
+            self.assertEqual(proposal.image_size, im_size)
+            self.assertTrue(
+                torch.allclose(proposal.proposal_boxes.tensor, expected_proposal_box.tensor)
+            )
+            self.assertTrue(torch.allclose(proposal.objectness_logits, expected_objectness_logit))
+
+    def test_rrpn(self):
+        torch.manual_seed(121)
+        cfg = get_cfg()
+        cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN"
+        cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator"
+        cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
+        cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1]]
+        cfg.MODEL.ANCHOR_GENERATOR.ANGLES = [[0, 60]]
+        cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1)
+        cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead"
+        backbone = build_backbone(cfg)
+        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
+        num_images = 2
+        images_tensor = torch.rand(num_images, 20, 30)
+        image_sizes = [(10, 10), (20, 30)]
+        images = ImageList(images_tensor, image_sizes)
+        image_shape = (15, 15)
+        num_channels = 1024
+        features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
+        gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]], dtype=torch.float32)
+        gt_instances = Instances(image_shape)
+        gt_instances.gt_boxes = RotatedBoxes(gt_boxes)
+        with EventStorage():  # capture events in a new storage to discard them
+            proposals, proposal_losses = proposal_generator(
+                images, features, [gt_instances[0], gt_instances[1]]
+            )
+
+        expected_losses = {
+            "loss_rpn_cls": torch.tensor(0.043263837695121765),
+            "loss_rpn_loc": torch.tensor(0.14432406425476074),
+        }
+        for name in expected_losses.keys():
+            err_msg = "proposal_losses[{}] = {}, expected losses = {}".format(
+                name, proposal_losses[name], expected_losses[name]
+            )
+            self.assertTrue(torch.allclose(proposal_losses[name], expected_losses[name]), err_msg)
+
+        expected_proposal_boxes = [
+            RotatedBoxes(
+                torch.tensor(
+                    [
+                        [0.60189795, 1.24095452, 61.98131943, 18.03621292, -4.07244873],
+                        [15.64940453, 1.69624567, 59.59749603, 16.34339333, 2.62692475],
+                        [-3.02982378, -2.69752932, 67.90952301, 59.62455750, 59.97010040],
+                        [16.71863365, 1.98309708, 35.61507797, 32.81484985, 62.92267227],
+                        [0.49432933, -7.92979717, 67.77606201, 62.93098450, -1.85656738],
+                        [8.00880814, 1.36017394, 121.81007385, 32.74150467, 50.44297409],
+                        [16.44299889, -4.82221127, 63.39775848, 61.22503662, 54.12270737],
+                        [5.00000000, 5.00000000, 10.00000000, 10.00000000, -0.76943970],
+                        [17.64130402, -0.98095351, 61.40377808, 16.28918839, 55.53118134],
+                        [0.13016054, 4.60568953, 35.80157471, 32.30180359, 62.52872086],
+                        [-4.26460743, 0.39604485, 124.30079651, 31.84611320, -1.58203125],
+                        [7.52815342, -0.91636634, 62.39784622, 15.45565224, 60.79549789],
+                    ]
+                )
+            ),
+            RotatedBoxes(
+                torch.tensor(
+                    [
+                        [0.07734215, 0.81635046, 65.33510590, 17.34688377, -1.51821899],
+                        [-3.41833067, -3.11320257, 64.17595673, 60.55617905, 58.27033234],
+                        [20.67383385, -6.16561556, 63.60531998, 62.52315903, 54.85546494],
+                        [15.00000000, 10.00000000, 30.00000000, 20.00000000, -0.18218994],
+                        [9.22646523, -6.84775209, 62.09895706, 65.46472931, -2.74307251],
+                        [15.00000000, 4.93451595, 30.00000000, 9.86903191, -0.60272217],
+                        [8.88342094, 2.65560246, 120.95362854, 32.45022202, 55.75970078],
+                        [16.39088631, 2.33887148, 34.78761292, 35.61492920, 60.81977463],
+                        [9.78298569, 10.00000000, 19.56597137, 20.00000000, -0.86660767],
+                        [1.28576660, 5.49873352, 34.93610382, 33.22600174, 60.51599884],
+                        [17.58912468, -1.63270092, 62.96052551, 16.45713997, 52.91245270],
+                        [5.64749718, -1.90428460, 62.37649155, 16.19474792, 61.09543991],
+                        [0.82255805, 2.34931135, 118.83985901, 32.83671188, 56.50753784],
+                        [-5.33874989, 1.64404404, 125.28501892, 33.35424042, -2.80731201],
+                    ]
+                )
+            ),
+        ]
+
+        expected_objectness_logits = [
+            torch.tensor(
+                [
+                    0.10111768,
+                    0.09112845,
+                    0.08466332,
+                    0.07589971,
+                    0.06650183,
+                    0.06350251,
+                    0.04299347,
+                    0.01864817,
+                    0.00986163,
+                    0.00078543,
+                    -0.04573630,
+                    -0.04799230,
+                ]
+            ),
+            torch.tensor(
+                [
+                    0.11373727,
+                    0.09377633,
+                    0.05281663,
+                    0.05143715,
+                    0.04040275,
+                    0.03250912,
+                    0.01307789,
+                    0.01177734,
+                    0.00038105,
+                    -0.00540255,
+                    -0.01194804,
+                    -0.01461012,
+                    -0.03061717,
+                    -0.03599222,
+                ]
+            ),
+        ]
+
+        torch.set_printoptions(precision=8, sci_mode=False)
+
+        for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip(
+            proposals, expected_proposal_boxes, image_sizes, expected_objectness_logits
+        ):
+            self.assertEqual(len(proposal), len(expected_proposal_box))
+            self.assertEqual(proposal.image_size, im_size)
+            # It seems that there's some randomness in the result across different machines:
+            # This test can be run on a local machine for 100 times with exactly the same result,
+            # However, a different machine might produce slightly different results,
+            # thus the atol here.
+            err_msg = "computed proposal boxes = {}, expected {}".format(
+                proposal.proposal_boxes.tensor, expected_proposal_box.tensor
+            )
+            self.assertTrue(
+                torch.allclose(
+                    proposal.proposal_boxes.tensor, expected_proposal_box.tensor, atol=1e-5
+                ),
+                err_msg,
+            )
+
+            err_msg = "computed objectness logits = {}, expected {}".format(
+                proposal.objectness_logits, expected_objectness_logit
+            )
+            self.assertTrue(
+                torch.allclose(proposal.objectness_logits, expected_objectness_logit, atol=1e-5),
+                err_msg,
+            )
+
+    def test_rpn_proposals_inf(self):
+        N, Hi, Wi, A = 3, 3, 3, 3
+        proposals = [torch.rand(N, Hi * Wi * A, 4)]
+        pred_logits = [torch.rand(N, Hi * Wi * A)]
+        pred_logits[0][1][3:5].fill_(float("inf"))
+        images = ImageList.from_tensors([torch.rand(3, 10, 10)] * 3)
+        find_top_rpn_proposals(proposals, pred_logits, images, 0.5, 1000, 1000, 0, False)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/__init__.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_boxes.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d33c3bf9b7471c7e4382bc9e66c26e1fb60e29f
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_boxes.py
@@ -0,0 +1,182 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import json
+import math
+import numpy as np
+import unittest
+import torch
+
+from detectron2.structures import Boxes, BoxMode, pairwise_iou
+
+
+class TestBoxMode(unittest.TestCase):
+    def _convert_xy_to_wh(self, x):
+        return BoxMode.convert(x, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+
+    def _convert_xywha_to_xyxy(self, x):
+        return BoxMode.convert(x, BoxMode.XYWHA_ABS, BoxMode.XYXY_ABS)
+
+    def _convert_xywh_to_xywha(self, x):
+        return BoxMode.convert(x, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS)
+
+    def test_box_convert_list(self):
+        for tp in [list, tuple]:
+            box = tp([5.0, 5.0, 10.0, 10.0])
+            output = self._convert_xy_to_wh(box)
+            self.assertIsInstance(output, tp)
+            self.assertIsInstance(output[0], float)
+            self.assertEqual(output, tp([5.0, 5.0, 5.0, 5.0]))
+
+            with self.assertRaises(Exception):
+                self._convert_xy_to_wh([box])
+
+    def test_box_convert_array(self):
+        box = np.asarray([[5, 5, 10, 10], [1, 1, 2, 3]])
+        output = self._convert_xy_to_wh(box)
+        self.assertEqual(output.dtype, box.dtype)
+        self.assertEqual(output.shape, box.shape)
+        self.assertTrue((output[0] == [5, 5, 5, 5]).all())
+        self.assertTrue((output[1] == [1, 1, 1, 2]).all())
+
+    def test_box_convert_cpu_tensor(self):
+        box = torch.tensor([[5, 5, 10, 10], [1, 1, 2, 3]])
+        output = self._convert_xy_to_wh(box)
+        self.assertEqual(output.dtype, box.dtype)
+        self.assertEqual(output.shape, box.shape)
+        output = output.numpy()
+        self.assertTrue((output[0] == [5, 5, 5, 5]).all())
+        self.assertTrue((output[1] == [1, 1, 1, 2]).all())
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_box_convert_cuda_tensor(self):
+        box = torch.tensor([[5, 5, 10, 10], [1, 1, 2, 3]]).cuda()
+        output = self._convert_xy_to_wh(box)
+        self.assertEqual(output.dtype, box.dtype)
+        self.assertEqual(output.shape, box.shape)
+        self.assertEqual(output.device, box.device)
+        output = output.cpu().numpy()
+        self.assertTrue((output[0] == [5, 5, 5, 5]).all())
+        self.assertTrue((output[1] == [1, 1, 1, 2]).all())
+
+    def test_box_convert_xywha_to_xyxy_list(self):
+        for tp in [list, tuple]:
+            box = tp([50, 50, 30, 20, 0])
+            output = self._convert_xywha_to_xyxy(box)
+            self.assertIsInstance(output, tp)
+            self.assertEqual(output, tp([35, 40, 65, 60]))
+
+            with self.assertRaises(Exception):
+                self._convert_xywha_to_xyxy([box])
+
+    def test_box_convert_xywha_to_xyxy_array(self):
+        for dtype in [np.float64, np.float32]:
+            box = np.asarray(
+                [
+                    [50, 50, 30, 20, 0],
+                    [50, 50, 30, 20, 90],
+                    [1, 1, math.sqrt(2), math.sqrt(2), -45],
+                ],
+                dtype=dtype,
+            )
+            output = self._convert_xywha_to_xyxy(box)
+            self.assertEqual(output.dtype, box.dtype)
+            expected = np.asarray([[35, 40, 65, 60], [40, 35, 60, 65], [0, 0, 2, 2]], dtype=dtype)
+            self.assertTrue(np.allclose(output, expected, atol=1e-6), "output={}".format(output))
+
+    def test_box_convert_xywha_to_xyxy_tensor(self):
+        for dtype in [torch.float32, torch.float64]:
+            box = torch.tensor(
+                [
+                    [50, 50, 30, 20, 0],
+                    [50, 50, 30, 20, 90],
+                    [1, 1, math.sqrt(2), math.sqrt(2), -45],
+                ],
+                dtype=dtype,
+            )
+            output = self._convert_xywha_to_xyxy(box)
+            self.assertEqual(output.dtype, box.dtype)
+            expected = torch.tensor([[35, 40, 65, 60], [40, 35, 60, 65], [0, 0, 2, 2]], dtype=dtype)
+
+            self.assertTrue(torch.allclose(output, expected, atol=1e-6), "output={}".format(output))
+
+    def test_box_convert_xywh_to_xywha_list(self):
+        for tp in [list, tuple]:
+            box = tp([50, 50, 30, 20])
+            output = self._convert_xywh_to_xywha(box)
+            self.assertIsInstance(output, tp)
+            self.assertEqual(output, tp([65, 60, 30, 20, 0]))
+
+            with self.assertRaises(Exception):
+                self._convert_xywh_to_xywha([box])
+
+    def test_box_convert_xywh_to_xywha_array(self):
+        for dtype in [np.float64, np.float32]:
+            box = np.asarray([[30, 40, 70, 60], [30, 40, 60, 70], [-1, -1, 2, 2]], dtype=dtype)
+            output = self._convert_xywh_to_xywha(box)
+            self.assertEqual(output.dtype, box.dtype)
+            expected = np.asarray(
+                [[65, 70, 70, 60, 0], [60, 75, 60, 70, 0], [0, 0, 2, 2, 0]], dtype=dtype
+            )
+            self.assertTrue(np.allclose(output, expected, atol=1e-6), "output={}".format(output))
+
+    def test_box_convert_xywh_to_xywha_tensor(self):
+        for dtype in [torch.float32, torch.float64]:
+            box = torch.tensor([[30, 40, 70, 60], [30, 40, 60, 70], [-1, -1, 2, 2]], dtype=dtype)
+            output = self._convert_xywh_to_xywha(box)
+            self.assertEqual(output.dtype, box.dtype)
+            expected = torch.tensor(
+                [[65, 70, 70, 60, 0], [60, 75, 60, 70, 0], [0, 0, 2, 2, 0]], dtype=dtype
+            )
+
+            self.assertTrue(torch.allclose(output, expected, atol=1e-6), "output={}".format(output))
+
+    def test_json_serializable(self):
+        payload = {"box_mode": BoxMode.XYWH_REL}
+        try:
+            json.dumps(payload)
+        except Exception:
+            self.fail("JSON serialization failed")
+
+    def test_json_deserializable(self):
+        payload = '{"box_mode": 2}'
+        obj = json.loads(payload)
+        try:
+            obj["box_mode"] = BoxMode(obj["box_mode"])
+        except Exception:
+            self.fail("JSON deserialization failed")
+
+
+class TestBoxIOU(unittest.TestCase):
+    def test_pairwise_iou(self):
+        boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]])
+
+        boxes2 = torch.tensor(
+            [
+                [0.0, 0.0, 1.0, 1.0],
+                [0.0, 0.0, 0.5, 1.0],
+                [0.0, 0.0, 1.0, 0.5],
+                [0.0, 0.0, 0.5, 0.5],
+                [0.5, 0.5, 1.0, 1.0],
+                [0.5, 0.5, 1.5, 1.5],
+            ]
+        )
+
+        expected_ious = torch.tensor(
+            [
+                [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)],
+                [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)],
+            ]
+        )
+
+        ious = pairwise_iou(Boxes(boxes1), Boxes(boxes2))
+
+        self.assertTrue(torch.allclose(ious, expected_ious))
+
+
+class TestBoxes(unittest.TestCase):
+    def test_empty_cat(self):
+        x = Boxes.cat([])
+        self.assertTrue(x.tensor.shape, (0, 4))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_imagelist.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_imagelist.py
new file mode 100644
index 0000000000000000000000000000000000000000..abeb35569ddc34a618735f4989dfbfae23d47bc1
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_imagelist.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import unittest
+from typing import Sequence
+import torch
+
+from detectron2.structures import ImageList
+
+
+class TestImageList(unittest.TestCase):
+    def test_imagelist_padding_shape(self):
+        class TensorToImageList(torch.nn.Module):
+            def forward(self, tensors: Sequence[torch.Tensor]):
+                return ImageList.from_tensors(tensors, 4).tensor
+
+        func = torch.jit.trace(
+            TensorToImageList(), ([torch.ones((3, 10, 10), dtype=torch.float32)],)
+        )
+        ret = func([torch.ones((3, 15, 20), dtype=torch.float32)])
+        self.assertEqual(list(ret.shape), [1, 3, 16, 20], str(ret.shape))
+
+        func = torch.jit.trace(
+            TensorToImageList(),
+            (
+                [
+                    torch.ones((3, 16, 10), dtype=torch.float32),
+                    torch.ones((3, 13, 11), dtype=torch.float32),
+                ],
+            ),
+        )
+        ret = func(
+            [
+                torch.ones((3, 25, 20), dtype=torch.float32),
+                torch.ones((3, 10, 10), dtype=torch.float32),
+            ]
+        )
+        # does not support calling with different #images
+        self.assertEqual(list(ret.shape), [2, 3, 28, 20], str(ret.shape))
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_instances.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_instances.py
new file mode 100644
index 0000000000000000000000000000000000000000..79c5249217633d3f144d02f14d11f32d1d4be7c9
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_instances.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import unittest
+import torch
+
+from detectron2.structures import Instances
+
+
+class TestInstancesIndexing(unittest.TestCase):
+    def test_int_indexing(self):
+        attr1 = torch.tensor([[0.0, 0.0, 1.0], [0.0, 0.0, 0.5], [0.0, 0.0, 1.0], [0.0, 0.5, 0.5]])
+        attr2 = torch.tensor([0.1, 0.2, 0.3, 0.4])
+        instances = Instances((100, 100))
+        instances.attr1 = attr1
+        instances.attr2 = attr2
+        for i in range(-len(instances), len(instances)):
+            inst = instances[i]
+            self.assertEqual((inst.attr1 == attr1[i]).all(), True)
+            self.assertEqual((inst.attr2 == attr2[i]).all(), True)
+
+        self.assertRaises(IndexError, lambda: instances[len(instances)])
+        self.assertRaises(IndexError, lambda: instances[-len(instances) - 1])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_rotated_boxes.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_rotated_boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..575ac480e39d7406e55f4ff45b867e6f5c3796a0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/structures/test_rotated_boxes.py
@@ -0,0 +1,357 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from __future__ import absolute_import, division, print_function, unicode_literals
+import logging
+import math
+import random
+import unittest
+import torch
+from fvcore.common.benchmark import benchmark
+
+from detectron2.layers.rotated_boxes import pairwise_iou_rotated
+from detectron2.structures.boxes import Boxes
+from detectron2.structures.rotated_boxes import RotatedBoxes, pairwise_iou
+
+logger = logging.getLogger(__name__)
+
+
+class TestRotatedBoxesLayer(unittest.TestCase):
+    def test_iou_0_dim_cpu(self):
+        boxes1 = torch.rand(0, 5, dtype=torch.float32)
+        boxes2 = torch.rand(10, 5, dtype=torch.float32)
+        expected_ious = torch.zeros(0, 10, dtype=torch.float32)
+        ious = pairwise_iou_rotated(boxes1, boxes2)
+        self.assertTrue(torch.allclose(ious, expected_ious))
+
+        boxes1 = torch.rand(10, 5, dtype=torch.float32)
+        boxes2 = torch.rand(0, 5, dtype=torch.float32)
+        expected_ious = torch.zeros(10, 0, dtype=torch.float32)
+        ious = pairwise_iou_rotated(boxes1, boxes2)
+        self.assertTrue(torch.allclose(ious, expected_ious))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_0_dim_cuda(self):
+        boxes1 = torch.rand(0, 5, dtype=torch.float32)
+        boxes2 = torch.rand(10, 5, dtype=torch.float32)
+        expected_ious = torch.zeros(0, 10, dtype=torch.float32)
+        ious_cuda = pairwise_iou_rotated(boxes1.cuda(), boxes2.cuda())
+        self.assertTrue(torch.allclose(ious_cuda.cpu(), expected_ious))
+
+        boxes1 = torch.rand(10, 5, dtype=torch.float32)
+        boxes2 = torch.rand(0, 5, dtype=torch.float32)
+        expected_ious = torch.zeros(10, 0, dtype=torch.float32)
+        ious_cuda = pairwise_iou_rotated(boxes1.cuda(), boxes2.cuda())
+        self.assertTrue(torch.allclose(ious_cuda.cpu(), expected_ious))
+
+    def test_iou_half_overlap_cpu(self):
+        boxes1 = torch.tensor([[0.5, 0.5, 1.0, 1.0, 0.0]], dtype=torch.float32)
+        boxes2 = torch.tensor([[0.25, 0.5, 0.5, 1.0, 0.0]], dtype=torch.float32)
+        expected_ious = torch.tensor([[0.5]], dtype=torch.float32)
+        ious = pairwise_iou_rotated(boxes1, boxes2)
+        self.assertTrue(torch.allclose(ious, expected_ious))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_half_overlap_cuda(self):
+        boxes1 = torch.tensor([[0.5, 0.5, 1.0, 1.0, 0.0]], dtype=torch.float32)
+        boxes2 = torch.tensor([[0.25, 0.5, 0.5, 1.0, 0.0]], dtype=torch.float32)
+        expected_ious = torch.tensor([[0.5]], dtype=torch.float32)
+        ious_cuda = pairwise_iou_rotated(boxes1.cuda(), boxes2.cuda())
+        self.assertTrue(torch.allclose(ious_cuda.cpu(), expected_ious))
+
+    def test_iou_precision(self):
+        for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            boxes1 = torch.tensor([[565, 565, 10, 10.0, 0]], dtype=torch.float32, device=device)
+            boxes2 = torch.tensor([[565, 565, 10, 8.3, 0]], dtype=torch.float32, device=device)
+            iou = 8.3 / 10.0
+            expected_ious = torch.tensor([[iou]], dtype=torch.float32)
+            ious = pairwise_iou_rotated(boxes1, boxes2)
+            self.assertTrue(torch.allclose(ious.cpu(), expected_ious))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_too_many_boxes_cuda(self):
+        s1, s2 = 5, 1289035
+        boxes1 = torch.zeros(s1, 5)
+        boxes2 = torch.zeros(s2, 5)
+        ious_cuda = pairwise_iou_rotated(boxes1.cuda(), boxes2.cuda())
+        self.assertTupleEqual(tuple(ious_cuda.shape), (s1, s2))
+
+    def test_iou_extreme(self):
+        # Cause floating point issues in cuda kernels (#1266)
+        for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            boxes1 = torch.tensor([[160.0, 153.0, 230.0, 23.0, -37.0]], device=device)
+            boxes2 = torch.tensor(
+                [
+                    [
+                        -1.117407639806935e17,
+                        1.3858420478349148e18,
+                        1000.0000610351562,
+                        1000.0000610351562,
+                        1612.0,
+                    ]
+                ],
+                device=device,
+            )
+            ious = pairwise_iou_rotated(boxes1, boxes2)
+            self.assertTrue(ious.min() >= 0, ious)
+
+
+class TestRotatedBoxesStructure(unittest.TestCase):
+    def test_clip_area_0_degree(self):
+        for _ in range(50):
+            num_boxes = 100
+            boxes_5d = torch.zeros(num_boxes, 5)
+            boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+            boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+            boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+            boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+            # Convert from (x_ctr, y_ctr, w, h, 0) to  (x1, y1, x2, y2)
+            boxes_4d = torch.zeros(num_boxes, 4)
+            boxes_4d[:, 0] = boxes_5d[:, 0] - boxes_5d[:, 2] / 2.0
+            boxes_4d[:, 1] = boxes_5d[:, 1] - boxes_5d[:, 3] / 2.0
+            boxes_4d[:, 2] = boxes_5d[:, 0] + boxes_5d[:, 2] / 2.0
+            boxes_4d[:, 3] = boxes_5d[:, 1] + boxes_5d[:, 3] / 2.0
+
+            image_size = (500, 600)
+            test_boxes_4d = Boxes(boxes_4d)
+            test_boxes_5d = RotatedBoxes(boxes_5d)
+            # Before clip
+            areas_4d = test_boxes_4d.area()
+            areas_5d = test_boxes_5d.area()
+            self.assertTrue(torch.allclose(areas_4d, areas_5d, atol=1e-1, rtol=1e-5))
+            # After clip
+            test_boxes_4d.clip(image_size)
+            test_boxes_5d.clip(image_size)
+            areas_4d = test_boxes_4d.area()
+            areas_5d = test_boxes_5d.area()
+            self.assertTrue(torch.allclose(areas_4d, areas_5d, atol=1e-1, rtol=1e-5))
+
+    def test_clip_area_arbitrary_angle(self):
+        num_boxes = 100
+        boxes_5d = torch.zeros(num_boxes, 5)
+        boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+        boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+        boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+        boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+        boxes_5d[:, 4] = torch.FloatTensor(num_boxes).uniform_(-1800, 1800)
+        clip_angle_threshold = random.uniform(0, 180)
+
+        image_size = (500, 600)
+        test_boxes_5d = RotatedBoxes(boxes_5d)
+        # Before clip
+        areas_before = test_boxes_5d.area()
+        # After clip
+        test_boxes_5d.clip(image_size, clip_angle_threshold)
+        areas_diff = test_boxes_5d.area() - areas_before
+
+        # the areas should only decrease after clipping
+        self.assertTrue(torch.all(areas_diff <= 0))
+        # whenever the box is clipped (thus the area shrinks),
+        # the angle for the box must be within the clip_angle_threshold
+        # Note that the clip function will normalize the angle range
+        # to be within (-180, 180]
+        self.assertTrue(
+            torch.all(torch.abs(boxes_5d[:, 4][torch.where(areas_diff < 0)]) < clip_angle_threshold)
+        )
+
+    def test_normalize_angles(self):
+        # torch.manual_seed(0)
+        for _ in range(50):
+            num_boxes = 100
+            boxes_5d = torch.zeros(num_boxes, 5)
+            boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+            boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+            boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+            boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+            boxes_5d[:, 4] = torch.FloatTensor(num_boxes).uniform_(-1800, 1800)
+            rotated_boxes = RotatedBoxes(boxes_5d)
+            normalized_boxes = rotated_boxes.clone()
+            normalized_boxes.normalize_angles()
+            self.assertTrue(torch.all(normalized_boxes.tensor[:, 4] >= -180))
+            self.assertTrue(torch.all(normalized_boxes.tensor[:, 4] < 180))
+            # x, y, w, h should not change
+            self.assertTrue(torch.allclose(boxes_5d[:, :4], normalized_boxes.tensor[:, :4]))
+            # the cos/sin values of the angles should stay the same
+
+            self.assertTrue(
+                torch.allclose(
+                    torch.cos(boxes_5d[:, 4] * math.pi / 180),
+                    torch.cos(normalized_boxes.tensor[:, 4] * math.pi / 180),
+                    atol=1e-5,
+                )
+            )
+
+            self.assertTrue(
+                torch.allclose(
+                    torch.sin(boxes_5d[:, 4] * math.pi / 180),
+                    torch.sin(normalized_boxes.tensor[:, 4] * math.pi / 180),
+                    atol=1e-5,
+                )
+            )
+
+    def test_pairwise_iou_0_degree(self):
+        for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            boxes1 = torch.tensor(
+                [[0.5, 0.5, 1.0, 1.0, 0.0], [0.5, 0.5, 1.0, 1.0, 0.0]],
+                dtype=torch.float32,
+                device=device,
+            )
+            boxes2 = torch.tensor(
+                [
+                    [0.5, 0.5, 1.0, 1.0, 0.0],
+                    [0.25, 0.5, 0.5, 1.0, 0.0],
+                    [0.5, 0.25, 1.0, 0.5, 0.0],
+                    [0.25, 0.25, 0.5, 0.5, 0.0],
+                    [0.75, 0.75, 0.5, 0.5, 0.0],
+                    [1.0, 1.0, 1.0, 1.0, 0.0],
+                ],
+                dtype=torch.float32,
+                device=device,
+            )
+            expected_ious = torch.tensor(
+                [
+                    [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)],
+                    [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)],
+                ],
+                dtype=torch.float32,
+                device=device,
+            )
+            ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2))
+            self.assertTrue(torch.allclose(ious, expected_ious))
+
+    def test_pairwise_iou_45_degrees(self):
+        for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            boxes1 = torch.tensor(
+                [
+                    [1, 1, math.sqrt(2), math.sqrt(2), 45],
+                    [1, 1, 2 * math.sqrt(2), 2 * math.sqrt(2), -45],
+                ],
+                dtype=torch.float32,
+                device=device,
+            )
+            boxes2 = torch.tensor([[1, 1, 2, 2, 0]], dtype=torch.float32, device=device)
+            expected_ious = torch.tensor([[0.5], [0.5]], dtype=torch.float32, device=device)
+            ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2))
+            self.assertTrue(torch.allclose(ious, expected_ious))
+
+    def test_pairwise_iou_orthogonal(self):
+        for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            boxes1 = torch.tensor([[5, 5, 10, 6, 55]], dtype=torch.float32, device=device)
+            boxes2 = torch.tensor([[5, 5, 10, 6, -35]], dtype=torch.float32, device=device)
+            iou = (6.0 * 6.0) / (6.0 * 6.0 + 4.0 * 6.0 + 4.0 * 6.0)
+            expected_ious = torch.tensor([[iou]], dtype=torch.float32, device=device)
+            ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2))
+            self.assertTrue(torch.allclose(ious, expected_ious))
+
+    def test_pairwise_iou_large_close_boxes(self):
+        for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            boxes1 = torch.tensor(
+                [[299.500000, 417.370422, 600.000000, 364.259186, 27.1828]],
+                dtype=torch.float32,
+                device=device,
+            )
+            boxes2 = torch.tensor(
+                [[299.500000, 417.370422, 600.000000, 364.259155, 27.1828]],
+                dtype=torch.float32,
+                device=device,
+            )
+            iou = 364.259155 / 364.259186
+            expected_ious = torch.tensor([[iou]], dtype=torch.float32, device=device)
+            ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2))
+            self.assertTrue(torch.allclose(ious, expected_ious))
+
+    def test_pairwise_iou_many_boxes(self):
+        for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            num_boxes1 = 100
+            num_boxes2 = 200
+            boxes1 = torch.stack(
+                [
+                    torch.tensor(
+                        [5 + 20 * i, 5 + 20 * i, 10, 10, 0], dtype=torch.float32, device=device
+                    )
+                    for i in range(num_boxes1)
+                ]
+            )
+            boxes2 = torch.stack(
+                [
+                    torch.tensor(
+                        [5 + 20 * i, 5 + 20 * i, 10, 1 + 9 * i / num_boxes2, 0],
+                        dtype=torch.float32,
+                        device=device,
+                    )
+                    for i in range(num_boxes2)
+                ]
+            )
+            expected_ious = torch.zeros(num_boxes1, num_boxes2, dtype=torch.float32, device=device)
+            for i in range(min(num_boxes1, num_boxes2)):
+                expected_ious[i][i] = (1 + 9 * i / num_boxes2) / 10.0
+            ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2))
+            self.assertTrue(torch.allclose(ious, expected_ious))
+
+    def test_pairwise_iou_issue1207_simplified(self):
+        for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            # Simplified test case of D2-issue-1207
+            boxes1 = torch.tensor([[3, 3, 8, 2, -45.0]], device=device)
+            boxes2 = torch.tensor([[6, 0, 8, 2, -45.0]], device=device)
+            iou = 0.0
+            expected_ious = torch.tensor([[iou]], dtype=torch.float32, device=device)
+
+            ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2))
+            self.assertTrue(torch.allclose(ious, expected_ious))
+
+    def test_pairwise_iou_issue1207(self):
+        for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []:
+            # The original test case in D2-issue-1207
+            boxes1 = torch.tensor([[160.0, 153.0, 230.0, 23.0, -37.0]], device=device)
+            boxes2 = torch.tensor([[190.0, 127.0, 80.0, 21.0, -46.0]], device=device)
+
+            iou = 0.0
+            expected_ious = torch.tensor([[iou]], dtype=torch.float32, device=device)
+
+            ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2))
+            self.assertTrue(torch.allclose(ious, expected_ious))
+
+    def test_empty_cat(self):
+        x = RotatedBoxes.cat([])
+        self.assertTrue(x.tensor.shape, (0, 5))
+
+
+def benchmark_rotated_iou():
+    num_boxes1 = 200
+    num_boxes2 = 500
+    boxes1 = torch.stack(
+        [
+            torch.tensor([5 + 20 * i, 5 + 20 * i, 10, 10, 0], dtype=torch.float32)
+            for i in range(num_boxes1)
+        ]
+    )
+    boxes2 = torch.stack(
+        [
+            torch.tensor(
+                [5 + 20 * i, 5 + 20 * i, 10, 1 + 9 * i / num_boxes2, 0], dtype=torch.float32
+            )
+            for i in range(num_boxes2)
+        ]
+    )
+
+    def func(dev, n=1):
+        b1 = boxes1.to(device=dev)
+        b2 = boxes2.to(device=dev)
+
+        def bench():
+            for _ in range(n):
+                pairwise_iou_rotated(b1, b2)
+            if dev.type == "cuda":
+                torch.cuda.synchronize()
+
+        return bench
+
+    # only run it once per timed loop, since it's slow
+    args = [{"dev": torch.device("cpu"), "n": 1}]
+    if torch.cuda.is_available():
+        args.append({"dev": torch.device("cuda"), "n": 10})
+
+    benchmark(func, "rotated_iou", args, warmup_iters=3)
+
+
+if __name__ == "__main__":
+    unittest.main()
+    benchmark_rotated_iou()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/test_checkpoint.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..725b488fdaec5d2b3a5c6d11c11d2c362453a2a4
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_checkpoint.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import unittest
+from collections import OrderedDict
+import torch
+from torch import nn
+
+from detectron2.checkpoint.c2_model_loading import align_and_update_state_dicts
+from detectron2.utils.logger import setup_logger
+
+
+class TestCheckpointer(unittest.TestCase):
+    def setUp(self):
+        setup_logger()
+
+    def create_complex_model(self):
+        m = nn.Module()
+        m.block1 = nn.Module()
+        m.block1.layer1 = nn.Linear(2, 3)
+        m.layer2 = nn.Linear(3, 2)
+        m.res = nn.Module()
+        m.res.layer2 = nn.Linear(3, 2)
+
+        state_dict = OrderedDict()
+        state_dict["layer1.weight"] = torch.rand(3, 2)
+        state_dict["layer1.bias"] = torch.rand(3)
+        state_dict["layer2.weight"] = torch.rand(2, 3)
+        state_dict["layer2.bias"] = torch.rand(2)
+        state_dict["res.layer2.weight"] = torch.rand(2, 3)
+        state_dict["res.layer2.bias"] = torch.rand(2)
+        return m, state_dict
+
+    def test_complex_model_loaded(self):
+        for add_data_parallel in [False, True]:
+            model, state_dict = self.create_complex_model()
+            if add_data_parallel:
+                model = nn.DataParallel(model)
+            model_sd = model.state_dict()
+
+            align_and_update_state_dicts(model_sd, state_dict)
+            for loaded, stored in zip(model_sd.values(), state_dict.values()):
+                # different tensor references
+                self.assertFalse(id(loaded) == id(stored))
+                # same content
+                self.assertTrue(loaded.equal(stored))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/test_config.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..650bdf2c42107c7031709653783cb2f3043e1bdf
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_config.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+import os
+import tempfile
+import unittest
+import torch
+
+from detectron2.config import configurable, downgrade_config, get_cfg, upgrade_config
+from detectron2.layers import ShapeSpec
+
+_V0_CFG = """
+MODEL:
+  RPN_HEAD:
+    NAME: "TEST"
+VERSION: 0
+"""
+
+_V1_CFG = """
+MODEL:
+  WEIGHT: "/path/to/weight"
+"""
+
+
+class TestConfigVersioning(unittest.TestCase):
+    def test_upgrade_downgrade_consistency(self):
+        cfg = get_cfg()
+        # check that custom is preserved
+        cfg.USER_CUSTOM = 1
+
+        down = downgrade_config(cfg, to_version=0)
+        up = upgrade_config(down)
+        self.assertTrue(up == cfg)
+
+    def _merge_cfg_str(self, cfg, merge_str):
+        f = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
+        try:
+            f.write(merge_str)
+            f.close()
+            cfg.merge_from_file(f.name)
+        finally:
+            os.remove(f.name)
+        return cfg
+
+    def test_auto_upgrade(self):
+        cfg = get_cfg()
+        latest_ver = cfg.VERSION
+        cfg.USER_CUSTOM = 1
+
+        self._merge_cfg_str(cfg, _V0_CFG)
+
+        self.assertEqual(cfg.MODEL.RPN.HEAD_NAME, "TEST")
+        self.assertEqual(cfg.VERSION, latest_ver)
+
+    def test_guess_v1(self):
+        cfg = get_cfg()
+        latest_ver = cfg.VERSION
+        self._merge_cfg_str(cfg, _V1_CFG)
+        self.assertEqual(cfg.VERSION, latest_ver)
+
+
+class _TestClassA(torch.nn.Module):
+    @configurable
+    def __init__(self, arg1, arg2, arg3=3):
+        super().__init__()
+        self.arg1 = arg1
+        self.arg2 = arg2
+        self.arg3 = arg3
+        assert arg1 == 1
+        assert arg2 == 2
+        assert arg3 == 3
+
+    @classmethod
+    def from_config(cls, cfg):
+        args = {"arg1": cfg.ARG1, "arg2": cfg.ARG2}
+        return args
+
+
+class _TestClassB(_TestClassA):
+    @configurable
+    def __init__(self, input_shape, arg1, arg2, arg3=3):
+        """
+        Doc of _TestClassB
+        """
+        assert input_shape == "shape"
+        super().__init__(arg1, arg2, arg3)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):  # test extra positional arg in from_config
+        args = {"arg1": cfg.ARG1, "arg2": cfg.ARG2}
+        args["input_shape"] = input_shape
+        return args
+
+
+class _LegacySubClass(_TestClassB):
+    # an old subclass written in cfg style
+    def __init__(self, cfg, input_shape, arg4=4):
+        super().__init__(cfg, input_shape)
+        assert self.arg1 == 1
+        assert self.arg2 == 2
+        assert self.arg3 == 3
+
+
+class _NewSubClassNewInit(_TestClassB):
+    # test new subclass with a new __init__
+    @configurable
+    def __init__(self, input_shape, arg4=4, **kwargs):
+        super().__init__(input_shape, **kwargs)
+        assert self.arg1 == 1
+        assert self.arg2 == 2
+        assert self.arg3 == 3
+
+
+class _LegacySubClassNotCfg(_TestClassB):
+    # an old subclass written in cfg style, but argument is not called "cfg"
+    def __init__(self, config, input_shape):
+        super().__init__(config, input_shape)
+        assert self.arg1 == 1
+        assert self.arg2 == 2
+        assert self.arg3 == 3
+
+
+class _TestClassC(_TestClassB):
+    @classmethod
+    def from_config(cls, cfg, input_shape, **kwargs):  # test extra kwarg overwrite
+        args = {"arg1": cfg.ARG1, "arg2": cfg.ARG2}
+        args["input_shape"] = input_shape
+        args.update(kwargs)
+        return args
+
+
+class _TestClassD(_TestClassA):
+    @configurable
+    def __init__(self, input_shape: ShapeSpec, arg1: int, arg2, arg3=3):
+        assert input_shape == "shape"
+        super().__init__(arg1, arg2, arg3)
+
+    # _TestClassA.from_config does not have input_shape args.
+    # Test whether input_shape will be forwarded to __init__
+
+
+class TestConfigurable(unittest.TestCase):
+    def testInitWithArgs(self):
+        _ = _TestClassA(arg1=1, arg2=2, arg3=3)
+        _ = _TestClassB("shape", arg1=1, arg2=2)
+        _ = _TestClassC("shape", arg1=1, arg2=2)
+        _ = _TestClassD("shape", arg1=1, arg2=2, arg3=3)
+
+    def testPatchedAttr(self):
+        self.assertTrue("Doc" in _TestClassB.__init__.__doc__)
+        self.assertEqual(_TestClassD.__init__.__annotations__["arg1"], int)
+
+    def testInitWithCfg(self):
+        cfg = get_cfg()
+        cfg.ARG1 = 1
+        cfg.ARG2 = 2
+        cfg.ARG3 = 3
+        _ = _TestClassA(cfg)
+        _ = _TestClassB(cfg, input_shape="shape")
+        _ = _TestClassC(cfg, input_shape="shape")
+        _ = _TestClassD(cfg, input_shape="shape")
+        _ = _LegacySubClass(cfg, input_shape="shape")
+        _ = _NewSubClassNewInit(cfg, input_shape="shape")
+        _ = _LegacySubClassNotCfg(cfg, input_shape="shape")
+        with self.assertRaises(TypeError):
+            # disallow forwarding positional args to __init__ since it's prone to errors
+            _ = _TestClassD(cfg, "shape")
+
+        # call with kwargs instead
+        _ = _TestClassA(cfg=cfg)
+        _ = _TestClassB(cfg=cfg, input_shape="shape")
+        _ = _TestClassC(cfg=cfg, input_shape="shape")
+        _ = _TestClassD(cfg=cfg, input_shape="shape")
+        _ = _LegacySubClass(cfg=cfg, input_shape="shape")
+        _ = _NewSubClassNewInit(cfg=cfg, input_shape="shape")
+        _ = _LegacySubClassNotCfg(config=cfg, input_shape="shape")
+
+    def testInitWithCfgOverwrite(self):
+        cfg = get_cfg()
+        cfg.ARG1 = 1
+        cfg.ARG2 = 999  # wrong config
+        with self.assertRaises(AssertionError):
+            _ = _TestClassA(cfg, arg3=3)
+
+        # overwrite arg2 with correct config later:
+        _ = _TestClassA(cfg, arg2=2, arg3=3)
+        _ = _TestClassB(cfg, input_shape="shape", arg2=2, arg3=3)
+        _ = _TestClassC(cfg, input_shape="shape", arg2=2, arg3=3)
+        _ = _TestClassD(cfg, input_shape="shape", arg2=2, arg3=3)
+
+        # call with kwargs cfg=cfg instead
+        _ = _TestClassA(cfg=cfg, arg2=2, arg3=3)
+        _ = _TestClassB(cfg=cfg, input_shape="shape", arg2=2, arg3=3)
+        _ = _TestClassC(cfg=cfg, input_shape="shape", arg2=2, arg3=3)
+        _ = _TestClassD(cfg=cfg, input_shape="shape", arg2=2, arg3=3)
+
+    def testInitWithCfgWrongArgs(self):
+        cfg = get_cfg()
+        cfg.ARG1 = 1
+        cfg.ARG2 = 2
+        with self.assertRaises(TypeError):
+            _ = _TestClassB(cfg, "shape", not_exist=1)
+        with self.assertRaises(TypeError):
+            _ = _TestClassC(cfg, "shape", not_exist=1)
+        with self.assertRaises(TypeError):
+            _ = _TestClassD(cfg, "shape", not_exist=1)
+
+    def testBadClass(self):
+        class _BadClass1:
+            @configurable
+            def __init__(self, a=1, b=2):
+                pass
+
+        class _BadClass2:
+            @configurable
+            def __init__(self, a=1, b=2):
+                pass
+
+            def from_config(self, cfg):  # noqa
+                pass
+
+        class _BadClass3:
+            @configurable
+            def __init__(self, a=1, b=2):
+                pass
+
+            # bad name: must be cfg
+            @classmethod
+            def from_config(cls, config):  # noqa
+                pass
+
+        with self.assertRaises(AttributeError):
+            _ = _BadClass1(a=1)
+
+        with self.assertRaises(TypeError):
+            _ = _BadClass2(a=1)
+
+        with self.assertRaises(TypeError):
+            _ = _BadClass3(get_cfg())
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/test_export_caffe2.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_export_caffe2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad989c4a3d11e6675d26ae2690f06d2ffe30d44c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_export_caffe2.py
@@ -0,0 +1,71 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# -*- coding: utf-8 -*-
+
+import copy
+import numpy as np
+import os
+import tempfile
+import unittest
+import cv2
+import torch
+from fvcore.common.file_io import PathManager
+
+from detectron2 import model_zoo
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import DatasetCatalog
+from detectron2.modeling import build_model
+from detectron2.utils.logger import setup_logger
+
+
+@unittest.skipIf(os.environ.get("CIRCLECI"), "Require COCO data and model zoo.")
+class TestCaffe2Export(unittest.TestCase):
+    def setUp(self):
+        setup_logger()
+
+    def _test_model(self, config_path, device="cpu"):
+        # requires extra dependencies
+        from detectron2.export import Caffe2Model, add_export_config, export_caffe2_model
+
+        cfg = get_cfg()
+        cfg.merge_from_file(model_zoo.get_config_file(config_path))
+        cfg = add_export_config(cfg)
+        cfg.MODEL.DEVICE = device
+
+        model = build_model(cfg)
+        DetectionCheckpointer(model).load(model_zoo.get_checkpoint_url(config_path))
+
+        inputs = [{"image": self._get_test_image()}]
+        c2_model = export_caffe2_model(cfg, model, copy.deepcopy(inputs))
+
+        with tempfile.TemporaryDirectory(prefix="detectron2_unittest") as d:
+            c2_model.save_protobuf(d)
+            c2_model.save_graph(os.path.join(d, "test.svg"), inputs=copy.deepcopy(inputs))
+            c2_model = Caffe2Model.load_protobuf(d)
+        c2_model(inputs)[0]["instances"]
+
+    def _get_test_image(self):
+        try:
+            file_name = DatasetCatalog.get("coco_2017_train")[0]["file_name"]
+            assert PathManager.exists(file_name)
+        except Exception:
+            self.skipTest("COCO dataset not available.")
+
+        with PathManager.open(file_name, "rb") as f:
+            buf = f.read()
+        img = cv2.imdecode(np.frombuffer(buf, dtype=np.uint8), cv2.IMREAD_COLOR)
+        assert img is not None, file_name
+        return torch.from_numpy(img.transpose(2, 0, 1))
+
+    def testMaskRCNN(self):
+        self._test_model("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def testMaskRCNNGPU(self):
+        self._test_model("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml", device="cuda")
+
+    def testRetinaNet(self):
+        self._test_model("COCO-Detection/retinanet_R_50_FPN_3x.yaml")
+
+    def testPanopticFPN(self):
+        self._test_model("COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml")
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/test_model_analysis.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_model_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e3f84c9354746fc634aca997abb232424ddebb2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_model_analysis.py
@@ -0,0 +1,58 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+
+import unittest
+import torch
+
+import detectron2.model_zoo as model_zoo
+from detectron2.config import get_cfg
+from detectron2.modeling import build_model
+from detectron2.utils.analysis import flop_count_operators, parameter_count
+
+
+def get_model_zoo(config_path):
+    """
+    Like model_zoo.get, but do not load any weights (even pretrained)
+    """
+    cfg_file = model_zoo.get_config_file(config_path)
+    cfg = get_cfg()
+    cfg.merge_from_file(cfg_file)
+    if not torch.cuda.is_available():
+        cfg.MODEL.DEVICE = "cpu"
+    return build_model(cfg)
+
+
+class RetinaNetTest(unittest.TestCase):
+    def setUp(self):
+        self.model = get_model_zoo("COCO-Detection/retinanet_R_50_FPN_1x.yaml")
+
+    def test_flop(self):
+        # RetinaNet supports flop-counting with random inputs
+        inputs = [{"image": torch.rand(3, 800, 800)}]
+        res = flop_count_operators(self.model, inputs)
+        self.assertTrue(int(res["conv"]), 146)  # 146B flops
+
+    def test_param_count(self):
+        res = parameter_count(self.model)
+        self.assertTrue(res[""], 37915572)
+        self.assertTrue(res["backbone"], 31452352)
+
+
+class FasterRCNNTest(unittest.TestCase):
+    def setUp(self):
+        self.model = get_model_zoo("COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml")
+
+    def test_flop(self):
+        # Faster R-CNN supports flop-counting with random inputs
+        inputs = [{"image": torch.rand(3, 800, 800)}]
+        res = flop_count_operators(self.model, inputs)
+
+        # This only checks flops for backbone & proposal generator
+        # Flops for box head is not conv, and depends on #proposals, which is
+        # almost 0 for random inputs.
+        self.assertTrue(int(res["conv"]), 117)
+
+    def test_param_count(self):
+        res = parameter_count(self.model)
+        self.assertTrue(res[""], 41699936)
+        self.assertTrue(res["backbone"], 26799296)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/test_model_zoo.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_model_zoo.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d16c711af2ab797dab04d0573c2ed70e071ebfd
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_model_zoo.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+
+from detectron2 import model_zoo
+from detectron2.modeling import FPN, GeneralizedRCNN
+
+logger = logging.getLogger(__name__)
+
+
+class TestModelZoo(unittest.TestCase):
+    def test_get_returns_model(self):
+        model = model_zoo.get("Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml", trained=False)
+        self.assertIsInstance(model, GeneralizedRCNN)
+        self.assertIsInstance(model.backbone, FPN)
+
+    def test_get_invalid_model(self):
+        self.assertRaises(RuntimeError, model_zoo.get, "Invalid/config.yaml")
+
+    def test_get_url(self):
+        url = model_zoo.get_checkpoint_url("Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml")
+        self.assertEqual(
+            url,
+            "https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn/138602908/model_final_01ca85.pkl",  # noqa
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tests/test_visualizer.py b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cdeddc6733e25d882bede48a404a1d52c0845de
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tests/test_visualizer.py
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# File:
+
+import numpy as np
+import unittest
+import torch
+
+from detectron2.data import MetadataCatalog
+from detectron2.structures import BoxMode, Instances, RotatedBoxes
+from detectron2.utils.visualizer import Visualizer
+
+
+class TestVisualizer(unittest.TestCase):
+    def _random_data(self):
+        H, W = 100, 100
+        N = 10
+        img = np.random.rand(H, W, 3) * 255
+        boxxy = np.random.rand(N, 2) * (H // 2)
+        boxes = np.concatenate((boxxy, boxxy + H // 2), axis=1)
+
+        def _rand_poly():
+            return np.random.rand(3, 2).flatten() * H
+
+        polygons = [[_rand_poly() for _ in range(np.random.randint(1, 5))] for _ in range(N)]
+
+        mask = np.zeros_like(img[:, :, 0], dtype=np.bool)
+        mask[:10, 10:20] = 1
+
+        labels = [str(i) for i in range(N)]
+        return img, boxes, labels, polygons, [mask] * N
+
+    @property
+    def metadata(self):
+        return MetadataCatalog.get("coco_2017_train")
+
+    def test_draw_dataset_dict(self):
+        img = np.random.rand(512, 512, 3) * 255
+        dic = {
+            "annotations": [
+                {
+                    "bbox": [
+                        368.9946492271106,
+                        330.891438763377,
+                        13.148537455410235,
+                        13.644708680142685,
+                    ],
+                    "bbox_mode": BoxMode.XYWH_ABS,
+                    "category_id": 0,
+                    "iscrowd": 1,
+                    "segmentation": {
+                        "counts": "_jh52m?2N2N2N2O100O10O001N1O2MceP2",
+                        "size": [512, 512],
+                    },
+                }
+            ],
+            "height": 512,
+            "image_id": 1,
+            "width": 512,
+        }
+        v = Visualizer(img, self.metadata)
+        v.draw_dataset_dict(dic)
+
+    def test_overlay_instances(self):
+        img, boxes, labels, polygons, masks = self._random_data()
+
+        v = Visualizer(img, self.metadata)
+        output = v.overlay_instances(masks=polygons, boxes=boxes, labels=labels).get_image()
+        self.assertEqual(output.shape, img.shape)
+
+        # Test 2x scaling
+        v = Visualizer(img, self.metadata, scale=2.0)
+        output = v.overlay_instances(masks=polygons, boxes=boxes, labels=labels).get_image()
+        self.assertEqual(output.shape[0], img.shape[0] * 2)
+
+        # Test overlay masks
+        v = Visualizer(img, self.metadata)
+        output = v.overlay_instances(masks=masks, boxes=boxes, labels=labels).get_image()
+        self.assertEqual(output.shape, img.shape)
+
+    def test_overlay_instances_no_boxes(self):
+        img, boxes, labels, polygons, _ = self._random_data()
+        v = Visualizer(img, self.metadata)
+        v.overlay_instances(masks=polygons, boxes=None, labels=labels).get_image()
+
+    def test_draw_instance_predictions(self):
+        img, boxes, _, _, masks = self._random_data()
+        num_inst = len(boxes)
+        inst = Instances((img.shape[0], img.shape[1]))
+        inst.pred_classes = torch.randint(0, 80, size=(num_inst,))
+        inst.scores = torch.rand(num_inst)
+        inst.pred_boxes = torch.from_numpy(boxes)
+        inst.pred_masks = torch.from_numpy(np.asarray(masks))
+
+        v = Visualizer(img, self.metadata)
+        v.draw_instance_predictions(inst)
+
+    def test_draw_empty_mask_predictions(self):
+        img, boxes, _, _, masks = self._random_data()
+        num_inst = len(boxes)
+        inst = Instances((img.shape[0], img.shape[1]))
+        inst.pred_classes = torch.randint(0, 80, size=(num_inst,))
+        inst.scores = torch.rand(num_inst)
+        inst.pred_boxes = torch.from_numpy(boxes)
+        inst.pred_masks = torch.from_numpy(np.zeros_like(np.asarray(masks)))
+
+        v = Visualizer(img, self.metadata)
+        v.draw_instance_predictions(inst)
+
+    def test_correct_output_shape(self):
+        img = np.random.rand(928, 928, 3) * 255
+        v = Visualizer(img, self.metadata)
+        out = v.output.get_image()
+        self.assertEqual(out.shape, img.shape)
+
+    def test_overlay_rotated_instances(self):
+        H, W = 100, 150
+        img = np.random.rand(H, W, 3) * 255
+        num_boxes = 50
+        boxes_5d = torch.zeros(num_boxes, 5)
+        boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-0.1 * W, 1.1 * W)
+        boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-0.1 * H, 1.1 * H)
+        boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, max(W, H))
+        boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, max(W, H))
+        boxes_5d[:, 4] = torch.FloatTensor(num_boxes).uniform_(-1800, 1800)
+        rotated_boxes = RotatedBoxes(boxes_5d)
+        labels = [str(i) for i in range(num_boxes)]
+
+        v = Visualizer(img, self.metadata)
+        output = v.overlay_instances(boxes=rotated_boxes, labels=labels).get_image()
+        self.assertEqual(output.shape, img.shape)
+
+    def test_draw_no_metadata(self):
+        img, boxes, _, _, masks = self._random_data()
+        num_inst = len(boxes)
+        inst = Instances((img.shape[0], img.shape[1]))
+        inst.pred_classes = torch.randint(0, 80, size=(num_inst,))
+        inst.scores = torch.rand(num_inst)
+        inst.pred_boxes = torch.from_numpy(boxes)
+        inst.pred_masks = torch.from_numpy(np.asarray(masks))
+
+        v = Visualizer(img, MetadataCatalog.get("asdfasdf"))
+        v.draw_instance_predictions(inst)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/README.md b/preprocess/humanparsing/mhp_extension/detectron2/tools/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3733863970218bf8bdf9b32420163f4c858e209e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/README.md
@@ -0,0 +1,45 @@
+
+This directory contains a few scripts that use detectron2.
+
+
+* `train_net.py`
+
+An example training script that's made to train builtin models of detectron2.
+
+For usage, see [GETTING_STARTED.md](../GETTING_STARTED.md).
+
+* `plain_train_net.py`
+
+Similar to `train_net.py`, but implements a training loop instead of using `Trainer`.
+This script includes fewer features but it may be more friendly to hackers.
+
+* `benchmark.py`
+
+Benchmark the training speed, inference speed or data loading speed of a given config.
+
+Usage:
+```
+python benchmark.py --config-file config.yaml --task train/eval/data [optional DDP flags]
+```
+
+* `visualize_json_results.py`
+
+Visualize the json instance detection/segmentation results dumped by `COCOEvalutor` or `LVISEvaluator`
+
+Usage:
+```
+python visualize_json_results.py --input x.json --output dir/ --dataset coco_2017_val
+```
+If not using a builtin dataset, you'll need your own script or modify this script.
+
+* `visualize_data.py`
+
+Visualize ground truth raw annotations or training data (after preprocessing/augmentations).
+
+Usage:
+```
+python visualize_data.py --config-file config.yaml --source annotation/dataloader --output-dir dir/ [--show]
+```
+
+NOTE: the script does not stop by itself when using `--source dataloader` because a training
+dataloader is usually infinite.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/analyze_model.py b/preprocess/humanparsing/mhp_extension/detectron2/tools/analyze_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c06ea4b5fbfd551d85702171976f9bc33f2e275
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/analyze_model.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import numpy as np
+from collections import Counter
+import tqdm
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import build_detection_test_loader
+from detectron2.engine import default_argument_parser
+from detectron2.modeling import build_model
+from detectron2.utils.analysis import (
+    activation_count_operators,
+    flop_count_operators,
+    parameter_count_table,
+)
+from detectron2.utils.logger import setup_logger
+
+logger = logging.getLogger("detectron2")
+
+
+def setup(args):
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.DATALOADER.NUM_WORKERS = 0
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    setup_logger()
+    return cfg
+
+
+def do_flop(cfg):
+    data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
+    model = build_model(cfg)
+    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
+    model.eval()
+
+    counts = Counter()
+    total_flops = []
+    for idx, data in zip(tqdm.trange(args.num_inputs), data_loader):  # noqa
+        count = flop_count_operators(model, data)
+        counts += count
+        total_flops.append(sum(count.values()))
+    logger.info(
+        "(G)Flops for Each Type of Operators:\n" + str([(k, v / idx) for k, v in counts.items()])
+    )
+    logger.info("Total (G)Flops: {}±{}".format(np.mean(total_flops), np.std(total_flops)))
+
+
+def do_activation(cfg):
+    data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
+    model = build_model(cfg)
+    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
+    model.eval()
+
+    counts = Counter()
+    total_activations = []
+    for idx, data in zip(tqdm.trange(args.num_inputs), data_loader):  # noqa
+        count = activation_count_operators(model, data)
+        counts += count
+        total_activations.append(sum(count.values()))
+    logger.info(
+        "(Million) Activations for Each Type of Operators:\n"
+        + str([(k, v / idx) for k, v in counts.items()])
+    )
+    logger.info(
+        "Total (Million) Activations: {}±{}".format(
+            np.mean(total_activations), np.std(total_activations)
+        )
+    )
+
+
+def do_parameter(cfg):
+    model = build_model(cfg)
+    logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5))
+
+
+def do_structure(cfg):
+    model = build_model(cfg)
+    logger.info("Model Structure:\n" + str(model))
+
+
+if __name__ == "__main__":
+    parser = default_argument_parser(
+        epilog="""
+Examples:
+
+To show parameters of a model:
+$ ./analyze_model.py --tasks parameter \\
+    --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
+
+Flops and activations are data-dependent, therefore inputs and model weights
+are needed to count them:
+
+$ ./analyze_model.py --num-inputs 100 --tasks flop \\
+    --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\
+    MODEL.WEIGHTS /path/to/model.pkl
+"""
+    )
+    parser.add_argument(
+        "--tasks",
+        choices=["flop", "activation", "parameter", "structure"],
+        required=True,
+        nargs="+",
+    )
+    parser.add_argument(
+        "--num-inputs",
+        default=100,
+        type=int,
+        help="number of inputs used to compute statistics for flops/activations, "
+        "both are data dependent.",
+    )
+    args = parser.parse_args()
+    assert not args.eval_only
+    assert args.num_gpus == 1
+
+    cfg = setup(args)
+
+    for task in args.tasks:
+        {
+            "flop": do_flop,
+            "activation": do_activation,
+            "parameter": do_parameter,
+            "structure": do_structure,
+        }[task](cfg)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/benchmark.py b/preprocess/humanparsing/mhp_extension/detectron2/tools/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eec59f476882e4045ec3c682ffe515413a3be15
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/benchmark.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+A script to benchmark builtin models.
+
+Note: this script has an extra dependency of psutil.
+"""
+
+import itertools
+import logging
+import psutil
+import torch
+import tqdm
+from fvcore.common.timer import Timer
+from torch.nn.parallel import DistributedDataParallel
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import (
+    DatasetFromList,
+    build_detection_test_loader,
+    build_detection_train_loader,
+)
+from detectron2.engine import SimpleTrainer, default_argument_parser, hooks, launch
+from detectron2.modeling import build_model
+from detectron2.solver import build_optimizer
+from detectron2.utils import comm
+from detectron2.utils.events import CommonMetricPrinter
+from detectron2.utils.logger import setup_logger
+
+logger = logging.getLogger("detectron2")
+
+
+def setup(args):
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.SOLVER.BASE_LR = 0.001  # Avoid NaNs. Not useful in this script anyway.
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    setup_logger(distributed_rank=comm.get_rank())
+    return cfg
+
+
+def benchmark_data(args):
+    cfg = setup(args)
+
+    timer = Timer()
+    dataloader = build_detection_train_loader(cfg)
+    logger.info("Initialize loader using {} seconds.".format(timer.seconds()))
+
+    timer.reset()
+    itr = iter(dataloader)
+    for i in range(10):  # warmup
+        next(itr)
+        if i == 0:
+            startup_time = timer.seconds()
+    timer = Timer()
+    max_iter = 1000
+    for _ in tqdm.trange(max_iter):
+        next(itr)
+    logger.info(
+        "{} iters ({} images) in {} seconds.".format(
+            max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds()
+        )
+    )
+    logger.info("Startup time: {} seconds".format(startup_time))
+    vram = psutil.virtual_memory()
+    logger.info(
+        "RAM Usage: {:.2f}/{:.2f} GB".format(
+            (vram.total - vram.available) / 1024 ** 3, vram.total / 1024 ** 3
+        )
+    )
+
+    # test for a few more rounds
+    for _ in range(10):
+        timer = Timer()
+        max_iter = 1000
+        for _ in tqdm.trange(max_iter):
+            next(itr)
+        logger.info(
+            "{} iters ({} images) in {} seconds.".format(
+                max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds()
+            )
+        )
+
+
+def benchmark_train(args):
+    cfg = setup(args)
+    model = build_model(cfg)
+    logger.info("Model:\n{}".format(model))
+    if comm.get_world_size() > 1:
+        model = DistributedDataParallel(
+            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
+        )
+    optimizer = build_optimizer(cfg, model)
+    checkpointer = DetectionCheckpointer(model, optimizer=optimizer)
+    checkpointer.load(cfg.MODEL.WEIGHTS)
+
+    cfg.defrost()
+    cfg.DATALOADER.NUM_WORKERS = 0
+    data_loader = build_detection_train_loader(cfg)
+    dummy_data = list(itertools.islice(data_loader, 100))
+
+    def f():
+        data = DatasetFromList(dummy_data, copy=False)
+        while True:
+            yield from data
+
+    max_iter = 400
+    trainer = SimpleTrainer(model, f(), optimizer)
+    trainer.register_hooks(
+        [hooks.IterationTimer(), hooks.PeriodicWriter([CommonMetricPrinter(max_iter)])]
+    )
+    trainer.train(1, max_iter)
+
+
+@torch.no_grad()
+def benchmark_eval(args):
+    cfg = setup(args)
+    model = build_model(cfg)
+    model.eval()
+    logger.info("Model:\n{}".format(model))
+    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
+
+    cfg.defrost()
+    cfg.DATALOADER.NUM_WORKERS = 0
+    data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
+    dummy_data = list(itertools.islice(data_loader, 100))
+
+    def f():
+        while True:
+            yield from DatasetFromList(dummy_data, copy=False)
+
+    for _ in range(5):  # warmup
+        model(dummy_data[0])
+
+    max_iter = 400
+    timer = Timer()
+    with tqdm.tqdm(total=max_iter) as pbar:
+        for idx, d in enumerate(f()):
+            if idx == max_iter:
+                break
+            model(d)
+            pbar.update()
+    logger.info("{} iters in {} seconds.".format(max_iter, timer.seconds()))
+
+
+if __name__ == "__main__":
+    parser = default_argument_parser()
+    parser.add_argument("--task", choices=["train", "eval", "data"], required=True)
+    args = parser.parse_args()
+    assert not args.eval_only
+
+    if args.task == "data":
+        f = benchmark_data
+    elif args.task == "train":
+        """
+        Note: training speed may not be representative.
+        The training cost of a R-CNN model varies with the content of the data
+        and the quality of the model.
+        """
+        f = benchmark_train
+    elif args.task == "eval":
+        f = benchmark_eval
+        # only benchmark single-GPU inference.
+        assert args.num_gpus == 1 and args.num_machines == 1
+    launch(f, args.num_gpus, args.num_machines, args.machine_rank, args.dist_url, args=(args,))
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/convert-torchvision-to-d2.py b/preprocess/humanparsing/mhp_extension/detectron2/tools/convert-torchvision-to-d2.py
new file mode 100644
index 0000000000000000000000000000000000000000..18a24e4ef96d34a4a0d1f43debc2276260da1a2b
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/convert-torchvision-to-d2.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import pickle as pkl
+import sys
+import torch
+
+"""
+Usage:
+  # download one of the ResNet{18,34,50,101,152} models from torchvision:
+  wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth
+  # run the conversion
+  ./convert-torchvision-to-d2.py r50.pth r50.pkl
+
+  # Then, use r50.pkl with the following changes in config:
+
+MODEL:
+  WEIGHTS: "/path/to/r50.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  RESNETS:
+    DEPTH: 50
+    STRIDE_IN_1X1: False
+INPUT:
+  FORMAT: "RGB"
+
+  These models typically produce slightly worse results than the
+  pre-trained ResNets we use in official configs, which are the
+  original ResNet models released by MSRA.
+"""
+
+if __name__ == "__main__":
+    input = sys.argv[1]
+
+    obj = torch.load(input, map_location="cpu")
+
+    newmodel = {}
+    for k in list(obj.keys()):
+        old_k = k
+        if "layer" not in k:
+            k = "stem." + k
+        for t in [1, 2, 3, 4]:
+            k = k.replace("layer{}".format(t), "res{}".format(t + 1))
+        for t in [1, 2, 3]:
+            k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
+        k = k.replace("downsample.0", "shortcut")
+        k = k.replace("downsample.1", "shortcut.norm")
+        print(old_k, "->", k)
+        newmodel[k] = obj.pop(old_k).detach().numpy()
+
+    res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True}
+
+    with open(sys.argv[2], "wb") as f:
+        pkl.dump(res, f)
+    if obj:
+        print("Unconverted keys:", obj.keys())
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/README.md b/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b9d5b15512c0bd160accbb1823236b8954a37b86
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/README.md
@@ -0,0 +1,9 @@
+
+This directory contains:
+
+1. A script that converts a detectron2 model to caffe2 format.
+
+2. An example that loads a Mask R-CNN model in caffe2 format and runs inference.
+
+See [tutorial](https://detectron2.readthedocs.io/tutorials/deployment.html)
+for their usage.
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/caffe2_converter.py b/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/caffe2_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..08feb69fba090a302d1624d52d146ac7a0787223
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/caffe2_converter.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import argparse
+import os
+import onnx
+import torch
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import build_detection_test_loader
+from detectron2.evaluation import COCOEvaluator, inference_on_dataset, print_csv_format
+from detectron2.export import Caffe2Tracer, add_export_config
+from detectron2.modeling import build_model
+from detectron2.utils.logger import setup_logger
+
+
+def setup_cfg(args):
+    cfg = get_cfg()
+    # cuda context is initialized before creating dataloader, so we don't fork anymore
+    cfg.DATALOADER.NUM_WORKERS = 0
+    cfg = add_export_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    if cfg.MODEL.DEVICE != "cpu":
+        TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2])
+        assert TORCH_VERSION >= (1, 5), "PyTorch>=1.5 required for GPU conversion!"
+    return cfg
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert a model using caffe2 tracing.")
+    parser.add_argument(
+        "--format",
+        choices=["caffe2", "onnx", "torchscript"],
+        help="output format",
+        default="caffe2",
+    )
+    parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
+    parser.add_argument("--run-eval", action="store_true")
+    parser.add_argument("--output", help="output directory for the converted model")
+    parser.add_argument(
+        "opts",
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    args = parser.parse_args()
+    logger = setup_logger()
+    logger.info("Command line arguments: " + str(args))
+    os.makedirs(args.output, exist_ok=True)
+
+    cfg = setup_cfg(args)
+
+    # create a torch model
+    torch_model = build_model(cfg)
+    DetectionCheckpointer(torch_model).resume_or_load(cfg.MODEL.WEIGHTS)
+
+    # get a sample data
+    data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
+    first_batch = next(iter(data_loader))
+
+    # convert and save caffe2 model
+    tracer = Caffe2Tracer(cfg, torch_model, first_batch)
+    if args.format == "caffe2":
+        caffe2_model = tracer.export_caffe2()
+        caffe2_model.save_protobuf(args.output)
+        # draw the caffe2 graph
+        caffe2_model.save_graph(os.path.join(args.output, "model.svg"), inputs=first_batch)
+    elif args.format == "onnx":
+        onnx_model = tracer.export_onnx()
+        onnx.save(onnx_model, os.path.join(args.output, "model.onnx"))
+    elif args.format == "torchscript":
+        script_model = tracer.export_torchscript()
+        script_model.save(os.path.join(args.output, "model.ts"))
+
+        # Recursively print IR of all modules
+        with open(os.path.join(args.output, "model_ts_IR.txt"), "w") as f:
+            try:
+                f.write(script_model._actual_script_module._c.dump_to_str(True, False, False))
+            except AttributeError:
+                pass
+        # Print IR of the entire graph (all submodules inlined)
+        with open(os.path.join(args.output, "model_ts_IR_inlined.txt"), "w") as f:
+            f.write(str(script_model.inlined_graph))
+        # Print the model structure in pytorch style
+        with open(os.path.join(args.output, "model.txt"), "w") as f:
+            f.write(str(script_model))
+
+    # run evaluation with the converted model
+    if args.run_eval:
+        assert args.format == "caffe2", "Python inference in other format is not yet supported."
+        dataset = cfg.DATASETS.TEST[0]
+        data_loader = build_detection_test_loader(cfg, dataset)
+        # NOTE: hard-coded evaluator. change to the evaluator for your dataset
+        evaluator = COCOEvaluator(dataset, cfg, True, args.output)
+        metrics = inference_on_dataset(caffe2_model, data_loader, evaluator)
+        print_csv_format(metrics)
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/caffe2_mask_rcnn.cpp b/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/caffe2_mask_rcnn.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..44370b4c518408f1f46345c7e3ac07c7db63a485
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/caffe2_mask_rcnn.cpp
@@ -0,0 +1,119 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+#include <c10/util/Flags.h>
+#include <caffe2/core/blob.h>
+#include <caffe2/core/common.h>
+#include <caffe2/core/init.h>
+#include <caffe2/core/net.h>
+#include <caffe2/core/workspace.h>
+#include <caffe2/utils/proto_utils.h>
+
+#include <opencv2/opencv.hpp>
+#include <cassert>
+#include <chrono>
+#include <iostream>
+#include <string>
+
+C10_DEFINE_string(predict_net, "", "path to model.pb");
+C10_DEFINE_string(init_net, "", "path to model_init.pb");
+C10_DEFINE_string(input, "", "path to input image");
+
+using namespace std;
+using namespace caffe2;
+
+int main(int argc, char** argv) {
+  caffe2::GlobalInit(&argc, &argv);
+  string predictNetPath = FLAGS_predict_net;
+  string initNetPath = FLAGS_init_net;
+  cv::Mat input = cv::imread(FLAGS_input, cv::IMREAD_COLOR);
+
+  const int height = input.rows;
+  const int width = input.cols;
+  // FPN models require divisibility of 32
+  assert(height % 32 == 0 && width % 32 == 0);
+  const int batch = 1;
+  const int channels = 3;
+
+  // initialize Net and Workspace
+  caffe2::NetDef initNet_, predictNet_;
+  CAFFE_ENFORCE(ReadProtoFromFile(initNetPath, &initNet_));
+  CAFFE_ENFORCE(ReadProtoFromFile(predictNetPath, &predictNet_));
+
+  Workspace workSpace;
+  for (auto& str : predictNet_.external_input()) {
+    workSpace.CreateBlob(str);
+  }
+  CAFFE_ENFORCE(workSpace.CreateNet(predictNet_));
+  CAFFE_ENFORCE(workSpace.RunNetOnce(initNet_));
+
+  // setup inputs
+  auto data = BlobGetMutableTensor(workSpace.GetBlob("data"), caffe2::CPU);
+  data->Resize(batch, channels, height, width);
+  float* ptr = data->mutable_data<float>();
+  // HWC to CHW
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < height * width; ++i) {
+      ptr[c * height * width + i] = static_cast<float>(input.data[3 * i + c]);
+    }
+  }
+
+  auto im_info =
+      BlobGetMutableTensor(workSpace.GetBlob("im_info"), caffe2::CPU);
+  im_info->Resize(batch, 3);
+  float* im_info_ptr = im_info->mutable_data<float>();
+  im_info_ptr[0] = height;
+  im_info_ptr[1] = width;
+  im_info_ptr[2] = 1.0;
+
+  // run the network
+  CAFFE_ENFORCE(workSpace.RunNet(predictNet_.name()));
+
+  // run 3 more times to benchmark
+  int N_benchmark = 3;
+  auto start_time = chrono::high_resolution_clock::now();
+  for (int i = 0; i < N_benchmark; ++i) {
+    CAFFE_ENFORCE(workSpace.RunNet(predictNet_.name()));
+  }
+  auto end_time = chrono::high_resolution_clock::now();
+  auto ms = chrono::duration_cast<chrono::microseconds>(end_time - start_time)
+                .count();
+  cout << "Latency (should vary with different inputs): "
+       << ms * 1.0 / 1e6 / N_benchmark << " seconds" << endl;
+
+  // parse Mask R-CNN outputs
+  caffe2::Tensor bbox(
+      workSpace.GetBlob("bbox_nms")->Get<caffe2::Tensor>(), caffe2::CPU);
+  caffe2::Tensor scores(
+      workSpace.GetBlob("score_nms")->Get<caffe2::Tensor>(), caffe2::CPU);
+  caffe2::Tensor labels(
+      workSpace.GetBlob("class_nms")->Get<caffe2::Tensor>(), caffe2::CPU);
+  caffe2::Tensor mask_probs(
+      workSpace.GetBlob("mask_fcn_probs")->Get<caffe2::Tensor>(), caffe2::CPU);
+  cout << "bbox:" << bbox.DebugString() << endl;
+  cout << "scores:" << scores.DebugString() << endl;
+  cout << "labels:" << labels.DebugString() << endl;
+  cout << "mask_probs: " << mask_probs.DebugString() << endl;
+
+  int num_instances = bbox.sizes()[0];
+  for (int i = 0; i < num_instances; ++i) {
+    float score = scores.data<float>()[i];
+    if (score < 0.6)
+      continue; // skip them
+
+    const float* box = bbox.data<float>() + i * 4;
+    int label = labels.data<float>()[i];
+
+    cout << "Prediction " << i << ", xyxy=(";
+    cout << box[0] << ", " << box[1] << ", " << box[2] << ", " << box[3]
+         << "); score=" << score << "; label=" << label << endl;
+
+    const float* mask = mask_probs.data<float>() +
+        i * mask_probs.size_from_dim(1) + label * mask_probs.size_from_dim(2);
+
+    // save the 28x28 mask
+    cv::Mat cv_mask(28, 28, CV_32FC1);
+    memcpy(cv_mask.data, mask, 28 * 28 * sizeof(float));
+    cv::imwrite("mask" + std::to_string(i) + ".png", cv_mask * 255.);
+  }
+  return 0;
+}
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/torchscript_traced_mask_rcnn.cpp b/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/torchscript_traced_mask_rcnn.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82fbdb052fa53543920bf8169a05982005e30cc5
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/deploy/torchscript_traced_mask_rcnn.cpp
@@ -0,0 +1,71 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include <string>
+
+#include <torch/csrc/autograd/grad_mode.h>
+#include <torch/script.h>
+
+using namespace std;
+
+// experimental. don't use
+int main(int argc, const char* argv[]) {
+  if (argc != 3) {
+    return 1;
+  }
+  std::string image_file = argv[2];
+
+  torch::autograd::AutoGradMode guard(false);
+  auto module = torch::jit::load(argv[1]);
+
+  assert(module.buffers().size() > 0);
+  // Assume that the entire model is on the same device.
+  // We just put input to this device.
+  auto device = (*begin(module.buffers())).device();
+
+  cv::Mat input_img = cv::imread(image_file, cv::IMREAD_COLOR);
+  const int height = input_img.rows;
+  const int width = input_img.cols;
+  // FPN models require divisibility of 32
+  assert(height % 32 == 0 && width % 32 == 0);
+  const int channels = 3;
+
+  auto input = torch::from_blob(
+      input_img.data, {1, height, width, channels}, torch::kUInt8);
+  // NHWC to NCHW
+  input = input.to(device, torch::kFloat).permute({0, 3, 1, 2}).contiguous();
+
+  std::array<float, 3> im_info_data{height * 1.0f, width * 1.0f, 1.0f};
+  auto im_info = torch::from_blob(im_info_data.data(), {1, 3}).to(device);
+
+  // run the network
+  auto output = module.forward({std::make_tuple(input, im_info)});
+
+  // run 3 more times to benchmark
+  int N_benchmark = 3;
+  auto start_time = chrono::high_resolution_clock::now();
+  for (int i = 0; i < N_benchmark; ++i) {
+    output = module.forward({std::make_tuple(input, im_info)});
+  }
+  auto end_time = chrono::high_resolution_clock::now();
+  auto ms = chrono::duration_cast<chrono::microseconds>(end_time - start_time)
+                .count();
+  cout << "Latency (should vary with different inputs): "
+       << ms * 1.0 / 1e6 / N_benchmark << " seconds" << endl;
+
+  auto outputs = output.toTuple()->elements();
+  // parse Mask R-CNN outputs
+  auto bbox = outputs[0].toTensor(), scores = outputs[1].toTensor(),
+       labels = outputs[2].toTensor(), mask_probs = outputs[3].toTensor();
+
+  cout << "bbox: " << bbox.toString() << " " << bbox.sizes() << endl;
+  cout << "scores: " << scores.toString() << " " << scores.sizes() << endl;
+  cout << "labels: " << labels.toString() << " " << labels.sizes() << endl;
+  cout << "mask_probs: " << mask_probs.toString() << " " << mask_probs.sizes()
+       << endl;
+
+  int num_instances = bbox.sizes()[0];
+  cout << bbox << endl;
+  return 0;
+}
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/finetune_net.py b/preprocess/humanparsing/mhp_extension/detectron2/tools/finetune_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e521859f70b89da747b324375a5110d8663fdc7
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/finetune_net.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Detection Training Script.
+
+This scripts reads a given config file and runs the training or evaluation.
+It is an entry point that is made to train standard models in detectron2.
+
+In order to let one script support training of many models,
+this script contains logic that are specific to these built-in models and therefore
+may not be suitable for your own project.
+For example, your research project perhaps only needs a single "evaluator".
+
+Therefore, we recommend you to use detectron2 as an library and take
+this file as an example of how to use the library.
+You may want to write your own script with your data and other customizations.
+"""
+
+import logging
+import os
+from collections import OrderedDict
+import torch
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import MetadataCatalog
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
+from detectron2.evaluation import (
+    CityscapesInstanceEvaluator,
+    CityscapesSemSegEvaluator,
+    COCOEvaluator,
+    COCOPanopticEvaluator,
+    DatasetEvaluators,
+    LVISEvaluator,
+    PascalVOCDetectionEvaluator,
+    SemSegEvaluator,
+    verify_results,
+)
+from detectron2.modeling import GeneralizedRCNNWithTTA
+
+# Register Custom Dataset
+from detectron2.data.datasets import register_coco_instances
+
+register_coco_instances("CIHP_train", {}, "../../data/msrcnn_finetune_annotations/CIHP_train.json",
+                        "../../data/instance-level_human_parsing/Training/Images")
+register_coco_instances("CIHP_val", {}, "../../data/msrcnn_finetune_annotations/CIHP_val.json",
+                        "../../data/instance-level_human_parsing/Validation/Images")
+register_coco_instances("demo_train", {}, "../../demo/annotations/demo_train.json",
+                        "../../demo/img")
+register_coco_instances("demo_val", {}, "../../demo/annotations/demo_val.json",
+                        "../../demo/img")
+
+
+class Trainer(DefaultTrainer):
+    """
+    We use the "DefaultTrainer" which contains pre-defined default logic for
+    standard training workflow. They may not work for you, especially if you
+    are working on a new research project. In that case you can use the cleaner
+    "SimpleTrainer", or write your own training loop. You can use
+    "tools/plain_train_net.py" as an example.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each builtin dataset.
+        For your own dataset, you can simply create an evaluator manually in your
+        script and do not have to worry about the hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
+            evaluator_list.append(
+                SemSegEvaluator(
+                    dataset_name,
+                    distributed=True,
+                    num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+                    ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+                    output_dir=output_folder,
+                )
+            )
+        if evaluator_type in ["coco", "coco_panoptic_seg"]:
+            evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder))
+        if evaluator_type == "coco_panoptic_seg":
+            evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
+        if evaluator_type == "cityscapes_instance":
+            assert (
+                    torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesInstanceEvaluator(dataset_name)
+        if evaluator_type == "cityscapes_sem_seg":
+            assert (
+                    torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesSemSegEvaluator(dataset_name)
+        elif evaluator_type == "pascal_voc":
+            return PascalVOCDetectionEvaluator(dataset_name)
+        elif evaluator_type == "lvis":
+            return LVISEvaluator(dataset_name, cfg, True, output_folder)
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        elif len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+    @classmethod
+    def test_with_TTA(cls, cfg, model):
+        logger = logging.getLogger("detectron2.trainer")
+        # In the end of training, run an evaluation with TTA
+        # Only support some R-CNN models.
+        logger.info("Running inference with test-time augmentation ...")
+        model = GeneralizedRCNNWithTTA(cfg, model)
+        evaluators = [
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+        return res
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if cfg.TEST.AUG.ENABLED:
+            res.update(Trainer.test_with_TTA(cfg, model))
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    """
+    If you'd like to do anything fancier than the standard training logic,
+    consider writing your own training loop (see plain_train_net.py) or
+    subclassing the trainer.
+    """
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=False)
+    if cfg.TEST.AUG.ENABLED:
+        trainer.register_hooks(
+            [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
+        )
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/inference.sh b/preprocess/humanparsing/mhp_extension/detectron2/tools/inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3b9d39ed92e9cb574ac4349f457a52a27c38aac3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/inference.sh
@@ -0,0 +1,4 @@
+python finetune_net.py \
+	--num-gpus 1 \
+	--config-file ../configs/Misc/parsing_inference.yaml \
+	--eval-only MODEL.WEIGHTS ./model_final.pth TEST.AUG.ENABLED False 
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/plain_train_net.py b/preprocess/humanparsing/mhp_extension/detectron2/tools/plain_train_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..52a0a281f84bb64fa49c7cb2122564146ee27752
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/plain_train_net.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Detectron2 training script with a plain training loop.
+
+This script reads a given config file and runs the training or evaluation.
+It is an entry point that is able to train standard models in detectron2.
+
+In order to let one script support training of many models,
+this script contains logic that are specific to these built-in models and therefore
+may not be suitable for your own project.
+For example, your research project perhaps only needs a single "evaluator".
+
+Therefore, we recommend you to use detectron2 as a library and take
+this file as an example of how to use the library.
+You may want to write your own script with your data and other customizations.
+
+Compared to "train_net.py", this script supports fewer default features.
+It also includes fewer abstraction, therefore is easier to add custom logic.
+"""
+
+import logging
+import os
+from collections import OrderedDict
+import torch
+from torch.nn.parallel import DistributedDataParallel
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer, PeriodicCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import (
+    MetadataCatalog,
+    build_detection_test_loader,
+    build_detection_train_loader,
+)
+from detectron2.engine import default_argument_parser, default_setup, launch
+from detectron2.evaluation import (
+    CityscapesInstanceEvaluator,
+    CityscapesSemSegEvaluator,
+    COCOEvaluator,
+    COCOPanopticEvaluator,
+    DatasetEvaluators,
+    LVISEvaluator,
+    PascalVOCDetectionEvaluator,
+    SemSegEvaluator,
+    inference_on_dataset,
+    print_csv_format,
+)
+from detectron2.modeling import build_model
+from detectron2.solver import build_lr_scheduler, build_optimizer
+from detectron2.utils.events import (
+    CommonMetricPrinter,
+    EventStorage,
+    JSONWriter,
+    TensorboardXWriter,
+)
+
+logger = logging.getLogger("detectron2")
+
+
+def get_evaluator(cfg, dataset_name, output_folder=None):
+    """
+    Create evaluator(s) for a given dataset.
+    This uses the special metadata "evaluator_type" associated with each builtin dataset.
+    For your own dataset, you can simply create an evaluator manually in your
+    script and do not have to worry about the hacky if-else logic here.
+    """
+    if output_folder is None:
+        output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+    evaluator_list = []
+    evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+    if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
+        evaluator_list.append(
+            SemSegEvaluator(
+                dataset_name,
+                distributed=True,
+                num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+                ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+                output_dir=output_folder,
+            )
+        )
+    if evaluator_type in ["coco", "coco_panoptic_seg"]:
+        evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder))
+    if evaluator_type == "coco_panoptic_seg":
+        evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
+    if evaluator_type == "cityscapes_instance":
+        assert (
+            torch.cuda.device_count() >= comm.get_rank()
+        ), "CityscapesEvaluator currently do not work with multiple machines."
+        return CityscapesInstanceEvaluator(dataset_name)
+    if evaluator_type == "cityscapes_sem_seg":
+        assert (
+            torch.cuda.device_count() >= comm.get_rank()
+        ), "CityscapesEvaluator currently do not work with multiple machines."
+        return CityscapesSemSegEvaluator(dataset_name)
+    if evaluator_type == "pascal_voc":
+        return PascalVOCDetectionEvaluator(dataset_name)
+    if evaluator_type == "lvis":
+        return LVISEvaluator(dataset_name, cfg, True, output_folder)
+    if len(evaluator_list) == 0:
+        raise NotImplementedError(
+            "no Evaluator for the dataset {} with the type {}".format(dataset_name, evaluator_type)
+        )
+    if len(evaluator_list) == 1:
+        return evaluator_list[0]
+    return DatasetEvaluators(evaluator_list)
+
+
+def do_test(cfg, model):
+    results = OrderedDict()
+    for dataset_name in cfg.DATASETS.TEST:
+        data_loader = build_detection_test_loader(cfg, dataset_name)
+        evaluator = get_evaluator(
+            cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
+        )
+        results_i = inference_on_dataset(model, data_loader, evaluator)
+        results[dataset_name] = results_i
+        if comm.is_main_process():
+            logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+            print_csv_format(results_i)
+    if len(results) == 1:
+        results = list(results.values())[0]
+    return results
+
+
+def do_train(cfg, model, resume=False):
+    model.train()
+    optimizer = build_optimizer(cfg, model)
+    scheduler = build_lr_scheduler(cfg, optimizer)
+
+    checkpointer = DetectionCheckpointer(
+        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
+    )
+    start_iter = (
+        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
+    )
+    max_iter = cfg.SOLVER.MAX_ITER
+
+    periodic_checkpointer = PeriodicCheckpointer(
+        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
+    )
+
+    writers = (
+        [
+            CommonMetricPrinter(max_iter),
+            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
+            TensorboardXWriter(cfg.OUTPUT_DIR),
+        ]
+        if comm.is_main_process()
+        else []
+    )
+
+    # compared to "train_net.py", we do not support accurate timing and
+    # precise BN here, because they are not trivial to implement
+    data_loader = build_detection_train_loader(cfg)
+    logger.info("Starting training from iteration {}".format(start_iter))
+    with EventStorage(start_iter) as storage:
+        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
+            iteration = iteration + 1
+            storage.step()
+
+            loss_dict = model(data)
+            losses = sum(loss_dict.values())
+            assert torch.isfinite(losses).all(), loss_dict
+
+            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
+            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
+            if comm.is_main_process():
+                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)
+
+            optimizer.zero_grad()
+            losses.backward()
+            optimizer.step()
+            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
+            scheduler.step()
+
+            if (
+                cfg.TEST.EVAL_PERIOD > 0
+                and iteration % cfg.TEST.EVAL_PERIOD == 0
+                and iteration != max_iter
+            ):
+                do_test(cfg, model)
+                # Compared to "train_net.py", the test results are not dumped to EventStorage
+                comm.synchronize()
+
+            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
+                for writer in writers:
+                    writer.write()
+            periodic_checkpointer.step(iteration)
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(
+        cfg, args
+    )  # if you don't like any of the default setup, write your own setup code
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    model = build_model(cfg)
+    logger.info("Model:\n{}".format(model))
+    if args.eval_only:
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        return do_test(cfg, model)
+
+    distributed = comm.get_world_size() > 1
+    if distributed:
+        model = DistributedDataParallel(
+            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
+        )
+
+    do_train(cfg, model, resume=args.resume)
+    return do_test(cfg, model)
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/run.sh b/preprocess/humanparsing/mhp_extension/detectron2/tools/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b89267337002df6eff52a323a07801fb1da6476c
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/run.sh
@@ -0,0 +1,3 @@
+python finetune_net.py \
+	--config-file ../configs/Misc/parsing_finetune_cihp+vip.yaml \
+	--num-gpus 8
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/train_net.py b/preprocess/humanparsing/mhp_extension/detectron2/tools/train_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1c0ee443c81a0a0f217682cce6d9051ef07c20e
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/train_net.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Detection Training Script.
+
+This scripts reads a given config file and runs the training or evaluation.
+It is an entry point that is made to train standard models in detectron2.
+
+In order to let one script support training of many models,
+this script contains logic that are specific to these built-in models and therefore
+may not be suitable for your own project.
+For example, your research project perhaps only needs a single "evaluator".
+
+Therefore, we recommend you to use detectron2 as an library and take
+this file as an example of how to use the library.
+You may want to write your own script with your data and other customizations.
+"""
+
+import logging
+import os
+from collections import OrderedDict
+import torch
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import MetadataCatalog
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
+from detectron2.evaluation import (
+    CityscapesInstanceEvaluator,
+    CityscapesSemSegEvaluator,
+    COCOEvaluator,
+    COCOPanopticEvaluator,
+    DatasetEvaluators,
+    LVISEvaluator,
+    PascalVOCDetectionEvaluator,
+    SemSegEvaluator,
+    verify_results,
+)
+from detectron2.modeling import GeneralizedRCNNWithTTA
+
+
+class Trainer(DefaultTrainer):
+    """
+    We use the "DefaultTrainer" which contains pre-defined default logic for
+    standard training workflow. They may not work for you, especially if you
+    are working on a new research project. In that case you can use the cleaner
+    "SimpleTrainer", or write your own training loop. You can use
+    "tools/plain_train_net.py" as an example.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each builtin dataset.
+        For your own dataset, you can simply create an evaluator manually in your
+        script and do not have to worry about the hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
+            evaluator_list.append(
+                SemSegEvaluator(
+                    dataset_name,
+                    distributed=True,
+                    num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+                    ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+                    output_dir=output_folder,
+                )
+            )
+        if evaluator_type in ["coco", "coco_panoptic_seg"]:
+            evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder))
+        if evaluator_type == "coco_panoptic_seg":
+            evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
+        if evaluator_type == "cityscapes_instance":
+            assert (
+                torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesInstanceEvaluator(dataset_name)
+        if evaluator_type == "cityscapes_sem_seg":
+            assert (
+                torch.cuda.device_count() >= comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesSemSegEvaluator(dataset_name)
+        elif evaluator_type == "pascal_voc":
+            return PascalVOCDetectionEvaluator(dataset_name)
+        elif evaluator_type == "lvis":
+            return LVISEvaluator(dataset_name, cfg, True, output_folder)
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        elif len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+    @classmethod
+    def test_with_TTA(cls, cfg, model):
+        logger = logging.getLogger("detectron2.trainer")
+        # In the end of training, run an evaluation with TTA
+        # Only support some R-CNN models.
+        logger.info("Running inference with test-time augmentation ...")
+        model = GeneralizedRCNNWithTTA(cfg, model)
+        evaluators = [
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+        return res
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if cfg.TEST.AUG.ENABLED:
+            res.update(Trainer.test_with_TTA(cfg, model))
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    """
+    If you'd like to do anything fancier than the standard training logic,
+    consider writing your own training loop (see plain_train_net.py) or
+    subclassing the trainer.
+    """
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    if cfg.TEST.AUG.ENABLED:
+        trainer.register_hooks(
+            [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
+        )
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/visualize_data.py b/preprocess/humanparsing/mhp_extension/detectron2/tools/visualize_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..b143b2d250787c2880657d42c9e9cc0c80c6a348
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/visualize_data.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+import os
+from itertools import chain
+import cv2
+import tqdm
+
+from detectron2.config import get_cfg
+from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader
+from detectron2.data import detection_utils as utils
+from detectron2.data.build import filter_images_with_few_keypoints
+from detectron2.utils.logger import setup_logger
+from detectron2.utils.visualizer import Visualizer
+
+
+def setup(args):
+    cfg = get_cfg()
+    if args.config_file:
+        cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    return cfg
+
+
+def parse_args(in_args=None):
+    parser = argparse.ArgumentParser(description="Visualize ground-truth data")
+    parser.add_argument(
+        "--source",
+        choices=["annotation", "dataloader"],
+        required=True,
+        help="visualize the annotations or the data loader (with pre-processing)",
+    )
+    parser.add_argument("--config-file", metavar="FILE", help="path to config file")
+    parser.add_argument("--output-dir", default="./", help="path to output directory")
+    parser.add_argument("--show", action="store_true", help="show output in a window")
+    parser.add_argument(
+        "opts",
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser.parse_args(in_args)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    logger = setup_logger()
+    logger.info("Arguments: " + str(args))
+    cfg = setup(args)
+
+    dirname = args.output_dir
+    os.makedirs(dirname, exist_ok=True)
+    metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
+
+    def output(vis, fname):
+        if args.show:
+            print(fname)
+            cv2.imshow("window", vis.get_image()[:, :, ::-1])
+            cv2.waitKey()
+        else:
+            filepath = os.path.join(dirname, fname)
+            print("Saving to {} ...".format(filepath))
+            vis.save(filepath)
+
+    scale = 2.0 if args.show else 1.0
+    if args.source == "dataloader":
+        train_data_loader = build_detection_train_loader(cfg)
+        for batch in train_data_loader:
+            for per_image in batch:
+                # Pytorch tensor is in (C, H, W) format
+                img = per_image["image"].permute(1, 2, 0).cpu().detach().numpy()
+                img = utils.convert_image_to_rgb(img, cfg.INPUT.FORMAT)
+
+                visualizer = Visualizer(img, metadata=metadata, scale=scale)
+                target_fields = per_image["instances"].get_fields()
+                labels = [metadata.thing_classes[i] for i in target_fields["gt_classes"]]
+                vis = visualizer.overlay_instances(
+                    labels=labels,
+                    boxes=target_fields.get("gt_boxes", None),
+                    masks=target_fields.get("gt_masks", None),
+                    keypoints=target_fields.get("gt_keypoints", None),
+                )
+                output(vis, str(per_image["image_id"]) + ".jpg")
+    else:
+        dicts = list(chain.from_iterable([DatasetCatalog.get(k) for k in cfg.DATASETS.TRAIN]))
+        if cfg.MODEL.KEYPOINT_ON:
+            dicts = filter_images_with_few_keypoints(dicts, 1)
+        for dic in tqdm.tqdm(dicts):
+            img = utils.read_image(dic["file_name"], "RGB")
+            visualizer = Visualizer(img, metadata=metadata, scale=scale)
+            vis = visualizer.draw_dataset_dict(dic)
+            output(vis, os.path.basename(dic["file_name"]))
diff --git a/preprocess/humanparsing/mhp_extension/detectron2/tools/visualize_json_results.py b/preprocess/humanparsing/mhp_extension/detectron2/tools/visualize_json_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..d11ecb90382a630d90661bc65cefc4f8bf3486cf
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/detectron2/tools/visualize_json_results.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import argparse
+import json
+import numpy as np
+import os
+from collections import defaultdict
+import cv2
+import tqdm
+from fvcore.common.file_io import PathManager
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import Boxes, BoxMode, Instances
+from detectron2.utils.logger import setup_logger
+from detectron2.utils.visualizer import Visualizer
+
+
+def create_instances(predictions, image_size):
+    ret = Instances(image_size)
+
+    score = np.asarray([x["score"] for x in predictions])
+    chosen = (score > args.conf_threshold).nonzero()[0]
+    score = score[chosen]
+    bbox = np.asarray([predictions[i]["bbox"] for i in chosen]).reshape(-1, 4)
+    bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+
+    labels = np.asarray([dataset_id_map(predictions[i]["category_id"]) for i in chosen])
+
+    ret.scores = score
+    ret.pred_boxes = Boxes(bbox)
+    ret.pred_classes = labels
+
+    try:
+        ret.pred_masks = [predictions[i]["segmentation"] for i in chosen]
+    except KeyError:
+        pass
+    return ret
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="A script that visualizes the json predictions from COCO or LVIS dataset."
+    )
+    parser.add_argument("--input", required=True, help="JSON file produced by the model")
+    parser.add_argument("--output", required=True, help="output directory")
+    parser.add_argument("--dataset", help="name of the dataset", default="coco_2017_val")
+    parser.add_argument("--conf-threshold", default=0.5, type=float, help="confidence threshold")
+    args = parser.parse_args()
+
+    logger = setup_logger()
+
+    with PathManager.open(args.input, "r") as f:
+        predictions = json.load(f)
+
+    pred_by_image = defaultdict(list)
+    for p in predictions:
+        pred_by_image[p["image_id"]].append(p)
+
+    dicts = list(DatasetCatalog.get(args.dataset))
+    metadata = MetadataCatalog.get(args.dataset)
+    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
+
+        def dataset_id_map(ds_id):
+            return metadata.thing_dataset_id_to_contiguous_id[ds_id]
+
+    elif "lvis" in args.dataset:
+        # LVIS results are in the same format as COCO results, but have a different
+        # mapping from dataset category id to contiguous category id in [0, #categories - 1]
+        def dataset_id_map(ds_id):
+            return ds_id - 1
+
+    else:
+        raise ValueError("Unsupported dataset: {}".format(args.dataset))
+
+    os.makedirs(args.output, exist_ok=True)
+
+    for dic in tqdm.tqdm(dicts):
+        img = cv2.imread(dic["file_name"], cv2.IMREAD_COLOR)[:, :, ::-1]
+        basename = os.path.basename(dic["file_name"])
+
+        predictions = create_instances(pred_by_image[dic["image_id"]], img.shape[:2])
+        vis = Visualizer(img, metadata)
+        vis_pred = vis.draw_instance_predictions(predictions).get_image()
+
+        vis = Visualizer(img, metadata)
+        vis_gt = vis.draw_dataset_dict(dic).get_image()
+
+        concat = np.concatenate((vis_pred, vis_gt), axis=1)
+        cv2.imwrite(os.path.join(args.output, basename), concat[:, :, ::-1])
diff --git a/preprocess/humanparsing/mhp_extension/global_local_parsing/global_local_datasets.py b/preprocess/humanparsing/mhp_extension/global_local_parsing/global_local_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b00594ef3302af2a30440676f96a4904ffe9077
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/global_local_parsing/global_local_datasets.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   datasets.py
+@Time    :   8/4/19 3:35 PM
+@Desc    :
+@License :   This source code is licensed under the license found in the
+             LICENSE file in the root directory of this source tree.
+"""
+
+import os
+import numpy as np
+import random
+import torch
+import cv2
+from torch.utils import data
+from utils.transforms import get_affine_transform
+
+
+class CropDataSet(data.Dataset):
+    def __init__(self, root, split_name, crop_size=[473, 473], scale_factor=0.25,
+                 rotation_factor=30, ignore_label=255, transform=None):
+        self.root = root
+        self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
+        self.crop_size = np.asarray(crop_size)
+        self.ignore_label = ignore_label
+        self.scale_factor = scale_factor
+        self.rotation_factor = rotation_factor
+        self.flip_prob = 0.5
+        self.transform = transform
+        self.split_name = split_name
+
+        list_path = os.path.join(self.root, self.split_name + '.txt')
+        train_list = [i_id.strip() for i_id in open(list_path)]
+
+        self.train_list = train_list
+        self.number_samples = len(self.train_list)
+
+    def __len__(self):
+        return self.number_samples
+
+    def _box2cs(self, box):
+        x, y, w, h = box[:4]
+        return self._xywh2cs(x, y, w, h)
+
+    def _xywh2cs(self, x, y, w, h):
+        center = np.zeros((2), dtype=np.float32)
+        center[0] = x + w * 0.5
+        center[1] = y + h * 0.5
+        if w > self.aspect_ratio * h:
+            h = w * 1.0 / self.aspect_ratio
+        elif w < self.aspect_ratio * h:
+            w = h * self.aspect_ratio
+        scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
+        return center, scale
+
+    def __getitem__(self, index):
+        train_item = self.train_list[index]
+
+        im_path = os.path.join(self.root, self.split_name + '_images', train_item + '.jpg')
+        parsing_anno_path = os.path.join(self.root, self.split_name + '_segmentations', train_item + '.png')
+
+        im = cv2.imread(im_path, cv2.IMREAD_COLOR)
+        h, w, _ = im.shape
+        parsing_anno = np.zeros((h, w), dtype=np.long)
+
+        # Get person center and scale
+        person_center, s = self._box2cs([0, 0, w - 1, h - 1])
+        r = 0
+
+        if self.split_name != 'test':
+            # Get pose annotation
+            parsing_anno = cv2.imread(parsing_anno_path, cv2.IMREAD_GRAYSCALE)
+            sf = self.scale_factor
+            rf = self.rotation_factor
+            s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
+            r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0
+
+            if random.random() <= self.flip_prob:
+                im = im[:, ::-1, :]
+                parsing_anno = parsing_anno[:, ::-1]
+                person_center[0] = im.shape[1] - person_center[0] - 1
+                right_idx = [15, 17, 19]
+                left_idx = [14, 16, 18]
+                for i in range(0, 3):
+                    right_pos = np.where(parsing_anno == right_idx[i])
+                    left_pos = np.where(parsing_anno == left_idx[i])
+                    parsing_anno[right_pos[0], right_pos[1]] = left_idx[i]
+                    parsing_anno[left_pos[0], left_pos[1]] = right_idx[i]
+
+        trans = get_affine_transform(person_center, s, r, self.crop_size)
+        input = cv2.warpAffine(
+            im,
+            trans,
+            (int(self.crop_size[1]), int(self.crop_size[0])),
+            flags=cv2.INTER_LINEAR,
+            borderMode=cv2.BORDER_CONSTANT,
+            borderValue=(0, 0, 0))
+
+        if self.transform:
+            input = self.transform(input)
+
+        meta = {
+            'name': train_item,
+            'center': person_center,
+            'height': h,
+            'width': w,
+            'scale': s,
+            'rotation': r
+        }
+
+        if self.split_name == 'val' or self.split_name == 'test':
+            return input, meta
+        else:
+            label_parsing = cv2.warpAffine(
+                parsing_anno,
+                trans,
+                (int(self.crop_size[1]), int(self.crop_size[0])),
+                flags=cv2.INTER_NEAREST,
+                borderMode=cv2.BORDER_CONSTANT,
+                borderValue=(255))
+
+            label_parsing = torch.from_numpy(label_parsing)
+
+            return input, label_parsing, meta
+
+
+class CropDataValSet(data.Dataset):
+    def __init__(self, root, split_name='crop_pic', crop_size=[473, 473], transform=None, flip=False):
+        self.root = root
+        self.crop_size = crop_size
+        self.transform = transform
+        self.flip = flip
+        self.split_name = split_name
+        self.root = root
+        self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
+        self.crop_size = np.asarray(crop_size)
+
+        list_path = os.path.join(self.root, self.split_name + '.txt')
+        val_list = [i_id.strip() for i_id in open(list_path)]
+
+        self.val_list = val_list
+        self.number_samples = len(self.val_list)
+
+    def __len__(self):
+        return len(self.val_list)
+
+    def _box2cs(self, box):
+        x, y, w, h = box[:4]
+        return self._xywh2cs(x, y, w, h)
+
+    def _xywh2cs(self, x, y, w, h):
+        center = np.zeros((2), dtype=np.float32)
+        center[0] = x + w * 0.5
+        center[1] = y + h * 0.5
+        if w > self.aspect_ratio * h:
+            h = w * 1.0 / self.aspect_ratio
+        elif w < self.aspect_ratio * h:
+            w = h * self.aspect_ratio
+        scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
+
+        return center, scale
+
+    def __getitem__(self, index):
+        val_item = self.val_list[index]
+        # Load training image
+        im_path = os.path.join(self.root, self.split_name, val_item + '.jpg')
+        im = cv2.imread(im_path, cv2.IMREAD_COLOR)
+        h, w, _ = im.shape
+        # Get person center and scale
+        person_center, s = self._box2cs([0, 0, w - 1, h - 1])
+        r = 0
+        trans = get_affine_transform(person_center, s, r, self.crop_size)
+        input = cv2.warpAffine(
+            im,
+            trans,
+            (int(self.crop_size[1]), int(self.crop_size[0])),
+            flags=cv2.INTER_LINEAR,
+            borderMode=cv2.BORDER_CONSTANT,
+            borderValue=(0, 0, 0))
+        input = self.transform(input)
+        flip_input = input.flip(dims=[-1])
+        if self.flip:
+            batch_input_im = torch.stack([input, flip_input])
+        else:
+            batch_input_im = input
+
+        meta = {
+            'name': val_item,
+            'center': person_center,
+            'height': h,
+            'width': w,
+            'scale': s,
+            'rotation': r
+        }
+
+        return batch_input_im, meta
diff --git a/preprocess/humanparsing/mhp_extension/global_local_parsing/global_local_evaluate.py b/preprocess/humanparsing/mhp_extension/global_local_parsing/global_local_evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..288e3c8214f945d5a4f5fc6824b74b3d42e037b2
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/global_local_parsing/global_local_evaluate.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   evaluate.py
+@Time    :   8/4/19 3:36 PM
+@Desc    :
+@License :   This source code is licensed under the license found in the
+             LICENSE file in the root directory of this source tree.
+"""
+
+import os
+import argparse
+import numpy as np
+import torch
+
+from torch.utils import data
+from tqdm import tqdm
+from PIL import Image as PILImage
+import torchvision.transforms as transforms
+import torch.backends.cudnn as cudnn
+
+import networks
+from utils.miou import compute_mean_ioU
+from utils.transforms import BGR2RGB_transform
+from utils.transforms import transform_parsing, transform_logits
+from mhp_extension.global_local_parsing.global_local_datasets import CropDataValSet
+
+
+def get_arguments():
+    """Parse all the arguments provided from the CLI.
+
+    Returns:
+      A list of parsed arguments.
+    """
+    parser = argparse.ArgumentParser(description="Self Correction for Human Parsing")
+
+    # Network Structure
+    parser.add_argument("--arch", type=str, default='resnet101')
+    # Data Preference
+    parser.add_argument("--data-dir", type=str, default='./data/LIP')
+    parser.add_argument("--batch-size", type=int, default=1)
+    parser.add_argument("--split-name", type=str, default='crop_pic')
+    parser.add_argument("--input-size", type=str, default='473,473')
+    parser.add_argument("--num-classes", type=int, default=20)
+    parser.add_argument("--ignore-label", type=int, default=255)
+    parser.add_argument("--random-mirror", action="store_true")
+    parser.add_argument("--random-scale", action="store_true")
+    # Evaluation Preference
+    parser.add_argument("--log-dir", type=str, default='./log')
+    parser.add_argument("--model-restore", type=str, default='./log/checkpoint.pth.tar')
+    parser.add_argument("--gpu", type=str, default='0', help="choose gpu device.")
+    parser.add_argument("--save-results", action="store_true", help="whether to save the results.")
+    parser.add_argument("--flip", action="store_true", help="random flip during the test.")
+    parser.add_argument("--multi-scales", type=str, default='1', help="multiple scales during the test")
+    return parser.parse_args()
+
+
+def get_palette(num_cls):
+    """ Returns the color map for visualizing the segmentation mask.
+    Args:
+        num_cls: Number of classes
+    Returns:
+        The color map
+    """
+    n = num_cls
+    palette = [0] * (n * 3)
+    for j in range(0, n):
+        lab = j
+        palette[j * 3 + 0] = 0
+        palette[j * 3 + 1] = 0
+        palette[j * 3 + 2] = 0
+        i = 0
+        while lab:
+            palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
+            palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
+            palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
+            i += 1
+            lab >>= 3
+    return palette
+
+
+def multi_scale_testing(model, batch_input_im, crop_size=[473, 473], flip=True, multi_scales=[1]):
+    flipped_idx = (15, 14, 17, 16, 19, 18)
+    if len(batch_input_im.shape) > 4:
+        batch_input_im = batch_input_im.squeeze()
+    if len(batch_input_im.shape) == 3:
+        batch_input_im = batch_input_im.unsqueeze(0)
+
+    interp = torch.nn.Upsample(size=crop_size, mode='bilinear', align_corners=True)
+    ms_outputs = []
+    for s in multi_scales:
+        interp_im = torch.nn.Upsample(scale_factor=s, mode='bilinear', align_corners=True)
+        scaled_im = interp_im(batch_input_im)
+        parsing_output = model(scaled_im)
+        parsing_output = parsing_output[0][-1]
+        output = parsing_output[0]
+        if flip:
+            flipped_output = parsing_output[1]
+            flipped_output[14:20, :, :] = flipped_output[flipped_idx, :, :]
+            output += flipped_output.flip(dims=[-1])
+            output *= 0.5
+        output = interp(output.unsqueeze(0))
+        ms_outputs.append(output[0])
+    ms_fused_parsing_output = torch.stack(ms_outputs)
+    ms_fused_parsing_output = ms_fused_parsing_output.mean(0)
+    ms_fused_parsing_output = ms_fused_parsing_output.permute(1, 2, 0)  # HWC
+    parsing = torch.argmax(ms_fused_parsing_output, dim=2)
+    parsing = parsing.data.cpu().numpy()
+    ms_fused_parsing_output = ms_fused_parsing_output.data.cpu().numpy()
+    return parsing, ms_fused_parsing_output
+
+
+def main():
+    """Create the model and start the evaluation process."""
+    args = get_arguments()
+    multi_scales = [float(i) for i in args.multi_scales.split(',')]
+    gpus = [int(i) for i in args.gpu.split(',')]
+    assert len(gpus) == 1
+    if not args.gpu == 'None':
+        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+
+    cudnn.benchmark = True
+    cudnn.enabled = True
+
+    h, w = map(int, args.input_size.split(','))
+    input_size = [h, w]
+
+    model = networks.init_model(args.arch, num_classes=args.num_classes, pretrained=None)
+
+    IMAGE_MEAN = model.mean
+    IMAGE_STD = model.std
+    INPUT_SPACE = model.input_space
+    print('image mean: {}'.format(IMAGE_MEAN))
+    print('image std: {}'.format(IMAGE_STD))
+    print('input space:{}'.format(INPUT_SPACE))
+    if INPUT_SPACE == 'BGR':
+        print('BGR Transformation')
+        transform = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize(mean=IMAGE_MEAN,
+                                 std=IMAGE_STD),
+
+        ])
+    if INPUT_SPACE == 'RGB':
+        print('RGB Transformation')
+        transform = transforms.Compose([
+            transforms.ToTensor(),
+            BGR2RGB_transform(),
+            transforms.Normalize(mean=IMAGE_MEAN,
+                                 std=IMAGE_STD),
+        ])
+
+    # Data loader
+    lip_test_dataset = CropDataValSet(args.data_dir, args.split_name, crop_size=input_size, transform=transform,
+                                      flip=args.flip)
+    num_samples = len(lip_test_dataset)
+    print('Totoal testing sample numbers: {}'.format(num_samples))
+    testloader = data.DataLoader(lip_test_dataset, batch_size=args.batch_size, shuffle=False, pin_memory=True)
+
+    # Load model weight
+    state_dict = torch.load(args.model_restore)
+    from collections import OrderedDict
+    new_state_dict = OrderedDict()
+    for k, v in state_dict.items():
+        name = k[7:]  # remove `module.`
+        new_state_dict[name] = v
+    model.load_state_dict(new_state_dict)
+    model.cuda()
+    model.eval()
+
+    sp_results_dir = os.path.join(args.log_dir, args.split_name + '_parsing')
+    if not os.path.exists(sp_results_dir):
+        os.makedirs(sp_results_dir)
+
+    palette = get_palette(20)
+    parsing_preds = []
+    scales = np.zeros((num_samples, 2), dtype=np.float32)
+    centers = np.zeros((num_samples, 2), dtype=np.int32)
+    with torch.no_grad():
+        for idx, batch in enumerate(tqdm(testloader)):
+            image, meta = batch
+            if (len(image.shape) > 4):
+                image = image.squeeze()
+            im_name = meta['name'][0]
+            c = meta['center'].numpy()[0]
+            s = meta['scale'].numpy()[0]
+            w = meta['width'].numpy()[0]
+            h = meta['height'].numpy()[0]
+            scales[idx, :] = s
+            centers[idx, :] = c
+            parsing, logits = multi_scale_testing(model, image.cuda(), crop_size=input_size, flip=args.flip,
+                                                  multi_scales=multi_scales)
+            if args.save_results:
+                parsing_result = transform_parsing(parsing, c, s, w, h, input_size)
+                parsing_result_path = os.path.join(sp_results_dir, im_name + '.png')
+                output_im = PILImage.fromarray(np.asarray(parsing_result, dtype=np.uint8))
+                output_im.putpalette(palette)
+                output_im.save(parsing_result_path)
+                # save logits
+                logits_result = transform_logits(logits, c, s, w, h, input_size)
+                logits_result_path = os.path.join(sp_results_dir, im_name + '.npy')
+                np.save(logits_result_path, logits_result)
+    return
+
+
+if __name__ == '__main__':
+    main()
diff --git a/preprocess/humanparsing/mhp_extension/global_local_parsing/global_local_train.py b/preprocess/humanparsing/mhp_extension/global_local_parsing/global_local_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..810b1dbbbc0bbc489830903770cc4d627e16c218
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/global_local_parsing/global_local_train.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   train.py
+@Time    :   8/4/19 3:36 PM
+@Desc    :
+@License :   This source code is licensed under the license found in the
+             LICENSE file in the root directory of this source tree.
+"""
+
+import os
+import json
+import timeit
+import argparse
+
+import torch
+import torch.optim as optim
+import torchvision.transforms as transforms
+import torch.backends.cudnn as cudnn
+from torch.utils import data
+
+import networks
+import utils.schp as schp
+from datasets.datasets import LIPDataSet
+from datasets.target_generation import generate_edge_tensor
+from utils.transforms import BGR2RGB_transform
+from utils.criterion import CriterionAll
+from utils.encoding import DataParallelModel, DataParallelCriterion
+from utils.warmup_scheduler import SGDRScheduler
+
+
+def get_arguments():
+    """Parse all the arguments provided from the CLI.
+    Returns:
+      A list of parsed arguments.
+    """
+    parser = argparse.ArgumentParser(description="Self Correction for Human Parsing")
+
+    # Network Structure
+    parser.add_argument("--arch", type=str, default='resnet101')
+    # Data Preference
+    parser.add_argument("--data-dir", type=str, default='./data/LIP')
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--input-size", type=str, default='473,473')
+    parser.add_argument("--split-name", type=str, default='crop_pic')
+    parser.add_argument("--num-classes", type=int, default=20)
+    parser.add_argument("--ignore-label", type=int, default=255)
+    parser.add_argument("--random-mirror", action="store_true")
+    parser.add_argument("--random-scale", action="store_true")
+    # Training Strategy
+    parser.add_argument("--learning-rate", type=float, default=7e-3)
+    parser.add_argument("--momentum", type=float, default=0.9)
+    parser.add_argument("--weight-decay", type=float, default=5e-4)
+    parser.add_argument("--gpu", type=str, default='0,1,2')
+    parser.add_argument("--start-epoch", type=int, default=0)
+    parser.add_argument("--epochs", type=int, default=150)
+    parser.add_argument("--eval-epochs", type=int, default=10)
+    parser.add_argument("--imagenet-pretrain", type=str, default='./pretrain_model/resnet101-imagenet.pth')
+    parser.add_argument("--log-dir", type=str, default='./log')
+    parser.add_argument("--model-restore", type=str, default='./log/checkpoint.pth.tar')
+    parser.add_argument("--schp-start", type=int, default=100, help='schp start epoch')
+    parser.add_argument("--cycle-epochs", type=int, default=10, help='schp cyclical epoch')
+    parser.add_argument("--schp-restore", type=str, default='./log/schp_checkpoint.pth.tar')
+    parser.add_argument("--lambda-s", type=float, default=1, help='segmentation loss weight')
+    parser.add_argument("--lambda-e", type=float, default=1, help='edge loss weight')
+    parser.add_argument("--lambda-c", type=float, default=0.1, help='segmentation-edge consistency loss weight')
+    return parser.parse_args()
+
+
+def main():
+    args = get_arguments()
+    print(args)
+
+    start_epoch = 0
+    cycle_n = 0
+
+    if not os.path.exists(args.log_dir):
+        os.makedirs(args.log_dir)
+    with open(os.path.join(args.log_dir, 'args.json'), 'w') as opt_file:
+        json.dump(vars(args), opt_file)
+
+    gpus = [int(i) for i in args.gpu.split(',')]
+    if not args.gpu == 'None':
+        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+
+    input_size = list(map(int, args.input_size.split(',')))
+
+    cudnn.enabled = True
+    cudnn.benchmark = True
+
+    # Model Initialization
+    AugmentCE2P = networks.init_model(args.arch, num_classes=args.num_classes, pretrained=args.imagenet_pretrain)
+    model = DataParallelModel(AugmentCE2P)
+    model.cuda()
+
+    IMAGE_MEAN = AugmentCE2P.mean
+    IMAGE_STD = AugmentCE2P.std
+    INPUT_SPACE = AugmentCE2P.input_space
+    print('image mean: {}'.format(IMAGE_MEAN))
+    print('image std: {}'.format(IMAGE_STD))
+    print('input space:{}'.format(INPUT_SPACE))
+
+    restore_from = args.model_restore
+    if os.path.exists(restore_from):
+        print('Resume training from {}'.format(restore_from))
+        checkpoint = torch.load(restore_from)
+        model.load_state_dict(checkpoint['state_dict'])
+        start_epoch = checkpoint['epoch']
+
+    SCHP_AugmentCE2P = networks.init_model(args.arch, num_classes=args.num_classes, pretrained=args.imagenet_pretrain)
+    schp_model = DataParallelModel(SCHP_AugmentCE2P)
+    schp_model.cuda()
+
+    if os.path.exists(args.schp_restore):
+        print('Resuming schp checkpoint from {}'.format(args.schp_restore))
+        schp_checkpoint = torch.load(args.schp_restore)
+        schp_model_state_dict = schp_checkpoint['state_dict']
+        cycle_n = schp_checkpoint['cycle_n']
+        schp_model.load_state_dict(schp_model_state_dict)
+
+    # Loss Function
+    criterion = CriterionAll(lambda_1=args.lambda_s, lambda_2=args.lambda_e, lambda_3=args.lambda_c,
+                             num_classes=args.num_classes)
+    criterion = DataParallelCriterion(criterion)
+    criterion.cuda()
+
+    # Data Loader
+    if INPUT_SPACE == 'BGR':
+        print('BGR Transformation')
+        transform = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize(mean=IMAGE_MEAN,
+                                 std=IMAGE_STD),
+        ])
+
+    elif INPUT_SPACE == 'RGB':
+        print('RGB Transformation')
+        transform = transforms.Compose([
+            transforms.ToTensor(),
+            BGR2RGB_transform(),
+            transforms.Normalize(mean=IMAGE_MEAN,
+                                 std=IMAGE_STD),
+        ])
+
+    train_dataset = LIPDataSet(args.data_dir, args.split_name, crop_size=input_size, transform=transform)
+    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size * len(gpus),
+                                   num_workers=16, shuffle=True, pin_memory=True, drop_last=True)
+    print('Total training samples: {}'.format(len(train_dataset)))
+
+    # Optimizer Initialization
+    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum,
+                          weight_decay=args.weight_decay)
+
+    lr_scheduler = SGDRScheduler(optimizer, total_epoch=args.epochs,
+                                 eta_min=args.learning_rate / 100, warmup_epoch=10,
+                                 start_cyclical=args.schp_start, cyclical_base_lr=args.learning_rate / 2,
+                                 cyclical_epoch=args.cycle_epochs)
+
+    total_iters = args.epochs * len(train_loader)
+    start = timeit.default_timer()
+    for epoch in range(start_epoch, args.epochs):
+        lr_scheduler.step(epoch=epoch)
+        lr = lr_scheduler.get_lr()[0]
+
+        model.train()
+        for i_iter, batch in enumerate(train_loader):
+            i_iter += len(train_loader) * epoch
+
+            images, labels, _ = batch
+            labels = labels.cuda(non_blocking=True)
+
+            edges = generate_edge_tensor(labels)
+            labels = labels.type(torch.cuda.LongTensor)
+            edges = edges.type(torch.cuda.LongTensor)
+
+            preds = model(images)
+
+            # Online Self Correction Cycle with Label Refinement
+            if cycle_n >= 1:
+                with torch.no_grad():
+                    soft_preds = schp_model(images)
+                    soft_parsing = []
+                    soft_edge = []
+                    for soft_pred in soft_preds:
+                        soft_parsing.append(soft_pred[0][-1])
+                        soft_edge.append(soft_pred[1][-1])
+                    soft_preds = torch.cat(soft_parsing, dim=0)
+                    soft_edges = torch.cat(soft_edge, dim=0)
+            else:
+                soft_preds = None
+                soft_edges = None
+
+            loss = criterion(preds, [labels, edges, soft_preds, soft_edges], cycle_n)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            if i_iter % 100 == 0:
+                print('iter = {} of {} completed, lr = {}, loss = {}'.format(i_iter, total_iters, lr,
+                                                                             loss.data.cpu().numpy()))
+        if (epoch + 1) % (args.eval_epochs) == 0:
+            schp.save_checkpoint({
+                'epoch': epoch + 1,
+                'state_dict': model.state_dict(),
+            }, False, args.log_dir, filename='checkpoint_{}.pth.tar'.format(epoch + 1))
+
+        # Self Correction Cycle with Model Aggregation
+        if (epoch + 1) >= args.schp_start and (epoch + 1 - args.schp_start) % args.cycle_epochs == 0:
+            print('Self-correction cycle number {}'.format(cycle_n))
+            schp.moving_average(schp_model, model, 1.0 / (cycle_n + 1))
+            cycle_n += 1
+            schp.bn_re_estimate(train_loader, schp_model)
+            schp.save_schp_checkpoint({
+                'state_dict': schp_model.state_dict(),
+                'cycle_n': cycle_n,
+            }, False, args.log_dir, filename='schp_{}_checkpoint.pth.tar'.format(cycle_n))
+
+        torch.cuda.empty_cache()
+        end = timeit.default_timer()
+        print('epoch = {} of {} completed using {} s'.format(epoch, args.epochs,
+                                                             (end - start) / (epoch - start_epoch + 1)))
+
+    end = timeit.default_timer()
+    print('Training Finished in {} seconds'.format(end - start))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/preprocess/humanparsing/mhp_extension/global_local_parsing/make_id_list.py b/preprocess/humanparsing/mhp_extension/global_local_parsing/make_id_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..311edf45e2d5a00ad85f3df96530e2f51bfd4686
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/global_local_parsing/make_id_list.py
@@ -0,0 +1,13 @@
+import os
+
+DATASET = 'VIP'  # DATASET: MHPv2 or CIHP or VIP
+TYPE = 'crop_pic'  # crop_pic or DemoDataset
+IMG_DIR = '../demo/cropped_img/crop_pic'
+SAVE_DIR = '../demo/cropped_img'
+
+if not os.path.exists(SAVE_DIR):
+    os.makedirs(SAVE_DIR)
+
+with open(os.path.join(SAVE_DIR, TYPE + '.txt'), "w") as f:
+    for img_name in os.listdir(IMG_DIR):
+        f.write(img_name[:-4] + '\n')
diff --git a/preprocess/humanparsing/mhp_extension/logits_fusion.py b/preprocess/humanparsing/mhp_extension/logits_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..07a8446282d24b7811b56de5b9591da29ffcdd60
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/logits_fusion.py
@@ -0,0 +1,307 @@
+import argparse
+import cv2
+import os
+import json
+import numpy as np
+from PIL import Image as PILImage
+import joblib
+
+
+def mask_nms(masks, bbox_scores, instances_confidence_threshold=0.5, overlap_threshold=0.7):
+    """
+    NMS-like procedure used in Panoptic Segmentation
+    Remove the overlap areas of different instances in Instance Segmentation
+    """
+    panoptic_seg = np.zeros(masks.shape[:2], dtype=np.uint8)
+    sorted_inds = list(range(len(bbox_scores)))
+    current_segment_id = 0
+    segments_score = []
+
+    for inst_id in sorted_inds:
+        score = bbox_scores[inst_id]
+        if score < instances_confidence_threshold:
+            break
+        mask = masks[:, :, inst_id]
+        mask_area = mask.sum()
+
+        if mask_area == 0:
+            continue
+
+        intersect = (mask > 0) & (panoptic_seg > 0)
+        intersect_area = intersect.sum()
+
+        if intersect_area * 1.0 / mask_area > overlap_threshold:
+            continue
+
+        if intersect_area > 0:
+            mask = mask & (panoptic_seg == 0)
+
+        current_segment_id += 1
+        #         panoptic_seg[np.where(mask==1)] = current_segment_id
+        #         panoptic_seg = panoptic_seg + current_segment_id*mask
+        panoptic_seg = np.where(mask == 0, panoptic_seg, current_segment_id)
+        segments_score.append(score)
+    #         print(np.unique(panoptic_seg))
+    return panoptic_seg, segments_score
+
+
+def extend(si, sj, instance_label, global_label, panoptic_seg_mask, class_map):
+    """
+    """
+    directions = [[-1, 0], [0, 1], [1, 0], [0, -1],
+                  [1, 1], [1, -1], [-1, 1], [-1, -1]]
+
+    inst_class = instance_label[si, sj]
+    human_class = panoptic_seg_mask[si, sj]
+    global_class = class_map[inst_class]
+    queue = [[si, sj]]
+
+    while len(queue) != 0:
+        cur = queue[0]
+        queue.pop(0)
+
+        for direction in directions:
+            ni = cur[0] + direction[0]
+            nj = cur[1] + direction[1]
+
+            if ni >= 0 and nj >= 0 and \
+                    ni < instance_label.shape[0] and \
+                    nj < instance_label.shape[1] and \
+                    instance_label[ni, nj] == 0 and \
+                    global_label[ni, nj] == global_class:
+                instance_label[ni, nj] = inst_class
+                # Using refined instance label to refine human label
+                panoptic_seg_mask[ni, nj] = human_class
+                queue.append([ni, nj])
+
+
+def refine(instance_label, panoptic_seg_mask, global_label, class_map):
+    """
+    Inputs:
+        [ instance_label ]
+            np.array() with shape [h, w]
+        [ global_label ] with shape [h, w]
+            np.array()
+  """
+    for i in range(instance_label.shape[0]):
+        for j in range(instance_label.shape[1]):
+            if instance_label[i, j] != 0:
+                extend(i, j, instance_label, global_label, panoptic_seg_mask, class_map)
+
+
+def get_palette(num_cls):
+    """ Returns the color map for visualizing the segmentation mask.
+    Inputs:
+        =num_cls=
+            Number of classes.
+    Returns:
+        The color map.
+    """
+    n = num_cls
+    palette = [0] * (n * 3)
+    for j in range(0, n):
+        lab = j
+        palette[j * 3 + 0] = 0
+        palette[j * 3 + 1] = 0
+        palette[j * 3 + 2] = 0
+        i = 0
+        while lab:
+            palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
+            palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
+            palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
+            i += 1
+            lab >>= 3
+    return palette
+
+
+def patch2img_output(patch_dir, img_name, img_height, img_width, bbox, bbox_type, num_class):
+    """transform bbox patch outputs to image output"""
+    assert bbox_type == 'gt' or 'msrcnn'
+    output = np.zeros((img_height, img_width, num_class), dtype='float')
+    output[:, :, 0] = np.inf
+    count_predictions = np.zeros((img_height, img_width, num_class), dtype='int32')
+    for i in range(len(bbox)):  # person index starts from 1
+        file_path = os.path.join(patch_dir, os.path.splitext(img_name)[0] + '_' + str(i + 1) + '_' + bbox_type + '.npy')
+        bbox_output = np.load(file_path)
+        output[bbox[i][1]:bbox[i][3] + 1, bbox[i][0]:bbox[i][2] + 1, 1:] += bbox_output[:, :, 1:]
+        count_predictions[bbox[i][1]:bbox[i][3] + 1, bbox[i][0]:bbox[i][2] + 1, 1:] += 1
+        output[bbox[i][1]:bbox[i][3] + 1, bbox[i][0]:bbox[i][2] + 1, 0] \
+            = np.minimum(output[bbox[i][1]:bbox[i][3] + 1, bbox[i][0]:bbox[i][2] + 1, 0], bbox_output[:, :, 0])
+
+    # Caution zero dividing.
+    count_predictions[count_predictions == 0] = 1
+    return output / count_predictions
+
+
+def get_instance(cat_gt, panoptic_seg_mask):
+    """
+    """
+    instance_gt = np.zeros_like(cat_gt, dtype=np.uint8)
+    num_humans = len(np.unique(panoptic_seg_mask)) - 1
+    class_map = {}
+
+    total_part_num = 0
+    for id in range(1, num_humans + 1):
+        human_part_label = np.where(panoptic_seg_mask == id, cat_gt, 0).astype(np.uint8)
+        #         human_part_label = (np.where(panoptic_seg_mask==id) * cat_gt).astype(np.uint8)
+        part_classes = np.unique(human_part_label)
+
+        exceed = False
+        for part_id in part_classes:
+            if part_id == 0:  # background
+                continue
+            total_part_num += 1
+
+            if total_part_num > 255:
+                print("total_part_num exceed, return current instance map: {}".format(total_part_num))
+                exceed = True
+                break
+            class_map[total_part_num] = part_id
+            instance_gt[np.where(human_part_label == part_id)] = total_part_num
+        if exceed:
+            break
+
+    # Make instance id continous.
+    ori_cur_labels = np.unique(instance_gt)
+    total_num_label = len(ori_cur_labels)
+    if instance_gt.max() + 1 != total_num_label:
+        for label in range(1, total_num_label):
+            instance_gt[instance_gt == ori_cur_labels[label]] = label
+
+    final_class_map = {}
+    for label in range(1, total_num_label):
+        if label >= 1:
+            final_class_map[label] = class_map[ori_cur_labels[label]]
+
+    return instance_gt, final_class_map
+
+
+def compute_confidence(im_name, feature_map, class_map,
+                       instance_label, output_dir,
+                       panoptic_seg_mask, seg_score_list):
+    """
+    """
+    conf_file = open(os.path.join(output_dir, os.path.splitext(im_name)[0] + '.txt'), 'w')
+
+    weighted_map = np.zeros_like(feature_map[:, :, 0])
+    for index, score in enumerate(seg_score_list):
+        weighted_map += (panoptic_seg_mask == index + 1) * score
+
+    for label in class_map.keys():
+        cls = class_map[label]
+        confidence = feature_map[:, :, cls].reshape(-1)[np.where(instance_label.reshape(-1) == label)]
+        confidence = (weighted_map * feature_map[:, :, cls].copy()).reshape(-1)[
+            np.where(instance_label.reshape(-1) == label)]
+
+        confidence = confidence.sum() / len(confidence)
+        conf_file.write('{} {}\n'.format(cls, confidence))
+
+    conf_file.close()
+
+
+def result_saving(fused_output, img_name, img_height, img_width, output_dir, mask_output_path, bbox_score, msrcnn_bbox):
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    global_root = os.path.join(output_dir, 'global_parsing')
+    instance_root = os.path.join(output_dir, 'instance_parsing')
+    tag_dir = os.path.join(output_dir, 'global_tag')
+
+    if not os.path.exists(global_root):
+        os.makedirs(global_root)
+    if not os.path.exists(instance_root):
+        os.makedirs(instance_root)
+    if not os.path.exists(tag_dir):
+        os.makedirs(tag_dir)
+
+    # For visualizing indexed png image.
+    palette = get_palette(256)
+
+    fused_output = cv2.resize(fused_output, dsize=(img_width, img_height), interpolation=cv2.INTER_LINEAR)
+    seg_pred = np.asarray(np.argmax(fused_output, axis=2), dtype=np.uint8)
+    masks = np.load(mask_output_path)
+    masks[np.where(seg_pred == 0)] = 0
+
+    panoptic_seg_mask = masks
+    seg_score_list = bbox_score
+
+    instance_pred, class_map = get_instance(seg_pred, panoptic_seg_mask)
+    refine(instance_pred, panoptic_seg_mask, seg_pred, class_map)
+
+    compute_confidence(img_name, fused_output, class_map, instance_pred, instance_root,
+                       panoptic_seg_mask, seg_score_list)
+
+    ins_seg_results = open(os.path.join(tag_dir, os.path.splitext(img_name)[0] + '.txt'), "a")
+    keep_human_id_list = list(np.unique(panoptic_seg_mask))
+    if 0 in keep_human_id_list:
+        keep_human_id_list.remove(0)
+    for i in keep_human_id_list:
+        ins_seg_results.write('{:.6f} {} {} {} {}\n'.format(seg_score_list[i - 1],
+                                                            int(msrcnn_bbox[i - 1][1]), int(msrcnn_bbox[i - 1][0]),
+                                                            int(msrcnn_bbox[i - 1][3]), int(msrcnn_bbox[i - 1][2])))
+    ins_seg_results.close()
+
+    output_im_global = PILImage.fromarray(seg_pred)
+    output_im_instance = PILImage.fromarray(instance_pred)
+    output_im_tag = PILImage.fromarray(panoptic_seg_mask)
+    output_im_global.putpalette(palette)
+    output_im_instance.putpalette(palette)
+    output_im_tag.putpalette(palette)
+
+    output_im_global.save(os.path.join(global_root, os.path.splitext(img_name)[0] + '.png'))
+    output_im_instance.save(os.path.join(instance_root, os.path.splitext(img_name)[0] + '.png'))
+    output_im_tag.save(os.path.join(tag_dir, os.path.splitext(img_name)[0] + '.png'))
+
+
+def multi_process(a, args):
+    img_name = a['im_name']
+    img_height = a['img_height']
+    img_width = a['img_width']
+    msrcnn_bbox = a['person_bbox']
+    bbox_score = a['person_bbox_score']
+
+    ######### loading outputs from gloabl and local models #########
+    global_output = np.load(os.path.join(args.global_output_dir, os.path.splitext(img_name)[0] + '.npy'))
+
+    msrcnn_output = patch2img_output(args.msrcnn_output_dir, img_name, img_height, img_width, msrcnn_bbox,
+                                     bbox_type='msrcnn', num_class=20)
+
+    gt_output = patch2img_output(args.gt_output_dir, img_name, img_height, img_width, msrcnn_bbox, bbox_type='msrcnn',
+                                 num_class=20)
+
+    #### global and local branch logits fusion #####
+#     fused_output = global_output + msrcnn_output + gt_output
+    fused_output = global_output + gt_output
+
+
+    mask_output_path = os.path.join(args.mask_output_dir, os.path.splitext(img_name)[0] + '_mask.npy')
+    result_saving(fused_output, img_name, img_height, img_width, args.save_dir, mask_output_path, bbox_score, msrcnn_bbox)
+    return
+
+
+def main(args):
+    json_file = open(args.test_json_path)
+    anno = json.load(json_file)['root']
+
+    results = joblib.Parallel(n_jobs=24, verbose=10, pre_dispatch="all")(
+        [joblib.delayed(multi_process)(a, args) for i, a in enumerate(anno)]
+    )
+
+
+def get_arguments():
+    parser = argparse.ArgumentParser(description="obtain final prediction by logits fusion")
+    parser.add_argument("--test_json_path", type=str, default='./data/CIHP/cascade_152_finetune/test.json')
+    parser.add_argument("--global_output_dir", type=str,
+                        default='./data/CIHP/global/global_result-cihp-resnet101/global_output')
+#     parser.add_argument("--msrcnn_output_dir", type=str,
+#                         default='./data/CIHP/cascade_152__finetune/msrcnn_result-cihp-resnet101/msrcnn_output')
+    parser.add_argument("--gt_output_dir", type=str,
+                        default='./data/CIHP/cascade_152__finetune/gt_result-cihp-resnet101/gt_output')
+    parser.add_argument("--mask_output_dir", type=str, default='./data/CIHP/cascade_152_finetune/mask')
+    parser.add_argument("--save_dir", type=str, default='./data/CIHP/fusion_results/cihp-msrcnn_finetune')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = get_arguments()
+    main(args)
diff --git a/preprocess/humanparsing/mhp_extension/make_crop_and_mask_w_mask_nms.py b/preprocess/humanparsing/mhp_extension/make_crop_and_mask_w_mask_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..1efc5ae86f81db7dcdae1d22db771d2a8e8d3ccf
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/make_crop_and_mask_w_mask_nms.py
@@ -0,0 +1,134 @@
+import numpy as np
+import cv2, torch
+import os
+import json
+import argparse
+import pycocotools.mask as mask_util
+from tqdm import tqdm
+
+
+def bbox_expand(img_height, img_width, bbox, exp_ratio):
+    x_min, y_min, x_max, y_max = bbox[:]
+    exp_x = (x_max - x_min) * ((exp_ratio - 1) / 2)
+    exp_y = (y_max - y_min) * ((exp_ratio - 1) / 2)
+    new_x_min = 0 if x_min - exp_x < 0 else np.round(x_min - exp_x)
+    new_y_min = 0 if y_min - exp_y < 0 else np.round(y_min - exp_y)
+    new_x_max = img_width - 1 if x_max + exp_x > img_width - 1 else np.round(x_max + exp_x)
+    new_y_max = img_height - 1 if y_max + exp_y > img_height - 1 else np.round(y_max + exp_y)
+    return int(new_x_min), int(new_y_min), int(new_x_max), int(new_y_max)
+
+
+def make_crop_and_mask(img_info, pred, file_list, crop_save_dir, mask_save_dir, args):
+    img_name = img_info['file_name']
+    img_id = img_info['id'] - 1  # img_info['id'] start form 1
+    img_w = img_info['width']
+    img_h = img_info['height']
+
+    img = cv2.imread(os.path.join(args.img_dir, img_name))
+
+    exp_bbox = []
+    ori_bbox = []
+    bbox_name_list = []
+    bbox_score_list = []
+    person_idx = 0
+
+    panoptic_seg = np.zeros((img_h, img_w), dtype=np.uint8)
+    assert len(pred[img_id]['instances']) > 0, 'image without instance prediction'
+
+    for instance in pred[img_id]['instances']:
+        score = instance['score']
+        if score < args.conf_thres:
+            break
+
+        mask = mask_util.decode(instance['segmentation'])
+        mask_area = mask.sum()
+
+        if mask_area == 0:  # if mask_area < img_w*img_h/1000:
+            continue
+
+        intersect = (mask > 0) & (panoptic_seg > 0)
+        intersect_area = intersect.sum()
+
+        if intersect_area * 1.0 / mask_area > args.overlap_threshold:  # todo add args
+            continue
+
+        if intersect_area > 0:
+            mask = mask & (panoptic_seg == 0)
+
+        person_idx += 1
+        panoptic_seg = np.where(mask == 0, panoptic_seg, person_idx)
+
+        bbox_score_list.append(score)
+
+        ins_bbox = instance['bbox']  # [x,y,w,h] format
+        x_min, y_min, box_w, box_h = ins_bbox
+        x_max, y_max = x_min + box_w, y_min + box_h
+        exp_x_min, exp_y_min, exp_x_max, exp_y_max = bbox_expand(img_h, img_w, [x_min, y_min, x_max, y_max],
+                                                                 args.exp_ratio)
+        crop_img = img[exp_y_min:exp_y_max + 1, exp_x_min:exp_x_max + 1, :]
+        exp_bbox.append([exp_x_min, exp_y_min, exp_x_max, exp_y_max])
+        ori_bbox.append([x_min, y_min, x_max, y_max])
+        bbox_name = os.path.splitext(img_name)[0] + '_' + str(person_idx) + '_msrcnn.jpg'
+        bbox_name_list.append(bbox_name)
+
+        cv2.imwrite(os.path.join(crop_save_dir, bbox_name), crop_img)
+
+    assert person_idx > 0, 'image without instance'
+    mask_name = os.path.splitext(img_name)[0] + '_mask.npy'
+    np.save(os.path.join(mask_save_dir, mask_name), panoptic_seg)
+
+    ############## json writing ##################
+    item = {}
+    item['dataset'] = 'CIHP'
+    item['im_name'] = img_name
+    item['img_height'] = img_h
+    item['img_width'] = img_w
+    item['center'] = [img_h / 2, img_w / 2]
+    item['person_num'] = person_idx
+    item['person_bbox'] = exp_bbox
+    item['real_person_bbox'] = ori_bbox
+    item['person_bbox_score'] = bbox_score_list
+    item['bbox_name'] = bbox_name_list
+    item['mask_name'] = mask_name
+    file_list.append(item)
+    json_file = {'root': file_list}
+    return json_file, file_list
+
+
+def get_arguments():
+    parser = argparse.ArgumentParser(description="crop person val/test demo for inference")
+    parser.add_argument("--exp_ratio", type=float, default=1.2)
+    parser.add_argument("--overlap_threshold", type=float, default=0.5)
+    parser.add_argument("--conf_thres", type=float, default=0.5)
+    parser.add_argument("--img_dir", type=str,
+                        default='/data03/v_xuyunqiu/data/instance-level_human_parsing/Testing/Images')
+    parser.add_argument("--save_dir", type=str,
+                        default='/data03/v_xuyunqiu/Projects/experiment_data/testing/resnest_200_TTA_mask_nms_all_data')
+    parser.add_argument("--img_list", type=str,
+                        default='/data03/v_xuyunqiu/Projects/pycococreator/annotations/CIHP_test.json')
+    parser.add_argument("--det_res", type=str,
+                        default='/data02/v_xuyunqiu/detectron2-ResNeSt/tools/output_cihp_inference_resnest/inference_TTA/instances_predictions.pth')
+    return parser.parse_args()
+
+
+def main(args):
+    img_info_list = json.load(open(args.img_list, encoding='UTF-8'))
+    pred = torch.load(args.det_res)
+
+    crop_save_dir = os.path.join(args.save_dir, 'crop_pic')
+    if not os.path.exists(crop_save_dir):
+        os.makedirs(crop_save_dir)
+    mask_save_dir = os.path.join(args.save_dir, 'crop_mask')
+    if not os.path.exists(mask_save_dir):
+        os.makedirs(mask_save_dir)
+
+    file_list = []
+    for img_info in tqdm(img_info_list['images']):
+        json_file, file_list = make_crop_and_mask(img_info, pred, file_list, crop_save_dir, mask_save_dir, args)
+        with open(os.path.join(args.save_dir, 'crop.json'), 'w') as f:
+            json.dump(json_file, f, indent=2)
+
+
+if __name__ == '__main__':
+    args = get_arguments()
+    main(args)
diff --git a/preprocess/humanparsing/mhp_extension/scripts/make_coco_style_annotation.sh b/preprocess/humanparsing/mhp_extension/scripts/make_coco_style_annotation.sh
new file mode 100644
index 0000000000000000000000000000000000000000..37a1e7d4944c318bc275a58dceeaf987bb6517dc
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/scripts/make_coco_style_annotation.sh
@@ -0,0 +1,14 @@
+python ./coco_style_annotation_creator/human_to_coco.py \
+    --dataset 'CIHP' \
+    --json_save_dir './data/CIHP/annotations' \
+    --train_img_dir './data/CIHP/Training/Images' \
+    --train_anno_dir './data/CIHP/Training/Human_ids' \
+    --val_img_dir './data/CIHP/Validation/Images' \
+    --val_anno_dir './data/CIHP/Validation/Human_ids'
+
+
+python ./coco_style_annotation_creator/test_human2coco_format.py \
+    --dataset 'CIHP' \
+    --json_save_dir './data/CIHP/annotations' \
+    --test_img_dir './data/CIHP/Testing/Images'
+
diff --git a/preprocess/humanparsing/mhp_extension/scripts/make_crop.sh b/preprocess/humanparsing/mhp_extension/scripts/make_crop.sh
new file mode 100644
index 0000000000000000000000000000000000000000..604a433c0494b1ddba9223cd95bf6de2b4b150b0
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/scripts/make_crop.sh
@@ -0,0 +1,6 @@
+python make_crop_and_mask_w_mask_nms.py \
+	--img_dir './data/CIHP/Testing/Images' \
+	--save_dir './data/CIHP/' \
+	--img_list './data/CIHP/annotations/CIHP_val.json' \
+	--det_res './data/CIHP/detectron2_prediction/inference/instances_predictions.pth'
+
diff --git a/preprocess/humanparsing/mhp_extension/scripts/parsing_fusion.sh b/preprocess/humanparsing/mhp_extension/scripts/parsing_fusion.sh
new file mode 100644
index 0000000000000000000000000000000000000000..107bcf6b0532a7f807c76cd706e48aab767a5da3
--- /dev/null
+++ b/preprocess/humanparsing/mhp_extension/scripts/parsing_fusion.sh
@@ -0,0 +1,6 @@
+python logits_fusion.py \
+--test_json_path ./data/CIHP/crop.json \
+--global_output_dir ./data/CIHP/global_pic_parsing \
+--msrcnn_output_dir ./data/CIHP/crop_pic_parsing \
+--gt_output_dir ./data/CIHP/crop_pic_parsing \
+--save_dir ./data/CIHP/mhp_fusion_parsing
diff --git a/preprocess/humanparsing/modules/__init__.py b/preprocess/humanparsing/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a098dee5911f3613d320d23db37bc401cf57fa4
--- /dev/null
+++ b/preprocess/humanparsing/modules/__init__.py
@@ -0,0 +1,5 @@
+from .bn import ABN, InPlaceABN, InPlaceABNSync
+from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
+from .misc import GlobalAvgPool2d, SingleGPU
+from .residual import IdentityResidualBlock
+from .dense import DenseModule
diff --git a/preprocess/humanparsing/modules/bn.py b/preprocess/humanparsing/modules/bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a794698867e89140a030d550d832e6fa12561c8b
--- /dev/null
+++ b/preprocess/humanparsing/modules/bn.py
@@ -0,0 +1,132 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as functional
+
+try:
+    from queue import Queue
+except ImportError:
+    from Queue import Queue
+
+from .functions import *
+
+
+class ABN(nn.Module):
+    """Activated Batch Normalization
+
+    This gathers a `BatchNorm2d` and an activation function in a single module
+    """
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
+        """Creates an Activated Batch Normalization module
+
+        Parameters
+        ----------
+        num_features : int
+            Number of feature channels in the input and output.
+        eps : float
+            Small constant to prevent numerical issues.
+        momentum : float
+            Momentum factor applied to compute running statistics as.
+        affine : bool
+            If `True` apply learned scale and shift transformation after normalization.
+        activation : str
+            Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
+        slope : float
+            Negative slope for the `leaky_relu` activation.
+        """
+        super(ABN, self).__init__()
+        self.num_features = num_features
+        self.affine = affine
+        self.eps = eps
+        self.momentum = momentum
+        self.activation = activation
+        self.slope = slope
+        if self.affine:
+            self.weight = nn.Parameter(torch.ones(num_features))
+            self.bias = nn.Parameter(torch.zeros(num_features))
+        else:
+            self.register_parameter('weight', None)
+            self.register_parameter('bias', None)
+        self.register_buffer('running_mean', torch.zeros(num_features))
+        self.register_buffer('running_var', torch.ones(num_features))
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.constant_(self.running_mean, 0)
+        nn.init.constant_(self.running_var, 1)
+        if self.affine:
+            nn.init.constant_(self.weight, 1)
+            nn.init.constant_(self.bias, 0)
+
+    def forward(self, x):
+        x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
+                                  self.training, self.momentum, self.eps)
+
+        if self.activation == ACT_RELU:
+            return functional.relu(x, inplace=True)
+        elif self.activation == ACT_LEAKY_RELU:
+            return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
+        elif self.activation == ACT_ELU:
+            return functional.elu(x, inplace=True)
+        else:
+            return x
+
+    def __repr__(self):
+        rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
+              ' affine={affine}, activation={activation}'
+        if self.activation == "leaky_relu":
+            rep += ', slope={slope})'
+        else:
+            rep += ')'
+        return rep.format(name=self.__class__.__name__, **self.__dict__)
+
+
+class InPlaceABN(ABN):
+    """InPlace Activated Batch Normalization"""
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
+        """Creates an InPlace Activated Batch Normalization module
+
+        Parameters
+        ----------
+        num_features : int
+            Number of feature channels in the input and output.
+        eps : float
+            Small constant to prevent numerical issues.
+        momentum : float
+            Momentum factor applied to compute running statistics as.
+        affine : bool
+            If `True` apply learned scale and shift transformation after normalization.
+        activation : str
+            Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
+        slope : float
+            Negative slope for the `leaky_relu` activation.
+        """
+        super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)
+
+    def forward(self, x):
+        x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
+                           self.training, self.momentum, self.eps, self.activation, self.slope)
+        return x
+
+
+class InPlaceABNSync(ABN):
+    """InPlace Activated Batch Normalization with cross-GPU synchronization
+    This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
+    """
+
+    def forward(self, x):
+        x, _, _ =  inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
+                                   self.training, self.momentum, self.eps, self.activation, self.slope)
+        return x
+
+    def __repr__(self):
+        rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
+              ' affine={affine}, activation={activation}'
+        if self.activation == "leaky_relu":
+            rep += ', slope={slope})'
+        else:
+            rep += ')'
+        return rep.format(name=self.__class__.__name__, **self.__dict__)
+
+
diff --git a/preprocess/humanparsing/modules/deeplab.py b/preprocess/humanparsing/modules/deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd25b78369b27ef02c183a0b17b9bf8354c5f7c3
--- /dev/null
+++ b/preprocess/humanparsing/modules/deeplab.py
@@ -0,0 +1,84 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as functional
+
+from models._util import try_index
+from .bn import ABN
+
+
+class DeeplabV3(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 hidden_channels=256,
+                 dilations=(12, 24, 36),
+                 norm_act=ABN,
+                 pooling_size=None):
+        super(DeeplabV3, self).__init__()
+        self.pooling_size = pooling_size
+
+        self.map_convs = nn.ModuleList([
+            nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
+            nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
+            nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
+            nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
+        ])
+        self.map_bn = norm_act(hidden_channels * 4)
+
+        self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
+        self.global_pooling_bn = norm_act(hidden_channels)
+
+        self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
+        self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
+        self.red_bn = norm_act(out_channels)
+
+        self.reset_parameters(self.map_bn.activation, self.map_bn.slope)
+
+    def reset_parameters(self, activation, slope):
+        gain = nn.init.calculate_gain(activation, slope)
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.xavier_normal_(m.weight.data, gain)
+                if hasattr(m, "bias") and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, ABN):
+                if hasattr(m, "weight") and m.weight is not None:
+                    nn.init.constant_(m.weight, 1)
+                if hasattr(m, "bias") and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        # Map convolutions
+        out = torch.cat([m(x) for m in self.map_convs], dim=1)
+        out = self.map_bn(out)
+        out = self.red_conv(out)
+
+        # Global pooling
+        pool = self._global_pooling(x)
+        pool = self.global_pooling_conv(pool)
+        pool = self.global_pooling_bn(pool)
+        pool = self.pool_red_conv(pool)
+        if self.training or self.pooling_size is None:
+            pool = pool.repeat(1, 1, x.size(2), x.size(3))
+
+        out += pool
+        out = self.red_bn(out)
+        return out
+
+    def _global_pooling(self, x):
+        if self.training or self.pooling_size is None:
+            pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
+            pool = pool.view(x.size(0), x.size(1), 1, 1)
+        else:
+            pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
+                            min(try_index(self.pooling_size, 1), x.shape[3]))
+            padding = (
+                (pooling_size[1] - 1) // 2,
+                (pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
+                (pooling_size[0] - 1) // 2,
+                (pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
+            )
+
+            pool = functional.avg_pool2d(x, pooling_size, stride=1)
+            pool = functional.pad(pool, pad=padding, mode="replicate")
+        return pool
diff --git a/preprocess/humanparsing/modules/dense.py b/preprocess/humanparsing/modules/dense.py
new file mode 100644
index 0000000000000000000000000000000000000000..9638d6e86d2ae838550fefa9002a984af52e6cc8
--- /dev/null
+++ b/preprocess/humanparsing/modules/dense.py
@@ -0,0 +1,42 @@
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+
+from .bn import ABN
+
+
+class DenseModule(nn.Module):
+    def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
+        super(DenseModule, self).__init__()
+        self.in_channels = in_channels
+        self.growth = growth
+        self.layers = layers
+
+        self.convs1 = nn.ModuleList()
+        self.convs3 = nn.ModuleList()
+        for i in range(self.layers):
+            self.convs1.append(nn.Sequential(OrderedDict([
+                ("bn", norm_act(in_channels)),
+                ("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
+            ])))
+            self.convs3.append(nn.Sequential(OrderedDict([
+                ("bn", norm_act(self.growth * bottleneck_factor)),
+                ("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
+                                   dilation=dilation))
+            ])))
+            in_channels += self.growth
+
+    @property
+    def out_channels(self):
+        return self.in_channels + self.growth * self.layers
+
+    def forward(self, x):
+        inputs = [x]
+        for i in range(self.layers):
+            x = torch.cat(inputs, dim=1)
+            x = self.convs1[i](x)
+            x = self.convs3[i](x)
+            inputs += [x]
+
+        return torch.cat(inputs, dim=1)
diff --git a/preprocess/humanparsing/modules/functions.py b/preprocess/humanparsing/modules/functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b2837260687dde56d4595b24aded5fddbc4bda8
--- /dev/null
+++ b/preprocess/humanparsing/modules/functions.py
@@ -0,0 +1,245 @@
+import pdb
+from os import path
+import torch
+import torch.distributed as dist
+import torch.autograd as autograd
+import torch.cuda.comm as comm
+from torch.autograd.function import once_differentiable
+from torch.utils.cpp_extension import load
+
+_src_path = path.join(path.dirname(path.abspath(__file__)), "src")
+_backend = load(name="inplace_abn",
+                extra_cflags=["-O3"],
+                sources=[path.join(_src_path, f) for f in [
+                    "inplace_abn.cpp",
+                    "inplace_abn_cpu.cpp",
+                    "inplace_abn_cuda.cu",
+                    "inplace_abn_cuda_half.cu"
+                ]],
+                extra_cuda_cflags=["--expt-extended-lambda"])
+
+# Activation names
+ACT_RELU = "relu"
+ACT_LEAKY_RELU = "leaky_relu"
+ACT_ELU = "elu"
+ACT_NONE = "none"
+
+
+def _check(fn, *args, **kwargs):
+    success = fn(*args, **kwargs)
+    if not success:
+        raise RuntimeError("CUDA Error encountered in {}".format(fn))
+
+
+def _broadcast_shape(x):
+    out_size = []
+    for i, s in enumerate(x.size()):
+        if i != 1:
+            out_size.append(1)
+        else:
+            out_size.append(s)
+    return out_size
+
+
+def _reduce(x):
+    if len(x.size()) == 2:
+        return x.sum(dim=0)
+    else:
+        n, c = x.size()[0:2]
+        return x.contiguous().view((n, c, -1)).sum(2).sum(0)
+
+
+def _count_samples(x):
+    count = 1
+    for i, s in enumerate(x.size()):
+        if i != 1:
+            count *= s
+    return count
+
+
+def _act_forward(ctx, x):
+    if ctx.activation == ACT_LEAKY_RELU:
+        _backend.leaky_relu_forward(x, ctx.slope)
+    elif ctx.activation == ACT_ELU:
+        _backend.elu_forward(x)
+    elif ctx.activation == ACT_NONE:
+        pass
+
+
+def _act_backward(ctx, x, dx):
+    if ctx.activation == ACT_LEAKY_RELU:
+        _backend.leaky_relu_backward(x, dx, ctx.slope)
+    elif ctx.activation == ACT_ELU:
+        _backend.elu_backward(x, dx)
+    elif ctx.activation == ACT_NONE:
+        pass
+
+
+class InPlaceABN(autograd.Function):
+    @staticmethod
+    def forward(ctx, x, weight, bias, running_mean, running_var,
+                training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
+        # Save context
+        ctx.training = training
+        ctx.momentum = momentum
+        ctx.eps = eps
+        ctx.activation = activation
+        ctx.slope = slope
+        ctx.affine = weight is not None and bias is not None
+
+        # Prepare inputs
+        count = _count_samples(x)
+        x = x.contiguous()
+        weight = weight.contiguous() if ctx.affine else x.new_empty(0)
+        bias = bias.contiguous() if ctx.affine else x.new_empty(0)
+
+        if ctx.training:
+            mean, var = _backend.mean_var(x)
+
+            # Update running stats
+            running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
+            running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))
+
+            # Mark in-place modified tensors
+            ctx.mark_dirty(x, running_mean, running_var)
+        else:
+            mean, var = running_mean.contiguous(), running_var.contiguous()
+            ctx.mark_dirty(x)
+
+        # BN forward + activation
+        _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
+        _act_forward(ctx, x)
+
+        # Output
+        ctx.var = var
+        ctx.save_for_backward(x, var, weight, bias)
+        ctx.mark_non_differentiable(running_mean, running_var)
+        return x, running_mean, running_var
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, dz, _drunning_mean, _drunning_var):
+        z, var, weight, bias = ctx.saved_tensors
+        dz = dz.contiguous()
+
+        # Undo activation
+        _act_backward(ctx, z, dz)
+
+        if ctx.training:
+            edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
+        else:
+            # TODO: implement simplified CUDA backward for inference mode
+            edz = dz.new_zeros(dz.size(1))
+            eydz = dz.new_zeros(dz.size(1))
+
+        dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
+        # dweight = eydz * weight.sign() if ctx.affine else None
+        dweight = eydz if ctx.affine else None
+        if dweight is not None:
+            dweight[weight < 0] *= -1
+        dbias = edz if ctx.affine else None
+
+        return dx, dweight, dbias, None, None, None, None, None, None, None
+
+
+class InPlaceABNSync(autograd.Function):
+    @classmethod
+    def forward(cls, ctx, x, weight, bias, running_mean, running_var,
+                training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
+        # Save context
+        ctx.training = training
+        ctx.momentum = momentum
+        ctx.eps = eps
+        ctx.activation = activation
+        ctx.slope = slope
+        ctx.affine = weight is not None and bias is not None
+
+        # Prepare inputs
+        ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1
+
+        # count = _count_samples(x)
+        batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)
+
+        x = x.contiguous()
+        weight = weight.contiguous() if ctx.affine else x.new_empty(0)
+        bias = bias.contiguous() if ctx.affine else x.new_empty(0)
+
+        if ctx.training:
+            mean, var = _backend.mean_var(x)
+            if ctx.world_size > 1:
+                # get global batch size
+                if equal_batches:
+                    batch_size *= ctx.world_size
+                else:
+                    dist.all_reduce(batch_size, dist.ReduceOp.SUM)
+
+                ctx.factor = x.shape[0] / float(batch_size.item())
+
+                mean_all = mean.clone() * ctx.factor
+                dist.all_reduce(mean_all, dist.ReduceOp.SUM)
+
+                var_all = (var + (mean - mean_all) ** 2) * ctx.factor
+                dist.all_reduce(var_all, dist.ReduceOp.SUM)
+
+                mean = mean_all
+                var = var_all
+
+            # Update running stats
+            running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
+            count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
+            running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))
+
+            # Mark in-place modified tensors
+            ctx.mark_dirty(x, running_mean, running_var)
+        else:
+            mean, var = running_mean.contiguous(), running_var.contiguous()
+            ctx.mark_dirty(x)
+
+        # BN forward + activation
+        _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
+        _act_forward(ctx, x)
+
+        # Output
+        ctx.var = var
+        ctx.save_for_backward(x, var, weight, bias)
+        ctx.mark_non_differentiable(running_mean, running_var)
+        return x, running_mean, running_var
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, dz, _drunning_mean, _drunning_var):
+        z, var, weight, bias = ctx.saved_tensors
+        dz = dz.contiguous()
+
+        # Undo activation
+        _act_backward(ctx, z, dz)
+
+        if ctx.training:
+            edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
+            edz_local = edz.clone()
+            eydz_local = eydz.clone()
+
+            if ctx.world_size > 1:
+                edz *= ctx.factor
+                dist.all_reduce(edz, dist.ReduceOp.SUM)
+
+                eydz *= ctx.factor
+                dist.all_reduce(eydz, dist.ReduceOp.SUM)
+        else:
+            edz_local = edz = dz.new_zeros(dz.size(1))
+            eydz_local = eydz = dz.new_zeros(dz.size(1))
+
+        dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
+        # dweight = eydz_local * weight.sign() if ctx.affine else None
+        dweight = eydz_local if ctx.affine else None
+        if dweight is not None:
+            dweight[weight < 0] *= -1
+        dbias = edz_local if ctx.affine else None
+
+        return dx, dweight, dbias, None, None, None, None, None, None, None
+
+
+inplace_abn = InPlaceABN.apply
+inplace_abn_sync = InPlaceABNSync.apply
+
+__all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
diff --git a/preprocess/humanparsing/modules/misc.py b/preprocess/humanparsing/modules/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c50b69b38c950801baacba8b3684ffd23aef08b
--- /dev/null
+++ b/preprocess/humanparsing/modules/misc.py
@@ -0,0 +1,21 @@
+import torch.nn as nn
+import torch
+import torch.distributed as dist
+
+class GlobalAvgPool2d(nn.Module):
+    def __init__(self):
+        """Global average pooling over the input's spatial dimensions"""
+        super(GlobalAvgPool2d, self).__init__()
+
+    def forward(self, inputs):
+        in_size = inputs.size()
+        return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)
+
+class SingleGPU(nn.Module):
+    def __init__(self, module):
+        super(SingleGPU, self).__init__()
+        self.module=module
+
+    def forward(self, input):
+        return self.module(input.cuda(non_blocking=True))
+
diff --git a/preprocess/humanparsing/modules/residual.py b/preprocess/humanparsing/modules/residual.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a5c90e0606a451ff690f67a2feac28476241d86
--- /dev/null
+++ b/preprocess/humanparsing/modules/residual.py
@@ -0,0 +1,182 @@
+from collections import OrderedDict
+
+import torch.nn as nn
+
+from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
+import torch.nn.functional as functional
+
+
+class ResidualBlock(nn.Module):
+    """Configurable residual block
+
+    Parameters
+    ----------
+    in_channels : int
+        Number of input channels.
+    channels : list of int
+        Number of channels in the internal feature maps. Can either have two or three elements: if three construct
+        a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
+        `3 x 3` then `1 x 1` convolutions.
+    stride : int
+        Stride of the first `3 x 3` convolution
+    dilation : int
+        Dilation to apply to the `3 x 3` convolutions.
+    groups : int
+        Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
+        bottleneck blocks.
+    norm_act : callable
+        Function to create normalization / activation Module.
+    dropout: callable
+        Function to create Dropout Module.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 channels,
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 norm_act=ABN,
+                 dropout=None):
+        super(ResidualBlock, self).__init__()
+
+        # Check parameters for inconsistencies
+        if len(channels) != 2 and len(channels) != 3:
+            raise ValueError("channels must contain either two or three values")
+        if len(channels) == 2 and groups != 1:
+            raise ValueError("groups > 1 are only valid if len(channels) == 3")
+
+        is_bottleneck = len(channels) == 3
+        need_proj_conv = stride != 1 or in_channels != channels[-1]
+
+        if not is_bottleneck:
+            bn2 = norm_act(channels[1])
+            bn2.activation = ACT_NONE
+            layers = [
+                ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
+                                    dilation=dilation)),
+                ("bn1", norm_act(channels[0])),
+                ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
+                                    dilation=dilation)),
+                ("bn2", bn2)
+            ]
+            if dropout is not None:
+                layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
+        else:
+            bn3 = norm_act(channels[2])
+            bn3.activation = ACT_NONE
+            layers = [
+                ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
+                ("bn1", norm_act(channels[0])),
+                ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
+                                    groups=groups, dilation=dilation)),
+                ("bn2", norm_act(channels[1])),
+                ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
+                ("bn3", bn3)
+            ]
+            if dropout is not None:
+                layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
+        self.convs = nn.Sequential(OrderedDict(layers))
+
+        if need_proj_conv:
+            self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
+            self.proj_bn = norm_act(channels[-1])
+            self.proj_bn.activation = ACT_NONE
+
+    def forward(self, x):
+        if hasattr(self, "proj_conv"):
+            residual = self.proj_conv(x)
+            residual = self.proj_bn(residual)
+        else:
+            residual = x
+        x = self.convs(x) + residual
+
+        if self.convs.bn1.activation == ACT_LEAKY_RELU:
+            return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
+        elif self.convs.bn1.activation == ACT_ELU:
+            return functional.elu(x, inplace=True)
+        else:
+            return x
+
+
+class IdentityResidualBlock(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 channels,
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 norm_act=ABN,
+                 dropout=None):
+        """Configurable identity-mapping residual block
+
+        Parameters
+        ----------
+        in_channels : int
+            Number of input channels.
+        channels : list of int
+            Number of channels in the internal feature maps. Can either have two or three elements: if three construct
+            a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
+            `3 x 3` then `1 x 1` convolutions.
+        stride : int
+            Stride of the first `3 x 3` convolution
+        dilation : int
+            Dilation to apply to the `3 x 3` convolutions.
+        groups : int
+            Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
+            bottleneck blocks.
+        norm_act : callable
+            Function to create normalization / activation Module.
+        dropout: callable
+            Function to create Dropout Module.
+        """
+        super(IdentityResidualBlock, self).__init__()
+
+        # Check parameters for inconsistencies
+        if len(channels) != 2 and len(channels) != 3:
+            raise ValueError("channels must contain either two or three values")
+        if len(channels) == 2 and groups != 1:
+            raise ValueError("groups > 1 are only valid if len(channels) == 3")
+
+        is_bottleneck = len(channels) == 3
+        need_proj_conv = stride != 1 or in_channels != channels[-1]
+
+        self.bn1 = norm_act(in_channels)
+        if not is_bottleneck:
+            layers = [
+                ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
+                                    dilation=dilation)),
+                ("bn2", norm_act(channels[0])),
+                ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
+                                    dilation=dilation))
+            ]
+            if dropout is not None:
+                layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
+        else:
+            layers = [
+                ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
+                ("bn2", norm_act(channels[0])),
+                ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
+                                    groups=groups, dilation=dilation)),
+                ("bn3", norm_act(channels[1])),
+                ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
+            ]
+            if dropout is not None:
+                layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
+        self.convs = nn.Sequential(OrderedDict(layers))
+
+        if need_proj_conv:
+            self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
+
+    def forward(self, x):
+        if hasattr(self, "proj_conv"):
+            bn1 = self.bn1(x)
+            shortcut = self.proj_conv(bn1)
+        else:
+            shortcut = x.clone()
+            bn1 = self.bn1(x)
+
+        out = self.convs(bn1)
+        out.add_(shortcut)
+
+        return out
diff --git a/preprocess/humanparsing/modules/src/checks.h b/preprocess/humanparsing/modules/src/checks.h
new file mode 100644
index 0000000000000000000000000000000000000000..e761a6fe34d0789815b588eba7e3726026e0e868
--- /dev/null
+++ b/preprocess/humanparsing/modules/src/checks.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <ATen/ATen.h>
+
+// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
+#ifndef AT_CHECK
+#define AT_CHECK AT_ASSERT
+#endif
+
+#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
+#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
+
+#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
\ No newline at end of file
diff --git a/preprocess/humanparsing/modules/src/inplace_abn.cpp b/preprocess/humanparsing/modules/src/inplace_abn.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a6b1128cc20cbfc476134154e23e5869a92b856
--- /dev/null
+++ b/preprocess/humanparsing/modules/src/inplace_abn.cpp
@@ -0,0 +1,95 @@
+#include <torch/extension.h>
+
+#include <vector>
+
+#include "inplace_abn.h"
+
+std::vector<at::Tensor> mean_var(at::Tensor x) {
+  if (x.is_cuda()) {
+    if (x.type().scalarType() == at::ScalarType::Half) {
+      return mean_var_cuda_h(x);
+    } else {
+      return mean_var_cuda(x);
+    }
+  } else {
+    return mean_var_cpu(x);
+  }
+}
+
+at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                   bool affine, float eps) {
+  if (x.is_cuda()) {
+    if (x.type().scalarType() == at::ScalarType::Half) {
+      return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
+    } else {
+      return forward_cuda(x, mean, var, weight, bias, affine, eps);
+    }
+  } else {
+    return forward_cpu(x, mean, var, weight, bias, affine, eps);
+  }
+}
+
+std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
+                                 bool affine, float eps) {
+  if (z.is_cuda()) {
+    if (z.type().scalarType() == at::ScalarType::Half) {
+      return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
+    } else {
+      return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
+	}
+  } else {
+    return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
+  }
+}
+
+at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                                 at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
+  if (z.is_cuda()) {
+    if (z.type().scalarType() == at::ScalarType::Half) {
+      return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
+	} else {
+      return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
+    }
+  } else {
+    return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
+  }
+}
+
+void leaky_relu_forward(at::Tensor z, float slope) {
+  at::leaky_relu_(z, slope);
+}
+
+void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
+  if (z.is_cuda()) {
+    if (z.type().scalarType() == at::ScalarType::Half) {
+      return leaky_relu_backward_cuda_h(z, dz, slope);
+	} else {
+      return leaky_relu_backward_cuda(z, dz, slope);
+    }
+  } else {
+    return leaky_relu_backward_cpu(z, dz, slope);
+  }
+}
+
+void elu_forward(at::Tensor z) {
+  at::elu_(z);
+}
+
+void elu_backward(at::Tensor z, at::Tensor dz) {
+  if (z.is_cuda()) {
+    return elu_backward_cuda(z, dz);
+  } else {
+    return elu_backward_cpu(z, dz);
+  }
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("mean_var", &mean_var, "Mean and variance computation");
+  m.def("forward", &forward, "In-place forward computation");
+  m.def("edz_eydz", &edz_eydz, "First part of backward computation");
+  m.def("backward", &backward, "Second part of backward computation");
+  m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
+  m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
+  m.def("elu_forward", &elu_forward, "Elu forward computation");
+  m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
+}
diff --git a/preprocess/humanparsing/modules/src/inplace_abn.h b/preprocess/humanparsing/modules/src/inplace_abn.h
new file mode 100644
index 0000000000000000000000000000000000000000..17afd1196449ecb6376f28961e54b55e1537492f
--- /dev/null
+++ b/preprocess/humanparsing/modules/src/inplace_abn.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include <ATen/ATen.h>
+
+#include <vector>
+
+std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
+std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
+std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
+
+at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                       bool affine, float eps);
+at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                        bool affine, float eps);
+at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                          bool affine, float eps);
+
+std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
+                                     bool affine, float eps);
+std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
+                                      bool affine, float eps);
+std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
+                                        bool affine, float eps);
+
+at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                                     at::Tensor edz, at::Tensor eydz, bool affine, float eps);
+at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                                      at::Tensor edz, at::Tensor eydz, bool affine, float eps);
+at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                                        at::Tensor edz, at::Tensor eydz, bool affine, float eps);
+
+void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
+void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
+void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
+
+void elu_backward_cpu(at::Tensor z, at::Tensor dz);
+void elu_backward_cuda(at::Tensor z, at::Tensor dz);
+
+static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
+  num = x.size(0);
+  chn = x.size(1);
+  sp = 1;
+  for (int64_t i = 2; i < x.ndimension(); ++i)
+    sp *= x.size(i);
+}
+
+/*
+ * Specialized CUDA reduction functions for BN
+ */
+#ifdef __CUDACC__
+
+#include "utils/cuda.cuh"
+
+template <typename T, typename Op>
+__device__ T reduce(Op op, int plane, int N, int S) {
+  T sum = (T)0;
+  for (int batch = 0; batch < N; ++batch) {
+    for (int x = threadIdx.x; x < S; x += blockDim.x) {
+      sum += op(batch, plane, x);
+    }
+  }
+
+  // sum over NumThreads within a warp
+  sum = warpSum(sum);
+
+  // 'transpose', and reduce within warp again
+  __shared__ T shared[32];
+  __syncthreads();
+  if (threadIdx.x % WARP_SIZE == 0) {
+    shared[threadIdx.x / WARP_SIZE] = sum;
+  }
+  if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
+    // zero out the other entries in shared
+    shared[threadIdx.x] = (T)0;
+  }
+  __syncthreads();
+  if (threadIdx.x / WARP_SIZE == 0) {
+    sum = warpSum(shared[threadIdx.x]);
+    if (threadIdx.x == 0) {
+      shared[0] = sum;
+    }
+  }
+  __syncthreads();
+
+  // Everyone picks it up, should be broadcast into the whole gradInput
+  return shared[0];
+}
+#endif
diff --git a/preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp b/preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ffc6d38c52ea31661b8dd438dc3fe1958f50b61e
--- /dev/null
+++ b/preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp
@@ -0,0 +1,119 @@
+#include <ATen/ATen.h>
+
+#include <vector>
+
+#include "utils/checks.h"
+#include "inplace_abn.h"
+
+at::Tensor reduce_sum(at::Tensor x) {
+  if (x.ndimension() == 2) {
+    return x.sum(0);
+  } else {
+    auto x_view = x.view({x.size(0), x.size(1), -1});
+    return x_view.sum(-1).sum(0);
+  }
+}
+
+at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
+  if (x.ndimension() == 2) {
+    return v;
+  } else {
+    std::vector<int64_t> broadcast_size = {1, -1};
+    for (int64_t i = 2; i < x.ndimension(); ++i)
+      broadcast_size.push_back(1);
+
+    return v.view(broadcast_size);
+  }
+}
+
+int64_t count(at::Tensor x) {
+  int64_t count = x.size(0);
+  for (int64_t i = 2; i < x.ndimension(); ++i)
+    count *= x.size(i);
+
+  return count;
+}
+
+at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
+  if (affine) {
+    return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
+  } else {
+    return z;
+  }
+}
+
+std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
+  auto num = count(x);
+  auto mean = reduce_sum(x) / num;
+  auto diff = x - broadcast_to(mean, x);
+  auto var = reduce_sum(diff.pow(2)) / num;
+
+  return {mean, var};
+}
+
+at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                       bool affine, float eps) {
+  auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
+  auto mul = at::rsqrt(var + eps) * gamma;
+
+  x.sub_(broadcast_to(mean, x));
+  x.mul_(broadcast_to(mul, x));
+  if (affine) x.add_(broadcast_to(bias, x));
+
+  return x;
+}
+
+std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
+                                     bool affine, float eps) {
+  auto edz = reduce_sum(dz);
+  auto y = invert_affine(z, weight, bias, affine, eps);
+  auto eydz = reduce_sum(y * dz);
+
+  return {edz, eydz};
+}
+
+at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                                     at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
+  auto y = invert_affine(z, weight, bias, affine, eps);
+  auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);
+
+  auto num = count(z);
+  auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
+  return dx;
+}
+
+void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
+  CHECK_CPU_INPUT(z);
+  CHECK_CPU_INPUT(dz);
+
+  AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
+    int64_t count = z.numel();
+    auto *_z = z.data<scalar_t>();
+    auto *_dz = dz.data<scalar_t>();
+
+    for (int64_t i = 0; i < count; ++i) {
+      if (_z[i] < 0) {
+        _z[i] *= 1 / slope;
+        _dz[i] *= slope;
+      }
+    }
+  }));
+}
+
+void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
+  CHECK_CPU_INPUT(z);
+  CHECK_CPU_INPUT(dz);
+
+  AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
+    int64_t count = z.numel();
+    auto *_z = z.data<scalar_t>();
+    auto *_dz = dz.data<scalar_t>();
+
+    for (int64_t i = 0; i < count; ++i) {
+      if (_z[i] < 0) {
+        _z[i] = log1p(_z[i]);
+        _dz[i] *= (_z[i] + 1.f);
+      }
+    }
+  }));
+}
diff --git a/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu b/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b157b06d47173d1645c6a40c89f564b737e84d43
--- /dev/null
+++ b/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu
@@ -0,0 +1,333 @@
+#include <ATen/ATen.h>
+
+#include <thrust/device_ptr.h>
+#include <thrust/transform.h>
+
+#include <vector>
+
+#include "utils/checks.h"
+#include "utils/cuda.cuh"
+#include "inplace_abn.h"
+
+#include <ATen/cuda/CUDAContext.h>
+
+// Operations for reduce
+template<typename T>
+struct SumOp {
+  __device__ SumOp(const T *t, int c, int s)
+      : tensor(t), chn(c), sp(s) {}
+  __device__ __forceinline__ T operator()(int batch, int plane, int n) {
+    return tensor[(batch * chn + plane) * sp + n];
+  }
+  const T *tensor;
+  const int chn;
+  const int sp;
+};
+
+template<typename T>
+struct VarOp {
+  __device__ VarOp(T m, const T *t, int c, int s)
+      : mean(m), tensor(t), chn(c), sp(s) {}
+  __device__ __forceinline__ T operator()(int batch, int plane, int n) {
+    T val = tensor[(batch * chn + plane) * sp + n];
+    return (val - mean) * (val - mean);
+  }
+  const T mean;
+  const T *tensor;
+  const int chn;
+  const int sp;
+};
+
+template<typename T>
+struct GradOp {
+  __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
+      : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
+  __device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
+    T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
+    T _dz = dz[(batch * chn + plane) * sp + n];
+    return Pair<T>(_dz, _y * _dz);
+  }
+  const T weight;
+  const T bias;
+  const T *z;
+  const T *dz;
+  const int chn;
+  const int sp;
+};
+
+/***********
+ * mean_var
+ ***********/
+
+template<typename T>
+__global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
+  int plane = blockIdx.x;
+  T norm = T(1) / T(num * sp);
+
+  T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
+  __syncthreads();
+  T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
+
+  if (threadIdx.x == 0) {
+    mean[plane] = _mean;
+    var[plane] = _var;
+  }
+}
+
+std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
+  CHECK_CUDA_INPUT(x);
+
+  // Extract dimensions
+  int64_t num, chn, sp;
+  get_dims(x, num, chn, sp);
+
+  // Prepare output tensors
+  auto mean = at::empty({chn}, x.options());
+  auto var = at::empty({chn}, x.options());
+
+  // Run kernel
+  dim3 blocks(chn);
+  dim3 threads(getNumThreads(sp));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
+    mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        x.data<scalar_t>(),
+        mean.data<scalar_t>(),
+        var.data<scalar_t>(),
+        num, chn, sp);
+  }));
+
+  return {mean, var};
+}
+
+/**********
+ * forward
+ **********/
+
+template<typename T>
+__global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
+                               bool affine, float eps, int num, int chn, int sp) {
+  int plane = blockIdx.x;
+
+  T _mean = mean[plane];
+  T _var = var[plane];
+  T _weight = affine ? abs(weight[plane]) + eps : T(1);
+  T _bias = affine ? bias[plane] : T(0);
+
+  T mul = rsqrt(_var + eps) * _weight;
+
+  for (int batch = 0; batch < num; ++batch) {
+    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
+      T _x = x[(batch * chn + plane) * sp + n];
+      T _y = (_x - _mean) * mul + _bias;
+
+      x[(batch * chn + plane) * sp + n] = _y;
+    }
+  }
+}
+
+at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                        bool affine, float eps) {
+  CHECK_CUDA_INPUT(x);
+  CHECK_CUDA_INPUT(mean);
+  CHECK_CUDA_INPUT(var);
+  CHECK_CUDA_INPUT(weight);
+  CHECK_CUDA_INPUT(bias);
+
+  // Extract dimensions
+  int64_t num, chn, sp;
+  get_dims(x, num, chn, sp);
+
+  // Run kernel
+  dim3 blocks(chn);
+  dim3 threads(getNumThreads(sp));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
+    forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        x.data<scalar_t>(),
+        mean.data<scalar_t>(),
+        var.data<scalar_t>(),
+        weight.data<scalar_t>(),
+        bias.data<scalar_t>(),
+        affine, eps, num, chn, sp);
+  }));
+
+  return x;
+}
+
+/***********
+ * edz_eydz
+ ***********/
+
+template<typename T>
+__global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
+                                T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
+  int plane = blockIdx.x;
+
+  T _weight = affine ? abs(weight[plane]) + eps : 1.f;
+  T _bias = affine ? bias[plane] : 0.f;
+
+  Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    edz[plane] = res.v1;
+    eydz[plane] = res.v2;
+  }
+}
+
+std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
+                                      bool affine, float eps) {
+  CHECK_CUDA_INPUT(z);
+  CHECK_CUDA_INPUT(dz);
+  CHECK_CUDA_INPUT(weight);
+  CHECK_CUDA_INPUT(bias);
+
+  // Extract dimensions
+  int64_t num, chn, sp;
+  get_dims(z, num, chn, sp);
+
+  auto edz = at::empty({chn}, z.options());
+  auto eydz = at::empty({chn}, z.options());
+
+  // Run kernel
+  dim3 blocks(chn);
+  dim3 threads(getNumThreads(sp));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
+    edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        z.data<scalar_t>(),
+        dz.data<scalar_t>(),
+        weight.data<scalar_t>(),
+        bias.data<scalar_t>(),
+        edz.data<scalar_t>(),
+        eydz.data<scalar_t>(),
+        affine, eps, num, chn, sp);
+  }));
+
+  return {edz, eydz};
+}
+
+/***********
+ * backward
+ ***********/
+
+template<typename T>
+__global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
+	                        const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
+  int plane = blockIdx.x;
+
+  T _weight = affine ? abs(weight[plane]) + eps : 1.f;
+  T _bias = affine ? bias[plane] : 0.f;
+  T _var = var[plane];
+  T _edz = edz[plane];
+  T _eydz = eydz[plane];
+
+  T _mul = _weight * rsqrt(_var + eps);
+  T count = T(num * sp);
+
+  for (int batch = 0; batch < num; ++batch) {
+    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
+      T _dz = dz[(batch * chn + plane) * sp + n];
+      T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
+
+      dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
+    }
+  }
+}
+
+at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                                      at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
+  CHECK_CUDA_INPUT(z);
+  CHECK_CUDA_INPUT(dz);
+  CHECK_CUDA_INPUT(var);
+  CHECK_CUDA_INPUT(weight);
+  CHECK_CUDA_INPUT(bias);
+  CHECK_CUDA_INPUT(edz);
+  CHECK_CUDA_INPUT(eydz);
+
+  // Extract dimensions
+  int64_t num, chn, sp;
+  get_dims(z, num, chn, sp);
+
+  auto dx = at::zeros_like(z);
+
+  // Run kernel
+  dim3 blocks(chn);
+  dim3 threads(getNumThreads(sp));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
+    backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        z.data<scalar_t>(),
+        dz.data<scalar_t>(),
+        var.data<scalar_t>(),
+        weight.data<scalar_t>(),
+        bias.data<scalar_t>(),
+        edz.data<scalar_t>(),
+        eydz.data<scalar_t>(),
+        dx.data<scalar_t>(),
+        affine, eps, num, chn, sp);
+  }));
+
+  return dx;
+}
+
+/**************
+ * activations
+ **************/
+
+template<typename T>
+inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
+  // Create thrust pointers
+  thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
+  thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  thrust::transform_if(thrust::cuda::par.on(stream),
+                       th_dz, th_dz + count, th_z, th_dz,
+                       [slope] __device__ (const T& dz) { return dz * slope; },
+                       [] __device__ (const T& z) { return z < 0; });
+  thrust::transform_if(thrust::cuda::par.on(stream),
+                       th_z, th_z + count, th_z,
+                       [slope] __device__ (const T& z) { return z / slope; },
+                       [] __device__ (const T& z) { return z < 0; });
+}
+
+void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
+  CHECK_CUDA_INPUT(z);
+  CHECK_CUDA_INPUT(dz);
+
+  int64_t count = z.numel();
+
+  AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
+    leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
+  }));
+}
+
+template<typename T>
+inline void elu_backward_impl(T *z, T *dz, int64_t count) {
+  // Create thrust pointers
+  thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
+  thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  thrust::transform_if(thrust::cuda::par.on(stream),
+                       th_dz, th_dz + count, th_z, th_z, th_dz,
+                       [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
+                       [] __device__ (const T& z) { return z < 0; });
+  thrust::transform_if(thrust::cuda::par.on(stream),
+                       th_z, th_z + count, th_z,
+                       [] __device__ (const T& z) { return log1p(z); },
+                       [] __device__ (const T& z) { return z < 0; });
+}
+
+void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
+  CHECK_CUDA_INPUT(z);
+  CHECK_CUDA_INPUT(dz);
+
+  int64_t count = z.numel();
+
+  AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
+    elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
+  }));
+}
diff --git a/preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu b/preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bb63e73f9d90179e5bd5dae5579c4844da9c25e2
--- /dev/null
+++ b/preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu
@@ -0,0 +1,275 @@
+#include <ATen/ATen.h>
+
+#include <cuda_fp16.h>
+
+#include <vector>
+
+#include "utils/checks.h"
+#include "utils/cuda.cuh"
+#include "inplace_abn.h"
+
+#include <ATen/cuda/CUDAContext.h>
+
+// Operations for reduce
+struct SumOpH {
+  __device__ SumOpH(const half *t, int c, int s)
+      : tensor(t), chn(c), sp(s) {}
+  __device__ __forceinline__ float operator()(int batch, int plane, int n) {
+    return __half2float(tensor[(batch * chn + plane) * sp + n]);
+  }
+  const half *tensor;
+  const int chn;
+  const int sp;
+};
+
+struct VarOpH {
+  __device__ VarOpH(float m, const half *t, int c, int s)
+      : mean(m), tensor(t), chn(c), sp(s) {}
+  __device__ __forceinline__ float operator()(int batch, int plane, int n) {
+    const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
+    return (t - mean) * (t - mean);
+  }
+  const float mean;
+  const half *tensor;
+  const int chn;
+  const int sp;
+};
+
+struct GradOpH {
+  __device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
+      : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
+  __device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
+    float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
+    float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
+    return Pair<float>(_dz, _y * _dz);
+  }
+  const float weight;
+  const float bias;
+  const half *z;
+  const half *dz;
+  const int chn;
+  const int sp;
+};
+
+/***********
+ * mean_var
+ ***********/
+
+__global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
+  int plane = blockIdx.x;
+  float norm = 1.f / static_cast<float>(num * sp);
+
+  float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
+  __syncthreads();
+  float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
+
+  if (threadIdx.x == 0) {
+    mean[plane] = _mean;
+    var[plane] = _var;
+  }
+}
+
+std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
+  CHECK_CUDA_INPUT(x);
+
+  // Extract dimensions
+  int64_t num, chn, sp;
+  get_dims(x, num, chn, sp);
+
+  // Prepare output tensors
+  auto mean = at::empty({chn},x.options().dtype(at::kFloat));
+  auto var = at::empty({chn},x.options().dtype(at::kFloat));
+
+  // Run kernel
+  dim3 blocks(chn);
+  dim3 threads(getNumThreads(sp));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
+      reinterpret_cast<half*>(x.data<at::Half>()),
+      mean.data<float>(),
+      var.data<float>(),
+      num, chn, sp);
+
+  return {mean, var};
+}
+
+/**********
+ * forward
+ **********/
+
+__global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
+                                 bool affine, float eps, int num, int chn, int sp) {
+  int plane = blockIdx.x;
+
+  const float _mean = mean[plane];
+  const float _var = var[plane];
+  const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
+  const float _bias = affine ? bias[plane] : 0.f;
+
+  const float mul = rsqrt(_var + eps) * _weight;
+
+  for (int batch = 0; batch < num; ++batch) {
+    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
+      half *x_ptr = x + (batch * chn + plane) * sp + n;
+      float _x = __half2float(*x_ptr);
+      float _y = (_x - _mean) * mul + _bias;
+
+      *x_ptr = __float2half(_y);
+    }
+  }
+}
+
+at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                        bool affine, float eps) {
+  CHECK_CUDA_INPUT(x);
+  CHECK_CUDA_INPUT(mean);
+  CHECK_CUDA_INPUT(var);
+  CHECK_CUDA_INPUT(weight);
+  CHECK_CUDA_INPUT(bias);
+
+  // Extract dimensions
+  int64_t num, chn, sp;
+  get_dims(x, num, chn, sp);
+
+  // Run kernel
+  dim3 blocks(chn);
+  dim3 threads(getNumThreads(sp));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  forward_kernel_h<<<blocks, threads, 0, stream>>>(
+      reinterpret_cast<half*>(x.data<at::Half>()),
+      mean.data<float>(),
+      var.data<float>(),
+      weight.data<float>(),
+      bias.data<float>(),
+      affine, eps, num, chn, sp);
+
+  return x;
+}
+
+__global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
+                                float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
+  int plane = blockIdx.x;
+
+  float _weight = affine ? abs(weight[plane]) + eps : 1.f;
+  float _bias = affine ? bias[plane] : 0.f;
+
+  Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    edz[plane] = res.v1;
+    eydz[plane] = res.v2;
+  }
+}
+
+std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
+                                      bool affine, float eps) {
+  CHECK_CUDA_INPUT(z);
+  CHECK_CUDA_INPUT(dz);
+  CHECK_CUDA_INPUT(weight);
+  CHECK_CUDA_INPUT(bias);
+
+  // Extract dimensions
+  int64_t num, chn, sp;
+  get_dims(z, num, chn, sp);
+
+  auto edz = at::empty({chn},z.options().dtype(at::kFloat));
+  auto eydz = at::empty({chn},z.options().dtype(at::kFloat));
+
+  // Run kernel
+  dim3 blocks(chn);
+  dim3 threads(getNumThreads(sp));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
+        reinterpret_cast<half*>(z.data<at::Half>()),
+        reinterpret_cast<half*>(dz.data<at::Half>()),
+        weight.data<float>(),
+        bias.data<float>(),
+        edz.data<float>(),
+        eydz.data<float>(),
+        affine, eps, num, chn, sp);
+ 
+  return {edz, eydz};
+}
+
+__global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
+                                  const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
+  int plane = blockIdx.x;
+
+  float _weight = affine ? abs(weight[plane]) + eps : 1.f;
+  float _bias = affine ? bias[plane] : 0.f;
+  float _var = var[plane];
+  float _edz = edz[plane];
+  float _eydz = eydz[plane];
+
+  float _mul = _weight * rsqrt(_var + eps);
+  float count = float(num * sp);
+
+  for (int batch = 0; batch < num; ++batch) {
+    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
+      float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
+      float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;
+
+      dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
+    }
+  }
+}
+
+at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
+                                      at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
+  CHECK_CUDA_INPUT(z);
+  CHECK_CUDA_INPUT(dz);
+  CHECK_CUDA_INPUT(var);
+  CHECK_CUDA_INPUT(weight);
+  CHECK_CUDA_INPUT(bias);
+  CHECK_CUDA_INPUT(edz);
+  CHECK_CUDA_INPUT(eydz);
+
+  // Extract dimensions
+  int64_t num, chn, sp;
+  get_dims(z, num, chn, sp);
+
+  auto dx = at::zeros_like(z);
+
+  // Run kernel
+  dim3 blocks(chn);
+  dim3 threads(getNumThreads(sp));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  backward_kernel_h<<<blocks, threads, 0, stream>>>(
+        reinterpret_cast<half*>(z.data<at::Half>()),
+        reinterpret_cast<half*>(dz.data<at::Half>()),
+        var.data<float>(),
+        weight.data<float>(),
+        bias.data<float>(),
+        edz.data<float>(),
+        eydz.data<float>(),
+        reinterpret_cast<half*>(dx.data<at::Half>()),
+        affine, eps, num, chn, sp);
+
+  return dx;
+}
+
+__global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count;  i += blockDim.x * gridDim.x){
+    float _z = __half2float(z[i]);
+    if (_z < 0) {
+      dz[i] = __float2half(__half2float(dz[i]) * slope);
+      z[i] = __float2half(_z / slope);
+    }
+  }
+}
+
+void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
+  CHECK_CUDA_INPUT(z);
+  CHECK_CUDA_INPUT(dz);
+
+  int64_t count = z.numel();
+  dim3 threads(getNumThreads(count));
+  dim3 blocks = (count + threads.x - 1) / threads.x;
+  auto stream = at::cuda::getCurrentCUDAStream();
+  leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
+      reinterpret_cast<half*>(z.data<at::Half>()),
+      reinterpret_cast<half*>(dz.data<at::Half>()),
+      slope, count);
+}
+
diff --git a/preprocess/humanparsing/modules/src/utils/checks.h b/preprocess/humanparsing/modules/src/utils/checks.h
new file mode 100644
index 0000000000000000000000000000000000000000..e761a6fe34d0789815b588eba7e3726026e0e868
--- /dev/null
+++ b/preprocess/humanparsing/modules/src/utils/checks.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <ATen/ATen.h>
+
+// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
+#ifndef AT_CHECK
+#define AT_CHECK AT_ASSERT
+#endif
+
+#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
+#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
+
+#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
\ No newline at end of file
diff --git a/preprocess/humanparsing/modules/src/utils/common.h b/preprocess/humanparsing/modules/src/utils/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8403eef8a233b75dd4bb353c16486fe1be2039a
--- /dev/null
+++ b/preprocess/humanparsing/modules/src/utils/common.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <ATen/ATen.h>
+
+/*
+ * Functions to share code between CPU and GPU
+ */
+
+#ifdef __CUDACC__
+// CUDA versions
+
+#define HOST_DEVICE __host__ __device__
+#define INLINE_HOST_DEVICE __host__ __device__ inline
+#define FLOOR(x) floor(x)
+
+#if __CUDA_ARCH__ >= 600
+// Recent compute capabilities have block-level atomicAdd for all data types, so we use that
+#define ACCUM(x,y) atomicAdd_block(&(x),(y))
+#else
+// Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
+// and use the known atomicCAS-based implementation for double
+template<typename data_t>
+__device__ inline data_t atomic_add(data_t *address, data_t val) {
+  return atomicAdd(address, val);
+}
+
+template<>
+__device__ inline double atomic_add(double *address, double val) {
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+  } while (assumed != old);
+  return __longlong_as_double(old);
+}
+
+#define ACCUM(x,y) atomic_add(&(x),(y))
+#endif // #if __CUDA_ARCH__ >= 600
+
+#else
+// CPU versions
+
+#define HOST_DEVICE
+#define INLINE_HOST_DEVICE inline
+#define FLOOR(x) std::floor(x)
+#define ACCUM(x,y) (x) += (y)
+
+#endif // #ifdef __CUDACC__
\ No newline at end of file
diff --git a/preprocess/humanparsing/modules/src/utils/cuda.cuh b/preprocess/humanparsing/modules/src/utils/cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..60c0023835e02c5f7c539c28ac07b75b72df394b
--- /dev/null
+++ b/preprocess/humanparsing/modules/src/utils/cuda.cuh
@@ -0,0 +1,71 @@
+#pragma once
+
+/*
+ * General settings and functions
+ */
+const int WARP_SIZE = 32;
+const int MAX_BLOCK_SIZE = 1024;
+
+static int getNumThreads(int nElem) {
+  int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
+  for (int i = 0; i < 6; ++i) {
+    if (nElem <= threadSizes[i]) {
+      return threadSizes[i];
+    }
+  }
+  return MAX_BLOCK_SIZE;
+}
+
+/*
+ * Reduction utilities
+ */
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
+                                           unsigned int mask = 0xffffffff) {
+#if CUDART_VERSION >= 9000
+  return __shfl_xor_sync(mask, value, laneMask, width);
+#else
+  return __shfl_xor(value, laneMask, width);
+#endif
+}
+
+__device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
+
+template<typename T>
+struct Pair {
+  T v1, v2;
+  __device__ Pair() {}
+  __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
+  __device__ Pair(T v) : v1(v), v2(v) {}
+  __device__ Pair(int v) : v1(v), v2(v) {}
+  __device__ Pair &operator+=(const Pair<T> &a) {
+    v1 += a.v1;
+    v2 += a.v2;
+    return *this;
+  }
+};
+
+template<typename T>
+static __device__ __forceinline__ T warpSum(T val) {
+#if __CUDA_ARCH__ >= 300
+  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
+    val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
+  }
+#else
+  __shared__ T values[MAX_BLOCK_SIZE];
+  values[threadIdx.x] = val;
+  __threadfence_block();
+  const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
+  for (int i = 1; i < WARP_SIZE; i++) {
+    val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
+  }
+#endif
+  return val;
+}
+
+template<typename T>
+static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
+  value.v1 = warpSum(value.v1);
+  value.v2 = warpSum(value.v2);
+  return value;
+}
\ No newline at end of file
diff --git a/preprocess/humanparsing/networks/AugmentCE2P.py b/preprocess/humanparsing/networks/AugmentCE2P.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce32f78dd0b92d943e5b1d573a33e2f69f247f23
--- /dev/null
+++ b/preprocess/humanparsing/networks/AugmentCE2P.py
@@ -0,0 +1,388 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   AugmentCE2P.py
+@Time    :   8/4/19 3:35 PM
+@Desc    :
+@License :   This source code is licensed under the license found in the
+             LICENSE file in the root directory of this source tree.
+"""
+
+import functools
+import pdb
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+# Note here we adopt the InplaceABNSync implementation from https://github.com/mapillary/inplace_abn
+# By default, the InplaceABNSync module contains a BatchNorm Layer and a LeakyReLu layer
+from modules import InPlaceABNSync
+import numpy as np
+
+BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
+
+affine_par = True
+
+pretrained_settings = {
+    'resnet101': {
+        'imagenet': {
+            'input_space': 'BGR',
+            'input_size': [3, 224, 224],
+            'input_range': [0, 1],
+            'mean': [0.406, 0.456, 0.485],
+            'std': [0.225, 0.224, 0.229],
+            'num_classes': 1000
+        }
+    },
+}
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    "3x3 convolution with padding"
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                     padding=1, bias=False)
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1, multi_grid=1):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=dilation * multi_grid, dilation=dilation * multi_grid, bias=False)
+        self.bn2 = BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=False)
+        self.relu_inplace = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.dilation = dilation
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out = out + residual
+        out = self.relu_inplace(out)
+
+        return out
+
+
+class CostomAdaptiveAvgPool2D(nn.Module):
+
+    def __init__(self, output_size):
+
+        super(CostomAdaptiveAvgPool2D, self).__init__()
+
+        self.output_size = output_size
+
+    def forward(self, x):
+
+        H_in, W_in = x.shape[-2:]
+        H_out, W_out = self.output_size
+
+        out_i = []
+        for i in range(H_out):
+            out_j = []
+            for j in range(W_out):
+                hs = int(np.floor(i * H_in / H_out))
+                he = int(np.ceil((i + 1) * H_in / H_out))
+
+                ws = int(np.floor(j * W_in / W_out))
+                we = int(np.ceil((j + 1) * W_in / W_out))
+
+                # print(hs, he, ws, we)
+                kernel_size = [he - hs, we - ws]
+
+                out = F.avg_pool2d(x[:, :, hs:he, ws:we], kernel_size)
+                out_j.append(out)
+
+            out_j = torch.concat(out_j, -1)
+            out_i.append(out_j)
+
+        out_i = torch.concat(out_i, -2)
+        return out_i
+
+
+class PSPModule(nn.Module):
+    """
+    Reference:
+        Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
+    """
+
+    def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
+        super(PSPModule, self).__init__()
+
+        self.stages = []
+        tmp = []
+        for size in sizes:
+            if size == 3 or size == 6:
+                tmp.append(self._make_stage_custom(features, out_features, size))
+            else:
+                tmp.append(self._make_stage(features, out_features, size))
+        self.stages = nn.ModuleList(tmp)
+        # self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
+        self.bottleneck = nn.Sequential(
+            nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
+                      bias=False),
+            InPlaceABNSync(out_features),
+        )
+
+    def _make_stage(self, features, out_features, size):
+        prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
+        conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
+        bn = InPlaceABNSync(out_features)
+        return nn.Sequential(prior, conv, bn)
+
+    def _make_stage_custom(self, features, out_features, size):
+        prior = CostomAdaptiveAvgPool2D(output_size=(size, size))
+        conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
+        bn = InPlaceABNSync(out_features)
+        return nn.Sequential(prior, conv, bn)
+
+    def forward(self, feats):
+        h, w = feats.size(2), feats.size(3)
+        priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
+                  self.stages] + [feats]
+        bottle = self.bottleneck(torch.cat(priors, 1))
+        return bottle
+
+
+class ASPPModule(nn.Module):
+    """
+    Reference: 
+        Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
+    """
+
+    def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)):
+        super(ASPPModule, self).__init__()
+
+        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
+                                   nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
+                                             bias=False),
+                                   InPlaceABNSync(inner_features))
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(inner_features))
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
+            InPlaceABNSync(inner_features))
+        self.conv4 = nn.Sequential(
+            nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
+            InPlaceABNSync(inner_features))
+        self.conv5 = nn.Sequential(
+            nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
+            InPlaceABNSync(inner_features))
+
+        self.bottleneck = nn.Sequential(
+            nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(out_features),
+            nn.Dropout2d(0.1)
+        )
+
+    def forward(self, x):
+        _, _, h, w = x.size()
+
+        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+
+        feat2 = self.conv2(x)
+        feat3 = self.conv3(x)
+        feat4 = self.conv4(x)
+        feat5 = self.conv5(x)
+        out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
+
+        bottle = self.bottleneck(out)
+        return bottle
+
+
+class Edge_Module(nn.Module):
+    """
+    Edge Learning Branch
+    """
+
+    def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2):
+        super(Edge_Module, self).__init__()
+
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(mid_fea)
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(mid_fea)
+        )
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(mid_fea)
+        )
+        self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
+        self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True)
+
+    def forward(self, x1, x2, x3):
+        _, _, h, w = x1.size()
+
+        edge1_fea = self.conv1(x1)
+        edge1 = self.conv4(edge1_fea)
+        edge2_fea = self.conv2(x2)
+        edge2 = self.conv4(edge2_fea)
+        edge3_fea = self.conv3(x3)
+        edge3 = self.conv4(edge3_fea)
+
+        edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True)
+        edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True)
+        edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True)
+        edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True)
+
+        edge = torch.cat([edge1, edge2, edge3], dim=1)
+        edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1)
+        edge = self.conv5(edge)
+
+        return edge, edge_fea
+
+
+class Decoder_Module(nn.Module):
+    """
+    Parsing Branch Decoder Module.
+    """
+
+    def __init__(self, num_classes):
+        super(Decoder_Module, self).__init__()
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(256)
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(48)
+        )
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(256),
+            nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(256)
+        )
+
+        self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
+
+    def forward(self, xt, xl):
+        _, _, h, w = xl.size()
+        xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
+        xl = self.conv2(xl)
+        x = torch.cat([xt, xl], dim=1)
+        x = self.conv3(x)
+        seg = self.conv4(x)
+        return seg, x
+
+
+class ResNet(nn.Module):
+    def __init__(self, block, layers, num_classes):
+        self.inplanes = 128
+        super(ResNet, self).__init__()
+        self.conv1 = conv3x3(3, 64, stride=2)
+        self.bn1 = BatchNorm2d(64)
+        self.relu1 = nn.ReLU(inplace=False)
+        self.conv2 = conv3x3(64, 64)
+        self.bn2 = BatchNorm2d(64)
+        self.relu2 = nn.ReLU(inplace=False)
+        self.conv3 = conv3x3(64, 128)
+        self.bn3 = BatchNorm2d(128)
+        self.relu3 = nn.ReLU(inplace=False)
+
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1))
+
+        self.context_encoding = PSPModule(2048, 512)
+
+        self.edge = Edge_Module()
+        self.decoder = Decoder_Module(num_classes)
+
+        self.fushion = nn.Sequential(
+            nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(256),
+            nn.Dropout2d(0.1),
+            nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
+        )
+
+    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                BatchNorm2d(planes * block.expansion, affine=affine_par))
+
+        layers = []
+        generate_multi_grid = lambda index, grids: grids[index % len(grids)] if isinstance(grids, tuple) else 1
+        layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample,
+                            multi_grid=generate_multi_grid(0, multi_grid)))
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(
+                block(self.inplanes, planes, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.relu1(self.bn1(self.conv1(x)))
+        x = self.relu2(self.bn2(self.conv2(x)))
+        x = self.relu3(self.bn3(self.conv3(x)))
+        x = self.maxpool(x)
+        x2 = self.layer1(x)
+        x3 = self.layer2(x2)
+        x4 = self.layer3(x3)
+        x5 = self.layer4(x4)
+        x = self.context_encoding(x5)
+        parsing_result, parsing_fea = self.decoder(x, x2)
+        # Edge Branch
+        edge_result, edge_fea = self.edge(x2, x3, x4)
+        # Fusion Branch
+        x = torch.cat([parsing_fea, edge_fea], dim=1)
+        fusion_result = self.fushion(x)
+        return [[parsing_result, fusion_result], edge_result]
+
+
+def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
+    model.input_space = settings['input_space']
+    model.input_size = settings['input_size']
+    model.input_range = settings['input_range']
+    model.mean = settings['mean']
+    model.std = settings['std']
+
+    if pretrained is not None:
+        saved_state_dict = torch.load(pretrained)
+        new_params = model.state_dict().copy()
+        for i in saved_state_dict:
+            i_parts = i.split('.')
+            if not i_parts[0] == 'fc':
+                new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
+        model.load_state_dict(new_params)
+
+
+def resnet101(num_classes=20, pretrained='./models/resnet101-imagenet.pth'):
+    model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
+    settings = pretrained_settings['resnet101']['imagenet']
+    initialize_pretrained_model(model, settings, pretrained)
+    return model
diff --git a/preprocess/humanparsing/networks/__init__.py b/preprocess/humanparsing/networks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d5d384890e20652fa3ec282515ece6846ce447f
--- /dev/null
+++ b/preprocess/humanparsing/networks/__init__.py
@@ -0,0 +1,12 @@
+from __future__ import absolute_import
+from networks.AugmentCE2P import resnet101
+
+__factory = {
+    'resnet101': resnet101,
+}
+
+
+def init_model(name, *args, **kwargs):
+    if name not in __factory.keys():
+        raise KeyError("Unknown model arch: {}".format(name))
+    return __factory[name](*args, **kwargs)
\ No newline at end of file
diff --git a/preprocess/humanparsing/networks/backbone/mobilenetv2.py b/preprocess/humanparsing/networks/backbone/mobilenetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f2fe342877cfbc5796efea85af9abccfb80a27e
--- /dev/null
+++ b/preprocess/humanparsing/networks/backbone/mobilenetv2.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   mobilenetv2.py
+@Time    :   8/4/19 3:35 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+
+import torch.nn as nn
+import math
+import functools
+
+from modules import InPlaceABN, InPlaceABNSync
+
+BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
+
+__all__ = ['mobilenetv2']
+
+
+def conv_bn(inp, oup, stride):
+    return nn.Sequential(
+        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
+        BatchNorm2d(oup),
+        nn.ReLU6(inplace=True)
+    )
+
+
+def conv_1x1_bn(inp, oup):
+    return nn.Sequential(
+        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
+        BatchNorm2d(oup),
+        nn.ReLU6(inplace=True)
+    )
+
+
+class InvertedResidual(nn.Module):
+    def __init__(self, inp, oup, stride, expand_ratio):
+        super(InvertedResidual, self).__init__()
+        self.stride = stride
+        assert stride in [1, 2]
+
+        hidden_dim = round(inp * expand_ratio)
+        self.use_res_connect = self.stride == 1 and inp == oup
+
+        if expand_ratio == 1:
+            self.conv = nn.Sequential(
+                # dw
+                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
+                BatchNorm2d(hidden_dim),
+                nn.ReLU6(inplace=True),
+                # pw-linear
+                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+                BatchNorm2d(oup),
+            )
+        else:
+            self.conv = nn.Sequential(
+                # pw
+                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
+                BatchNorm2d(hidden_dim),
+                nn.ReLU6(inplace=True),
+                # dw
+                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
+                BatchNorm2d(hidden_dim),
+                nn.ReLU6(inplace=True),
+                # pw-linear
+                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+                BatchNorm2d(oup),
+            )
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class MobileNetV2(nn.Module):
+    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
+        super(MobileNetV2, self).__init__()
+        block = InvertedResidual
+        input_channel = 32
+        last_channel = 1280
+        interverted_residual_setting = [
+            # t, c, n, s
+            [1, 16, 1, 1],
+            [6, 24, 2, 2],  # layer 2
+            [6, 32, 3, 2],  # layer 3
+            [6, 64, 4, 2],
+            [6, 96, 3, 1],  # layer 4
+            [6, 160, 3, 2],
+            [6, 320, 1, 1],  # layer 5
+        ]
+
+        # building first layer
+        assert input_size % 32 == 0
+        input_channel = int(input_channel * width_mult)
+        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
+        self.features = [conv_bn(3, input_channel, 2)]
+        # building inverted residual blocks
+        for t, c, n, s in interverted_residual_setting:
+            output_channel = int(c * width_mult)
+            for i in range(n):
+                if i == 0:
+                    self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
+                else:
+                    self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
+                input_channel = output_channel
+        # building last several layers
+        self.features.append(conv_1x1_bn(input_channel, self.last_channel))
+        # make it nn.Sequential
+        self.features = nn.Sequential(*self.features)
+
+        # building classifier
+        self.classifier = nn.Sequential(
+            nn.Dropout(0.2),
+            nn.Linear(self.last_channel, n_class),
+        )
+
+        self._initialize_weights()
+
+    def forward(self, x):
+        x = self.features(x)
+        x = x.mean(3).mean(2)
+        x = self.classifier(x)
+        return x
+
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+                if m.bias is not None:
+                    m.bias.data.zero_()
+            elif isinstance(m, BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+            elif isinstance(m, nn.Linear):
+                n = m.weight.size(1)
+                m.weight.data.normal_(0, 0.01)
+                m.bias.data.zero_()
+
+
+def mobilenetv2(pretrained=False, **kwargs):
+    """Constructs a MobileNet_V2 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = MobileNetV2(n_class=1000, **kwargs)
+    if pretrained:
+        model.load_state_dict(load_url(model_urls['mobilenetv2']), strict=False)
+    return model
diff --git a/preprocess/humanparsing/networks/backbone/resnet.py b/preprocess/humanparsing/networks/backbone/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..88d6f73bc4fc327e18123020e01ccf5c1b37f025
--- /dev/null
+++ b/preprocess/humanparsing/networks/backbone/resnet.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   resnet.py
+@Time    :   8/4/19 3:35 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+
+import functools
+import torch.nn as nn
+import math
+from torch.utils.model_zoo import load_url
+
+from modules import InPlaceABNSync
+
+BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
+
+__all__ = ['ResNet', 'resnet18', 'resnet50', 'resnet101']  # resnet101 is coming soon!
+
+model_urls = {
+    'resnet18': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet18-imagenet.pth',
+    'resnet50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet50-imagenet.pth',
+    'resnet101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet101-imagenet.pth'
+}
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    "3x3 convolution with padding"
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                     padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = BatchNorm2d(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+
+    def __init__(self, block, layers, num_classes=1000):
+        self.inplanes = 128
+        super(ResNet, self).__init__()
+        self.conv1 = conv3x3(3, 64, stride=2)
+        self.bn1 = BatchNorm2d(64)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(64, 64)
+        self.bn2 = BatchNorm2d(64)
+        self.relu2 = nn.ReLU(inplace=True)
+        self.conv3 = conv3x3(64, 128)
+        self.bn3 = BatchNorm2d(128)
+        self.relu3 = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+        self.avgpool = nn.AvgPool2d(7, stride=1)
+        self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.relu1(self.bn1(self.conv1(x)))
+        x = self.relu2(self.bn2(self.conv2(x)))
+        x = self.relu3(self.bn3(self.conv3(x)))
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+
+        return x
+
+
+def resnet18(pretrained=False, **kwargs):
+    """Constructs a ResNet-18 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
+    if pretrained:
+        model.load_state_dict(load_url(model_urls['resnet18']))
+    return model
+
+
+def resnet50(pretrained=False, **kwargs):
+    """Constructs a ResNet-50 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
+    if pretrained:
+        model.load_state_dict(load_url(model_urls['resnet50']), strict=False)
+    return model
+
+
+def resnet101(pretrained=False, **kwargs):
+    """Constructs a ResNet-101 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
+    if pretrained:
+        model.load_state_dict(load_url(model_urls['resnet101']), strict=False)
+    return model
diff --git a/preprocess/humanparsing/networks/backbone/resnext.py b/preprocess/humanparsing/networks/backbone/resnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..96adb54146addc523be71591eb93afcc2c25307f
--- /dev/null
+++ b/preprocess/humanparsing/networks/backbone/resnext.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   resnext.py.py
+@Time    :   8/11/19 8:58 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+import functools
+import torch.nn as nn
+import math
+from torch.utils.model_zoo import load_url
+
+from modules import InPlaceABNSync
+
+BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
+
+__all__ = ['ResNeXt', 'resnext101']  # support resnext 101
+
+model_urls = {
+    'resnext50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext50-imagenet.pth',
+    'resnext101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext101-imagenet.pth'
+}
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    "3x3 convolution with padding"
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                     padding=1, bias=False)
+
+
+class GroupBottleneck(nn.Module):
+    expansion = 2
+
+    def __init__(self, inplanes, planes, stride=1, groups=1, downsample=None):
+        super(GroupBottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, groups=groups, bias=False)
+        self.bn2 = BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 2, kernel_size=1, bias=False)
+        self.bn3 = BatchNorm2d(planes * 2)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class ResNeXt(nn.Module):
+
+    def __init__(self, block, layers, groups=32, num_classes=1000):
+        self.inplanes = 128
+        super(ResNeXt, self).__init__()
+        self.conv1 = conv3x3(3, 64, stride=2)
+        self.bn1 = BatchNorm2d(64)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(64, 64)
+        self.bn2 = BatchNorm2d(64)
+        self.relu2 = nn.ReLU(inplace=True)
+        self.conv3 = conv3x3(64, 128)
+        self.bn3 = BatchNorm2d(128)
+        self.relu3 = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+        self.layer1 = self._make_layer(block, 128, layers[0], groups=groups)
+        self.layer2 = self._make_layer(block, 256, layers[1], stride=2, groups=groups)
+        self.layer3 = self._make_layer(block, 512, layers[2], stride=2, groups=groups)
+        self.layer4 = self._make_layer(block, 1024, layers[3], stride=2, groups=groups)
+        self.avgpool = nn.AvgPool2d(7, stride=1)
+        self.fc = nn.Linear(1024 * block.expansion, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels // m.groups
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_layer(self, block, planes, blocks, stride=1, groups=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, groups, downsample))
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes, groups=groups))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.relu1(self.bn1(self.conv1(x)))
+        x = self.relu2(self.bn2(self.conv2(x)))
+        x = self.relu3(self.bn3(self.conv3(x)))
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+
+        return x
+
+
+def resnext101(pretrained=False, **kwargs):
+    """Constructs a ResNet-101 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on Places
+    """
+    model = ResNeXt(GroupBottleneck, [3, 4, 23, 3], **kwargs)
+    if pretrained:
+        model.load_state_dict(load_url(model_urls['resnext101']), strict=False)
+    return model
diff --git a/preprocess/humanparsing/networks/context_encoding/aspp.py b/preprocess/humanparsing/networks/context_encoding/aspp.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0ba531a8920665c982b1f3412bc030465d56d2a
--- /dev/null
+++ b/preprocess/humanparsing/networks/context_encoding/aspp.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   aspp.py
+@Time    :   8/4/19 3:36 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from modules import InPlaceABNSync
+
+
+class ASPPModule(nn.Module):
+    """
+    Reference:
+        Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
+    """
+    def __init__(self, features, out_features=512, inner_features=256, dilations=(12, 24, 36)):
+        super(ASPPModule, self).__init__()
+
+        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
+                                   nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
+                                             bias=False),
+                                   InPlaceABNSync(inner_features))
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(inner_features))
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
+            InPlaceABNSync(inner_features))
+        self.conv4 = nn.Sequential(
+            nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
+            InPlaceABNSync(inner_features))
+        self.conv5 = nn.Sequential(
+            nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
+            InPlaceABNSync(inner_features))
+
+        self.bottleneck = nn.Sequential(
+            nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(out_features),
+            nn.Dropout2d(0.1)
+        )
+
+    def forward(self, x):
+        _, _, h, w = x.size()
+
+        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+
+        feat2 = self.conv2(x)
+        feat3 = self.conv3(x)
+        feat4 = self.conv4(x)
+        feat5 = self.conv5(x)
+        out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
+
+        bottle = self.bottleneck(out)
+        return bottle
\ No newline at end of file
diff --git a/preprocess/humanparsing/networks/context_encoding/ocnet.py b/preprocess/humanparsing/networks/context_encoding/ocnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac43ebf489ee478c48acf3f93b01b32bdb08cdf3
--- /dev/null
+++ b/preprocess/humanparsing/networks/context_encoding/ocnet.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   ocnet.py
+@Time    :   8/4/19 3:36 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+
+import functools
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch.nn import functional as F
+
+from modules import InPlaceABNSync
+BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
+
+
+class _SelfAttentionBlock(nn.Module):
+    '''
+    The basic implementation for self-attention block/non-local block
+    Input:
+        N X C X H X W
+    Parameters:
+        in_channels       : the dimension of the input feature map
+        key_channels      : the dimension after the key/query transform
+        value_channels    : the dimension after the value transform
+        scale             : choose the scale to downsample the input feature maps (save memory cost)
+    Return:
+        N X C X H X W
+        position-aware context features.(w/o concate or add with the input)
+    '''
+
+    def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
+        super(_SelfAttentionBlock, self).__init__()
+        self.scale = scale
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.key_channels = key_channels
+        self.value_channels = value_channels
+        if out_channels == None:
+            self.out_channels = in_channels
+        self.pool = nn.MaxPool2d(kernel_size=(scale, scale))
+        self.f_key = nn.Sequential(
+            nn.Conv2d(in_channels=self.in_channels, out_channels=self.key_channels,
+                      kernel_size=1, stride=1, padding=0),
+            InPlaceABNSync(self.key_channels),
+        )
+        self.f_query = self.f_key
+        self.f_value = nn.Conv2d(in_channels=self.in_channels, out_channels=self.value_channels,
+                                 kernel_size=1, stride=1, padding=0)
+        self.W = nn.Conv2d(in_channels=self.value_channels, out_channels=self.out_channels,
+                           kernel_size=1, stride=1, padding=0)
+        nn.init.constant(self.W.weight, 0)
+        nn.init.constant(self.W.bias, 0)
+
+    def forward(self, x):
+        batch_size, h, w = x.size(0), x.size(2), x.size(3)
+        if self.scale > 1:
+            x = self.pool(x)
+
+        value = self.f_value(x).view(batch_size, self.value_channels, -1)
+        value = value.permute(0, 2, 1)
+        query = self.f_query(x).view(batch_size, self.key_channels, -1)
+        query = query.permute(0, 2, 1)
+        key = self.f_key(x).view(batch_size, self.key_channels, -1)
+
+        sim_map = torch.matmul(query, key)
+        sim_map = (self.key_channels ** -.5) * sim_map
+        sim_map = F.softmax(sim_map, dim=-1)
+
+        context = torch.matmul(sim_map, value)
+        context = context.permute(0, 2, 1).contiguous()
+        context = context.view(batch_size, self.value_channels, *x.size()[2:])
+        context = self.W(context)
+        if self.scale > 1:
+            context = F.upsample(input=context, size=(h, w), mode='bilinear', align_corners=True)
+        return context
+
+
+class SelfAttentionBlock2D(_SelfAttentionBlock):
+    def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
+        super(SelfAttentionBlock2D, self).__init__(in_channels,
+                                                   key_channels,
+                                                   value_channels,
+                                                   out_channels,
+                                                   scale)
+
+
+class BaseOC_Module(nn.Module):
+    """
+    Implementation of the BaseOC module
+    Parameters:
+        in_features / out_features: the channels of the input / output feature maps.
+        dropout: we choose 0.05 as the default value.
+        size: you can apply multiple sizes. Here we only use one size.
+    Return:
+        features fused with Object context information.
+    """
+
+    def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
+        super(BaseOC_Module, self).__init__()
+        self.stages = []
+        self.stages = nn.ModuleList(
+            [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
+        self.conv_bn_dropout = nn.Sequential(
+            nn.Conv2d(2 * in_channels, out_channels, kernel_size=1, padding=0),
+            InPlaceABNSync(out_channels),
+            nn.Dropout2d(dropout)
+        )
+
+    def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
+        return SelfAttentionBlock2D(in_channels,
+                                    key_channels,
+                                    value_channels,
+                                    output_channels,
+                                    size)
+
+    def forward(self, feats):
+        priors = [stage(feats) for stage in self.stages]
+        context = priors[0]
+        for i in range(1, len(priors)):
+            context += priors[i]
+        output = self.conv_bn_dropout(torch.cat([context, feats], 1))
+        return output
+
+
+class BaseOC_Context_Module(nn.Module):
+    """
+    Output only the context features.
+    Parameters:
+        in_features / out_features: the channels of the input / output feature maps.
+        dropout: specify the dropout ratio
+        fusion: We provide two different fusion method, "concat" or "add"
+        size: we find that directly learn the attention weights on even 1/8 feature maps is hard.
+    Return:
+        features after "concat" or "add"
+    """
+
+    def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
+        super(BaseOC_Context_Module, self).__init__()
+        self.stages = []
+        self.stages = nn.ModuleList(
+            [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
+        self.conv_bn_dropout = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0),
+            InPlaceABNSync(out_channels),
+        )
+
+    def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
+        return SelfAttentionBlock2D(in_channels,
+                                    key_channels,
+                                    value_channels,
+                                    output_channels,
+                                    size)
+
+    def forward(self, feats):
+        priors = [stage(feats) for stage in self.stages]
+        context = priors[0]
+        for i in range(1, len(priors)):
+            context += priors[i]
+        output = self.conv_bn_dropout(context)
+        return output
+
+
+class ASP_OC_Module(nn.Module):
+    def __init__(self, features, out_features=256, dilations=(12, 24, 36)):
+        super(ASP_OC_Module, self).__init__()
+        self.context = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=3, padding=1, dilation=1, bias=True),
+                                     InPlaceABNSync(out_features),
+                                     BaseOC_Context_Module(in_channels=out_features, out_channels=out_features,
+                                                           key_channels=out_features // 2, value_channels=out_features,
+                                                           dropout=0, sizes=([2])))
+        self.conv2 = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
+                                   InPlaceABNSync(out_features))
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
+            InPlaceABNSync(out_features))
+        self.conv4 = nn.Sequential(
+            nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
+            InPlaceABNSync(out_features))
+        self.conv5 = nn.Sequential(
+            nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
+            InPlaceABNSync(out_features))
+
+        self.conv_bn_dropout = nn.Sequential(
+            nn.Conv2d(out_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
+            InPlaceABNSync(out_features),
+            nn.Dropout2d(0.1)
+        )
+
+    def _cat_each(self, feat1, feat2, feat3, feat4, feat5):
+        assert (len(feat1) == len(feat2))
+        z = []
+        for i in range(len(feat1)):
+            z.append(torch.cat((feat1[i], feat2[i], feat3[i], feat4[i], feat5[i]), 1))
+        return z
+
+    def forward(self, x):
+        if isinstance(x, Variable):
+            _, _, h, w = x.size()
+        elif isinstance(x, tuple) or isinstance(x, list):
+            _, _, h, w = x[0].size()
+        else:
+            raise RuntimeError('unknown input type')
+
+        feat1 = self.context(x)
+        feat2 = self.conv2(x)
+        feat3 = self.conv3(x)
+        feat4 = self.conv4(x)
+        feat5 = self.conv5(x)
+
+        if isinstance(x, Variable):
+            out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
+        elif isinstance(x, tuple) or isinstance(x, list):
+            out = self._cat_each(feat1, feat2, feat3, feat4, feat5)
+        else:
+            raise RuntimeError('unknown input type')
+        output = self.conv_bn_dropout(out)
+        return output
diff --git a/preprocess/humanparsing/networks/context_encoding/psp.py b/preprocess/humanparsing/networks/context_encoding/psp.py
new file mode 100644
index 0000000000000000000000000000000000000000..47181dc3f5fddb1c7fb80ad58a6694aae9ebd746
--- /dev/null
+++ b/preprocess/humanparsing/networks/context_encoding/psp.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   psp.py
+@Time    :   8/4/19 3:36 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from modules import InPlaceABNSync
+
+
+class PSPModule(nn.Module):
+    """
+    Reference:
+        Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
+    """
+    def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
+        super(PSPModule, self).__init__()
+
+        self.stages = []
+        self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
+        self.bottleneck = nn.Sequential(
+            nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
+                      bias=False),
+            InPlaceABNSync(out_features),
+        )
+
+    def _make_stage(self, features, out_features, size):
+        prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
+        conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
+        bn = InPlaceABNSync(out_features)
+        return nn.Sequential(prior, conv, bn)
+
+    def forward(self, feats):
+        h, w = feats.size(2), feats.size(3)
+        priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
+                  self.stages] + [feats]
+        bottle = self.bottleneck(torch.cat(priors, 1))
+        return bottle
\ No newline at end of file
diff --git a/preprocess/humanparsing/parsing_api.py b/preprocess/humanparsing/parsing_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9a2cff9ba8e3eabef79216b3f825a17872441b7
--- /dev/null
+++ b/preprocess/humanparsing/parsing_api.py
@@ -0,0 +1,188 @@
+import pdb
+from pathlib import Path
+import sys
+
+PROJECT_ROOT = Path(__file__).absolute().parents[0].absolute()
+sys.path.insert(0, str(PROJECT_ROOT))
+import os
+import torch
+import numpy as np
+import cv2
+import torchvision.transforms as transforms
+from torch.utils.data import DataLoader
+from datasets.simple_extractor_dataset import SimpleFolderDataset
+from utils.transforms import transform_logits
+from tqdm import tqdm
+from PIL import Image
+
+
+def get_palette(num_cls):
+    """ Returns the color map for visualizing the segmentation mask.
+    Args:
+        num_cls: Number of classes
+    Returns:
+        The color map
+    """
+    n = num_cls
+    palette = [0] * (n * 3)
+    for j in range(0, n):
+        lab = j
+        palette[j * 3 + 0] = 0
+        palette[j * 3 + 1] = 0
+        palette[j * 3 + 2] = 0
+        i = 0
+        while lab:
+            palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
+            palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
+            palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
+            i += 1
+            lab >>= 3
+    return palette
+
+
+def delete_irregular(logits_result):
+    parsing_result = np.argmax(logits_result, axis=2)
+    upper_cloth = np.where(parsing_result == 4, 255, 0)
+    contours, hierarchy = cv2.findContours(upper_cloth.astype(np.uint8),
+                                           cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
+    area = []
+    for i in range(len(contours)):
+        a = cv2.contourArea(contours[i], True)
+        area.append(abs(a))
+    if len(area) != 0:
+        top = area.index(max(area))
+        M = cv2.moments(contours[top])
+        cY = int(M["m01"] / M["m00"])
+
+    dresses = np.where(parsing_result == 7, 255, 0)
+    contours_dress, hierarchy_dress = cv2.findContours(dresses.astype(np.uint8),
+                                                       cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
+    area_dress = []
+    for j in range(len(contours_dress)):
+        a_d = cv2.contourArea(contours_dress[j], True)
+        area_dress.append(abs(a_d))
+    if len(area_dress) != 0:
+        top_dress = area_dress.index(max(area_dress))
+        M_dress = cv2.moments(contours_dress[top_dress])
+        cY_dress = int(M_dress["m01"] / M_dress["m00"])
+    wear_type = "dresses"
+    if len(area) != 0:
+        if len(area_dress) != 0 and cY_dress > cY:
+            irregular_list = np.array([4, 5, 6])
+            logits_result[:, :, irregular_list] = -1
+        else:
+            irregular_list = np.array([5, 6, 7, 8, 9, 10, 12, 13])
+            logits_result[:cY, :, irregular_list] = -1
+            wear_type = "cloth_pant"
+        parsing_result = np.argmax(logits_result, axis=2)
+    # pad border
+    parsing_result = np.pad(parsing_result, pad_width=1, mode='constant', constant_values=0)
+    return parsing_result, wear_type
+
+
+
+def hole_fill(img):
+    img_copy = img.copy()
+    mask = np.zeros((img.shape[0] + 2, img.shape[1] + 2), dtype=np.uint8)
+    cv2.floodFill(img, mask, (0, 0), 255)
+    img_inverse = cv2.bitwise_not(img)
+    dst = cv2.bitwise_or(img_copy, img_inverse)
+    return dst
+
+def refine_mask(mask):
+    contours, hierarchy = cv2.findContours(mask.astype(np.uint8),
+                                           cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
+    area = []
+    for j in range(len(contours)):
+        a_d = cv2.contourArea(contours[j], True)
+        area.append(abs(a_d))
+    refine_mask = np.zeros_like(mask).astype(np.uint8)
+    if len(area) != 0:
+        i = area.index(max(area))
+        cv2.drawContours(refine_mask, contours, i, color=255, thickness=-1)
+        # keep large area in skin case
+        for j in range(len(area)):
+          if j != i and area[i] > 2000:
+             cv2.drawContours(refine_mask, contours, j, color=255, thickness=-1)
+    return refine_mask
+
+def refine_hole(parsing_result_filled, parsing_result, arm_mask):
+    filled_hole = cv2.bitwise_and(np.where(parsing_result_filled == 4, 255, 0),
+                                  np.where(parsing_result != 4, 255, 0)) - arm_mask * 255
+    contours, hierarchy = cv2.findContours(filled_hole, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
+    refine_hole_mask = np.zeros_like(parsing_result).astype(np.uint8)
+    for i in range(len(contours)):
+        a = cv2.contourArea(contours[i], True)
+        # keep hole > 2000 pixels
+        if abs(a) > 2000:
+            cv2.drawContours(refine_hole_mask, contours, i, color=255, thickness=-1)
+    return refine_hole_mask + arm_mask
+
+def onnx_inference(session, lip_session, input_dir):
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])
+    ])
+    dataset = SimpleFolderDataset(root=input_dir, input_size=[512, 512], transform=transform)
+    dataloader = DataLoader(dataset)
+    with torch.no_grad():
+        for _, batch in enumerate(tqdm(dataloader)):
+            image, meta = batch
+            c = meta['center'].numpy()[0]
+            s = meta['scale'].numpy()[0]
+            w = meta['width'].numpy()[0]
+            h = meta['height'].numpy()[0]
+            output = session.run(None, {"input.1": image.numpy().astype(np.float32)})
+            upsample = torch.nn.Upsample(size=[512, 512], mode='bilinear', align_corners=True)
+            upsample_output = upsample(torch.from_numpy(output[1][0]).unsqueeze(0))
+            upsample_output = upsample_output.squeeze()
+            upsample_output = upsample_output.permute(1, 2, 0)  # CHW -> HWC
+            logits_result = transform_logits(upsample_output.data.cpu().numpy(), c, s, w, h, input_size=[512, 512])
+            parsing_result = np.argmax(logits_result, axis=2)
+            parsing_result = np.pad(parsing_result, pad_width=1, mode='constant', constant_values=0)
+            # try holefilling the clothes part
+            arm_mask = (parsing_result == 14).astype(np.float32) \
+                       + (parsing_result == 15).astype(np.float32)
+            upper_cloth_mask = (parsing_result == 4).astype(np.float32) + arm_mask
+            img = np.where(upper_cloth_mask, 255, 0)
+            dst = hole_fill(img.astype(np.uint8))
+            parsing_result_filled = dst / 255 * 4
+            parsing_result_woarm = np.where(parsing_result_filled == 4, parsing_result_filled, parsing_result)
+            # add back arm and refined hole between arm and cloth
+            refine_hole_mask = refine_hole(parsing_result_filled.astype(np.uint8), parsing_result.astype(np.uint8),
+                                           arm_mask.astype(np.uint8))
+            parsing_result = np.where(refine_hole_mask, parsing_result, parsing_result_woarm)
+            # remove padding
+            parsing_result = parsing_result[1:-1, 1:-1]
+
+        dataset_lip = SimpleFolderDataset(root=input_dir, input_size=[473, 473], transform=transform)
+        dataloader_lip = DataLoader(dataset_lip)
+        with torch.no_grad():
+            for _, batch in enumerate(tqdm(dataloader_lip)):
+                image, meta = batch
+                c = meta['center'].numpy()[0]
+                s = meta['scale'].numpy()[0]
+                w = meta['width'].numpy()[0]
+                h = meta['height'].numpy()[0]
+
+                output_lip = lip_session.run(None, {"input.1": image.numpy().astype(np.float32)})
+                upsample = torch.nn.Upsample(size=[473, 473], mode='bilinear', align_corners=True)
+                upsample_output_lip = upsample(torch.from_numpy(output_lip[1][0]).unsqueeze(0))
+                upsample_output_lip = upsample_output_lip.squeeze()
+                upsample_output_lip = upsample_output_lip.permute(1, 2, 0)  # CHW -> HWC
+                logits_result_lip = transform_logits(upsample_output_lip.data.cpu().numpy(), c, s, w, h,
+                                                     input_size=[473, 473])
+                parsing_result_lip = np.argmax(logits_result_lip, axis=2)
+    # add neck parsing result
+    neck_mask = np.logical_and(np.logical_not((parsing_result_lip == 13).astype(np.float32)),
+                               (parsing_result == 11).astype(np.float32))
+    parsing_result = np.where(neck_mask, 18, parsing_result)
+    palette = get_palette(19)
+    output_img = Image.fromarray(np.asarray(parsing_result, dtype=np.uint8))
+    output_img.putpalette(palette)
+    face_mask = torch.from_numpy((parsing_result == 11).astype(np.float32))
+
+    return output_img, face_mask
+
+
+
diff --git a/preprocess/humanparsing/run_parsing.py b/preprocess/humanparsing/run_parsing.py
new file mode 100644
index 0000000000000000000000000000000000000000..14028467468d280139329e1f197e63df886d1fb3
--- /dev/null
+++ b/preprocess/humanparsing/run_parsing.py
@@ -0,0 +1,29 @@
+import pdb
+from pathlib import Path
+import sys
+import os
+import onnxruntime as ort
+PROJECT_ROOT = Path(__file__).absolute().parents[0].absolute()
+sys.path.insert(0, str(PROJECT_ROOT))
+from parsing_api import onnx_inference
+import torch
+
+
+class Parsing:
+    def __init__(self, gpu_id: int):
+        # self.gpu_id = gpu_id
+        # torch.cuda.set_device(gpu_id)
+        session_options = ort.SessionOptions()
+        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+        # session_options.add_session_config_entry('gpu_id', str(gpu_id))
+        self.session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'ckpt/humanparsing/parsing_atr.onnx'),
+                                            sess_options=session_options, providers=['CPUExecutionProvider'])
+        self.lip_session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'ckpt/humanparsing/parsing_lip.onnx'),
+                                                sess_options=session_options, providers=['CPUExecutionProvider'])
+        
+
+    def __call__(self, input_image):
+        # torch.cuda.set_device(self.gpu_id)
+        parsed_image, face_mask = onnx_inference(self.session, self.lip_session, input_image)
+        return parsed_image, face_mask
diff --git a/preprocess/humanparsing/utils/__init__.py b/preprocess/humanparsing/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/preprocess/humanparsing/utils/__pycache__/__init__.cpython-310.pyc b/preprocess/humanparsing/utils/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c82bf31f0d7163183b005cbbd2ecfd46ddc0b501
Binary files /dev/null and b/preprocess/humanparsing/utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/preprocess/humanparsing/utils/__pycache__/transforms.cpython-310.pyc b/preprocess/humanparsing/utils/__pycache__/transforms.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0f4d51c7f084bc12a038c7fc0440873706dcece
Binary files /dev/null and b/preprocess/humanparsing/utils/__pycache__/transforms.cpython-310.pyc differ
diff --git a/preprocess/humanparsing/utils/consistency_loss.py b/preprocess/humanparsing/utils/consistency_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..b872fdcc10ecef02762399278191e48e79ea9a1f
--- /dev/null
+++ b/preprocess/humanparsing/utils/consistency_loss.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   kl_loss.py
+@Time    :   7/23/19 4:02 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+import torch
+import torch.nn.functional as F
+from torch import nn
+from datasets.target_generation import generate_edge_tensor
+
+
+class ConsistencyLoss(nn.Module):
+    def __init__(self, ignore_index=255):
+        super(ConsistencyLoss, self).__init__()
+        self.ignore_index=ignore_index
+
+    def forward(self, parsing, edge, label):
+        parsing_pre = torch.argmax(parsing, dim=1)
+        parsing_pre[label==self.ignore_index]=self.ignore_index
+        generated_edge = generate_edge_tensor(parsing_pre)
+        edge_pre = torch.argmax(edge, dim=1)
+        v_generate_edge = generated_edge[label!=255]
+        v_edge_pre = edge_pre[label!=255]
+        v_edge_pre = v_edge_pre.type(torch.cuda.FloatTensor)
+        positive_union = (v_generate_edge==1)&(v_edge_pre==1) # only the positive values count
+        return F.smooth_l1_loss(v_generate_edge[positive_union].squeeze(0), v_edge_pre[positive_union].squeeze(0))
diff --git a/preprocess/humanparsing/utils/criterion.py b/preprocess/humanparsing/utils/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..968894319042331482692e42804f103074e4b710
--- /dev/null
+++ b/preprocess/humanparsing/utils/criterion.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   criterion.py
+@Time    :   8/30/19 8:59 PM
+@Desc    :
+@License :   This source code is licensed under the license found in the
+             LICENSE file in the root directory of this source tree.
+"""
+
+import torch.nn as nn
+import torch
+import numpy as np
+from torch.nn import functional as F
+from .lovasz_softmax import LovaszSoftmax
+from .kl_loss import KLDivergenceLoss
+from .consistency_loss import ConsistencyLoss
+
+NUM_CLASSES = 20
+
+
+class CriterionAll(nn.Module):
+    def __init__(self, use_class_weight=False, ignore_index=255, lambda_1=1, lambda_2=1, lambda_3=1,
+                 num_classes=20):
+        super(CriterionAll, self).__init__()
+        self.ignore_index = ignore_index
+        self.use_class_weight = use_class_weight
+        self.criterion = torch.nn.CrossEntropyLoss(ignore_index=ignore_index)
+        self.lovasz = LovaszSoftmax(ignore_index=ignore_index)
+        self.kldiv = KLDivergenceLoss(ignore_index=ignore_index)
+        self.reg = ConsistencyLoss(ignore_index=ignore_index)
+        self.lamda_1 = lambda_1
+        self.lamda_2 = lambda_2
+        self.lamda_3 = lambda_3
+        self.num_classes = num_classes
+
+    def parsing_loss(self, preds, target, cycle_n=None):
+        """
+        Loss function definition.
+
+        Args:
+            preds: [[parsing result1, parsing result2],[edge result]]
+            target: [parsing label, egde label]
+            soft_preds: [[parsing result1, parsing result2],[edge result]]
+        Returns:
+            Calculated Loss.
+        """
+        h, w = target[0].size(1), target[0].size(2)
+
+        pos_num = torch.sum(target[1] == 1, dtype=torch.float)
+        neg_num = torch.sum(target[1] == 0, dtype=torch.float)
+
+        weight_pos = neg_num / (pos_num + neg_num)
+        weight_neg = pos_num / (pos_num + neg_num)
+        weights = torch.tensor([weight_neg, weight_pos])  # edge loss weight
+
+        loss = 0
+
+        # loss for segmentation
+        preds_parsing = preds[0]
+        for pred_parsing in preds_parsing:
+            scale_pred = F.interpolate(input=pred_parsing, size=(h, w),
+                                       mode='bilinear', align_corners=True)
+
+            loss += 0.5 * self.lamda_1 * self.lovasz(scale_pred, target[0])
+            if target[2] is None:
+                loss += 0.5 * self.lamda_1 * self.criterion(scale_pred, target[0])
+            else:
+                soft_scale_pred = F.interpolate(input=target[2], size=(h, w),
+                                                mode='bilinear', align_corners=True)
+                soft_scale_pred = moving_average(soft_scale_pred, to_one_hot(target[0], num_cls=self.num_classes),
+                                                 1.0 / (cycle_n + 1.0))
+                loss += 0.5 * self.lamda_1 * self.kldiv(scale_pred, soft_scale_pred, target[0])
+
+        # loss for edge
+        preds_edge = preds[1]
+        for pred_edge in preds_edge:
+            scale_pred = F.interpolate(input=pred_edge, size=(h, w),
+                                       mode='bilinear', align_corners=True)
+            if target[3] is None:
+                loss += self.lamda_2 * F.cross_entropy(scale_pred, target[1],
+                                                       weights.cuda(), ignore_index=self.ignore_index)
+            else:
+                soft_scale_edge = F.interpolate(input=target[3], size=(h, w),
+                                                mode='bilinear', align_corners=True)
+                soft_scale_edge = moving_average(soft_scale_edge, to_one_hot(target[1], num_cls=2),
+                                                 1.0 / (cycle_n + 1.0))
+                loss += self.lamda_2 * self.kldiv(scale_pred, soft_scale_edge, target[0])
+
+        # consistency regularization
+        preds_parsing = preds[0]
+        preds_edge = preds[1]
+        for pred_parsing in preds_parsing:
+            scale_pred = F.interpolate(input=pred_parsing, size=(h, w),
+                                       mode='bilinear', align_corners=True)
+            scale_edge = F.interpolate(input=preds_edge[0], size=(h, w),
+                                       mode='bilinear', align_corners=True)
+            loss += self.lamda_3 * self.reg(scale_pred, scale_edge, target[0])
+
+        return loss
+
+    def forward(self, preds, target, cycle_n=None):
+        loss = self.parsing_loss(preds, target, cycle_n)
+        return loss
+
+    def _generate_weights(self, masks, num_classes):
+        """
+        masks: torch.Tensor with shape [B, H, W]
+        """
+        masks_label = masks.data.cpu().numpy().astype(np.int64)
+        pixel_nums = []
+        tot_pixels = 0
+        for i in range(num_classes):
+            pixel_num_of_cls_i = np.sum(masks_label == i).astype(np.float)
+            pixel_nums.append(pixel_num_of_cls_i)
+            tot_pixels += pixel_num_of_cls_i
+        weights = []
+        for i in range(num_classes):
+            weights.append(
+                (tot_pixels - pixel_nums[i]) / tot_pixels / (num_classes - 1)
+            )
+        weights = np.array(weights, dtype=np.float)
+        # weights = torch.from_numpy(weights).float().to(masks.device)
+        return weights
+
+
+def moving_average(target1, target2, alpha=1.0):
+    target = 0
+    target += (1.0 - alpha) * target1
+    target += target2 * alpha
+    return target
+
+
+def to_one_hot(tensor, num_cls, dim=1, ignore_index=255):
+    b, h, w = tensor.shape
+    tensor[tensor == ignore_index] = 0
+    onehot_tensor = torch.zeros(b, num_cls, h, w).cuda()
+    onehot_tensor.scatter_(dim, tensor.unsqueeze(dim), 1)
+    return onehot_tensor
diff --git a/preprocess/humanparsing/utils/encoding.py b/preprocess/humanparsing/utils/encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8654706c345e8a13219f2c8e4cfa7700f531612
--- /dev/null
+++ b/preprocess/humanparsing/utils/encoding.py
@@ -0,0 +1,188 @@
+##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+## Created by: Hang Zhang
+## ECE Department, Rutgers University
+## Email: zhang.hang@rutgers.edu
+## Copyright (c) 2017
+##
+## This source code is licensed under the MIT-style license found in the
+## LICENSE file in the root directory of this source tree
+##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+"""Encoding Data Parallel"""
+import threading
+import functools
+import torch
+from torch.autograd import Variable, Function
+import torch.cuda.comm as comm
+from torch.nn.parallel.data_parallel import DataParallel
+from torch.nn.parallel.parallel_apply import get_a_var
+from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
+
+torch_ver = torch.__version__[:3]
+
+__all__ = ['allreduce', 'DataParallelModel', 'DataParallelCriterion', 'patch_replication_callback']
+
+def allreduce(*inputs):
+    """Cross GPU all reduce autograd operation for calculate mean and
+    variance in SyncBN.
+    """
+    return AllReduce.apply(*inputs)
+
+class AllReduce(Function):
+    @staticmethod
+    def forward(ctx, num_inputs, *inputs):
+        ctx.num_inputs = num_inputs
+        ctx.target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
+        inputs = [inputs[i:i + num_inputs]
+                 for i in range(0, len(inputs), num_inputs)]
+        # sort before reduce sum
+        inputs = sorted(inputs, key=lambda i: i[0].get_device())
+        results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
+        outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
+        return tuple([t for tensors in outputs for t in tensors])
+
+    @staticmethod
+    def backward(ctx, *inputs):
+        inputs = [i.data for i in inputs]
+        inputs = [inputs[i:i + ctx.num_inputs]
+                 for i in range(0, len(inputs), ctx.num_inputs)]
+        results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
+        outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
+        return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors])
+
+class Reduce(Function):
+    @staticmethod
+    def forward(ctx, *inputs):
+        ctx.target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
+        inputs = sorted(inputs, key=lambda i: i.get_device())
+        return comm.reduce_add(inputs)
+
+    @staticmethod
+    def backward(ctx, gradOutput):
+        return Broadcast.apply(ctx.target_gpus, gradOutput)
+
+
+class DataParallelModel(DataParallel):
+    """Implements data parallelism at the module level.
+
+    This container parallelizes the application of the given module by
+    splitting the input across the specified devices by chunking in the
+    batch dimension.
+    In the forward pass, the module is replicated on each device,
+    and each replica handles a portion of the input. During the backwards pass, gradients from each replica are summed into the original module.
+    Note that the outputs are not gathered, please use compatible
+    :class:`encoding.parallel.DataParallelCriterion`.
+
+    The batch size should be larger than the number of GPUs used. It should
+    also be an integer multiple of the number of GPUs so that each chunk is
+    the same size (so that each GPU processes the same number of samples).
+
+    Args:
+        module: module to be parallelized
+        device_ids: CUDA devices (default: all devices)
+
+    Reference:
+        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
+        Amit Agrawal. “Context Encoding for Semantic Segmentation.
+        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
+
+    Example::
+
+        >>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2])
+        >>> y = net(x)
+    """
+    def gather(self, outputs, output_device):
+        return outputs
+
+    def replicate(self, module, device_ids):
+        modules = super(DataParallelModel, self).replicate(module, device_ids)
+        return modules
+
+
+class DataParallelCriterion(DataParallel):
+    """
+    Calculate loss in multiple-GPUs, which balance the memory usage for
+    Semantic Segmentation.
+
+    The targets are splitted across the specified devices by chunking in
+    the batch dimension. Please use together with :class:`encoding.parallel.DataParallelModel`.
+
+    Reference:
+        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
+        Amit Agrawal. “Context Encoding for Semantic Segmentation.
+        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
+
+    Example::
+
+        >>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2])
+        >>> criterion = encoding.nn.DataParallelCriterion(criterion, device_ids=[0, 1, 2])
+        >>> y = net(x)
+        >>> loss = criterion(y, target)
+    """
+    def forward(self, inputs, *targets, **kwargs):
+        # input should be already scatterd
+        # scattering the targets instead
+        if not self.device_ids:
+            return self.module(inputs, *targets, **kwargs)
+        targets, kwargs = self.scatter(targets, kwargs, self.device_ids)
+        if len(self.device_ids) == 1:
+            return self.module(inputs, *targets[0], **kwargs[0])
+        replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
+        outputs = _criterion_parallel_apply(replicas, inputs, targets, kwargs)
+        return Reduce.apply(*outputs) / len(outputs)
+
+
+def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
+    assert len(modules) == len(inputs)
+    assert len(targets) == len(inputs)
+    if kwargs_tup:
+        assert len(modules) == len(kwargs_tup)
+    else:
+        kwargs_tup = ({},) * len(modules)
+    if devices is not None:
+        assert len(modules) == len(devices)
+    else:
+        devices = [None] * len(modules)
+
+    lock = threading.Lock()
+    results = {}
+    if torch_ver != "0.3":
+        grad_enabled = torch.is_grad_enabled()
+
+    def _worker(i, module, input, target, kwargs, device=None):
+        if torch_ver != "0.3":
+            torch.set_grad_enabled(grad_enabled)
+        if device is None:
+            device = get_a_var(input).get_device()
+        try:
+            if not isinstance(input, tuple):
+                input = (input,)
+            with torch.cuda.device(device):
+                output = module(*(input + target), **kwargs)
+            with lock:
+                results[i] = output
+        except Exception as e:
+            with lock:
+                results[i] = e
+
+    if len(modules) > 1:
+        threads = [threading.Thread(target=_worker,
+                                    args=(i, module, input, target,
+                                          kwargs, device),)
+                   for i, (module, input, target, kwargs, device) in
+                   enumerate(zip(modules, inputs, targets, kwargs_tup, devices))]
+
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join()
+    else:
+        _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0])
+
+    outputs = []
+    for i in range(len(inputs)):
+        output = results[i]
+        if isinstance(output, Exception):
+            raise output
+        outputs.append(output)
+    return outputs
diff --git a/preprocess/humanparsing/utils/kl_loss.py b/preprocess/humanparsing/utils/kl_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a685d945fb852a81324513ae55498857f1a4552
--- /dev/null
+++ b/preprocess/humanparsing/utils/kl_loss.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   kl_loss.py
+@Time    :   7/23/19 4:02 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+def flatten_probas(input, target, labels, ignore=255):
+    """
+    Flattens predictions in the batch.
+    """
+    B, C, H, W = input.size()
+    input = input.permute(0, 2, 3, 1).contiguous().view(-1, C)  # B * H * W, C = P, C
+    target = target.permute(0, 2, 3, 1).contiguous().view(-1, C)  # B * H * W, C = P, C
+    labels = labels.view(-1)
+    if ignore is None:
+        return input, target
+    valid = (labels != ignore)
+    vinput = input[valid.nonzero().squeeze()]
+    vtarget = target[valid.nonzero().squeeze()]
+    return vinput, vtarget
+
+
+class KLDivergenceLoss(nn.Module):
+    def __init__(self, ignore_index=255, T=1):
+        super(KLDivergenceLoss, self).__init__()
+        self.ignore_index=ignore_index
+        self.T = T
+
+    def forward(self, input, target, label):
+        log_input_prob = F.log_softmax(input / self.T, dim=1)
+        target_porb = F.softmax(target / self.T, dim=1)
+        loss = F.kl_div(*flatten_probas(log_input_prob, target_porb, label, ignore=self.ignore_index))
+        return self.T*self.T*loss # balanced
diff --git a/preprocess/humanparsing/utils/lovasz_softmax.py b/preprocess/humanparsing/utils/lovasz_softmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6e444f684c0d9bda9d7c2d54a4e79fac0ddf081
--- /dev/null
+++ b/preprocess/humanparsing/utils/lovasz_softmax.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   lovasz_softmax.py
+@Time    :   8/30/19 7:12 PM
+@Desc    :   Lovasz-Softmax and Jaccard hinge loss in PyTorch
+             Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License)
+@License :   This source code is licensed under the license found in the
+             LICENSE file in the root directory of this source tree.
+"""
+
+from __future__ import print_function, division
+
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+import numpy as np
+from torch import nn
+
+try:
+    from itertools import ifilterfalse
+except ImportError:  # py3k
+    from itertools import filterfalse as ifilterfalse
+
+
+def lovasz_grad(gt_sorted):
+    """
+    Computes gradient of the Lovasz extension w.r.t sorted errors
+    See Alg. 1 in paper
+    """
+    p = len(gt_sorted)
+    gts = gt_sorted.sum()
+    intersection = gts - gt_sorted.float().cumsum(0)
+    union = gts + (1 - gt_sorted).float().cumsum(0)
+    jaccard = 1. - intersection / union
+    if p > 1:  # cover 1-pixel case
+        jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
+    return jaccard
+
+
+def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True):
+    """
+    IoU for foreground class
+    binary: 1 foreground, 0 background
+    """
+    if not per_image:
+        preds, labels = (preds,), (labels,)
+    ious = []
+    for pred, label in zip(preds, labels):
+        intersection = ((label == 1) & (pred == 1)).sum()
+        union = ((label == 1) | ((pred == 1) & (label != ignore))).sum()
+        if not union:
+            iou = EMPTY
+        else:
+            iou = float(intersection) / float(union)
+        ious.append(iou)
+    iou = mean(ious)  # mean accross images if per_image
+    return 100 * iou
+
+
+def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False):
+    """
+    Array of IoU for each (non ignored) class
+    """
+    if not per_image:
+        preds, labels = (preds,), (labels,)
+    ious = []
+    for pred, label in zip(preds, labels):
+        iou = []
+        for i in range(C):
+            if i != ignore:  # The ignored label is sometimes among predicted classes (ENet - CityScapes)
+                intersection = ((label == i) & (pred == i)).sum()
+                union = ((label == i) | ((pred == i) & (label != ignore))).sum()
+                if not union:
+                    iou.append(EMPTY)
+                else:
+                    iou.append(float(intersection) / float(union))
+        ious.append(iou)
+    ious = [mean(iou) for iou in zip(*ious)]  # mean accross images if per_image
+    return 100 * np.array(ious)
+
+
+# --------------------------- BINARY LOSSES ---------------------------
+
+
+def lovasz_hinge(logits, labels, per_image=True, ignore=None):
+    """
+    Binary Lovasz hinge loss
+      logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
+      labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
+      per_image: compute the loss per image instead of per batch
+      ignore: void class id
+    """
+    if per_image:
+        loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore))
+                    for log, lab in zip(logits, labels))
+    else:
+        loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore))
+    return loss
+
+
+def lovasz_hinge_flat(logits, labels):
+    """
+    Binary Lovasz hinge loss
+      logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
+      labels: [P] Tensor, binary ground truth labels (0 or 1)
+      ignore: label to ignore
+    """
+    if len(labels) == 0:
+        # only void pixels, the gradients should be 0
+        return logits.sum() * 0.
+    signs = 2. * labels.float() - 1.
+    errors = (1. - logits * Variable(signs))
+    errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
+    perm = perm.data
+    gt_sorted = labels[perm]
+    grad = lovasz_grad(gt_sorted)
+    loss = torch.dot(F.relu(errors_sorted), Variable(grad))
+    return loss
+
+
+def flatten_binary_scores(scores, labels, ignore=None):
+    """
+    Flattens predictions in the batch (binary case)
+    Remove labels equal to 'ignore'
+    """
+    scores = scores.view(-1)
+    labels = labels.view(-1)
+    if ignore is None:
+        return scores, labels
+    valid = (labels != ignore)
+    vscores = scores[valid]
+    vlabels = labels[valid]
+    return vscores, vlabels
+
+
+class StableBCELoss(torch.nn.modules.Module):
+    def __init__(self):
+        super(StableBCELoss, self).__init__()
+
+    def forward(self, input, target):
+        neg_abs = - input.abs()
+        loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log()
+        return loss.mean()
+
+
+def binary_xloss(logits, labels, ignore=None):
+    """
+    Binary Cross entropy loss
+      logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
+      labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
+      ignore: void class id
+    """
+    logits, labels = flatten_binary_scores(logits, labels, ignore)
+    loss = StableBCELoss()(logits, Variable(labels.float()))
+    return loss
+
+
+# --------------------------- MULTICLASS LOSSES ---------------------------
+
+
+def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=255, weighted=None):
+    """
+    Multi-class Lovasz-Softmax loss
+      probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1).
+              Interpreted as binary (sigmoid) output with outputs of size [B, H, W].
+      labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1)
+      classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
+      per_image: compute the loss per image instead of per batch
+      ignore: void class labels
+    """
+    if per_image:
+        loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes, weighted=weighted)
+                    for prob, lab in zip(probas, labels))
+    else:
+        loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes, weighted=weighted )
+    return loss
+
+
+def lovasz_softmax_flat(probas, labels, classes='present', weighted=None):
+    """
+    Multi-class Lovasz-Softmax loss
+      probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
+      labels: [P] Tensor, ground truth labels (between 0 and C - 1)
+      classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
+    """
+    if probas.numel() == 0:
+        # only void pixels, the gradients should be 0
+        return probas * 0.
+    C = probas.size(1)
+    losses = []
+    class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes
+    for c in class_to_sum:
+        fg = (labels == c).float()  # foreground for class c
+        if (classes is 'present' and fg.sum() == 0):
+            continue
+        if C == 1:
+            if len(classes) > 1:
+                raise ValueError('Sigmoid output possible only with 1 class')
+            class_pred = probas[:, 0]
+        else:
+            class_pred = probas[:, c]
+        errors = (Variable(fg) - class_pred).abs()
+        errors_sorted, perm = torch.sort(errors, 0, descending=True)
+        perm = perm.data
+        fg_sorted = fg[perm]
+        if weighted is not None:
+            losses.append(weighted[c]*torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
+        else:
+            losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
+    return mean(losses)
+
+
+def flatten_probas(probas, labels, ignore=None):
+    """
+    Flattens predictions in the batch
+    """
+    if probas.dim() == 3:
+        # assumes output of a sigmoid layer
+        B, H, W = probas.size()
+        probas = probas.view(B, 1, H, W)
+    B, C, H, W = probas.size()
+    probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C)  # B * H * W, C = P, C
+    labels = labels.view(-1)
+    if ignore is None:
+        return probas, labels
+    valid = (labels != ignore)
+    vprobas = probas[valid.nonzero().squeeze()]
+    vlabels = labels[valid]
+    return vprobas, vlabels
+
+
+def xloss(logits, labels, ignore=None):
+    """
+    Cross entropy loss
+    """
+    return F.cross_entropy(logits, Variable(labels), ignore_index=255)
+
+
+# --------------------------- HELPER FUNCTIONS ---------------------------
+def isnan(x):
+    return x != x
+
+
+def mean(l, ignore_nan=False, empty=0):
+    """
+    nanmean compatible with generators.
+    """
+    l = iter(l)
+    if ignore_nan:
+        l = ifilterfalse(isnan, l)
+    try:
+        n = 1
+        acc = next(l)
+    except StopIteration:
+        if empty == 'raise':
+            raise ValueError('Empty mean')
+        return empty
+    for n, v in enumerate(l, 2):
+        acc += v
+    if n == 1:
+        return acc
+    return acc / n
+
+# --------------------------- Class ---------------------------
+class LovaszSoftmax(nn.Module):
+    def __init__(self, per_image=False, ignore_index=255, weighted=None):
+        super(LovaszSoftmax, self).__init__()
+        self.lovasz_softmax = lovasz_softmax
+        self.per_image = per_image
+        self.ignore_index=ignore_index
+        self.weighted = weighted
+
+    def forward(self, pred, label):
+        pred = F.softmax(pred, dim=1)
+        return self.lovasz_softmax(pred, label, per_image=self.per_image, ignore=self.ignore_index, weighted=self.weighted)
\ No newline at end of file
diff --git a/preprocess/humanparsing/utils/miou.py b/preprocess/humanparsing/utils/miou.py
new file mode 100644
index 0000000000000000000000000000000000000000..51a2cc965a5c0cfd5497c9191906898da31485dd
--- /dev/null
+++ b/preprocess/humanparsing/utils/miou.py
@@ -0,0 +1,155 @@
+import cv2
+import os
+import numpy as np
+
+from collections import OrderedDict
+from PIL import Image as PILImage
+from utils.transforms import transform_parsing
+
+LABELS = ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat', \
+          'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm', 'Left-leg',
+          'Right-leg', 'Left-shoe', 'Right-shoe']
+
+
+# LABELS = ['Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs']
+
+def get_palette(num_cls):
+    """ Returns the color map for visualizing the segmentation mask.
+    Args:
+        num_cls: Number of classes
+    Returns:
+        The color map
+    """
+
+    n = num_cls
+    palette = [0] * (n * 3)
+    for j in range(0, n):
+        lab = j
+        palette[j * 3 + 0] = 0
+        palette[j * 3 + 1] = 0
+        palette[j * 3 + 2] = 0
+        i = 0
+        while lab:
+            palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
+            palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
+            palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
+            i += 1
+            lab >>= 3
+    return palette
+
+
+def get_confusion_matrix(gt_label, pred_label, num_classes):
+    """
+    Calcute the confusion matrix by given label and pred
+    :param gt_label: the ground truth label
+    :param pred_label: the pred label
+    :param num_classes: the nunber of class
+    :return: the confusion matrix
+    """
+    index = (gt_label * num_classes + pred_label).astype('int32')
+    label_count = np.bincount(index)
+    confusion_matrix = np.zeros((num_classes, num_classes))
+
+    for i_label in range(num_classes):
+        for i_pred_label in range(num_classes):
+            cur_index = i_label * num_classes + i_pred_label
+            if cur_index < len(label_count):
+                confusion_matrix[i_label, i_pred_label] = label_count[cur_index]
+
+    return confusion_matrix
+
+
+def compute_mean_ioU(preds, scales, centers, num_classes, datadir, input_size=[473, 473], dataset='val'):
+    val_file = os.path.join(datadir, dataset + '_id.txt')
+    val_id = [i_id.strip() for i_id in open(val_file)]
+
+    confusion_matrix = np.zeros((num_classes, num_classes))
+
+    for i, pred_out in enumerate(preds):
+        im_name = val_id[i]
+        gt_path = os.path.join(datadir, dataset + '_segmentations', im_name + '.png')
+        gt = np.array(PILImage.open(gt_path))
+        h, w = gt.shape
+        s = scales[i]
+        c = centers[i]
+        pred = transform_parsing(pred_out, c, s, w, h, input_size)
+
+        gt = np.asarray(gt, dtype=np.int32)
+        pred = np.asarray(pred, dtype=np.int32)
+
+        ignore_index = gt != 255
+
+        gt = gt[ignore_index]
+        pred = pred[ignore_index]
+
+        confusion_matrix += get_confusion_matrix(gt, pred, num_classes)
+
+    pos = confusion_matrix.sum(1)
+    res = confusion_matrix.sum(0)
+    tp = np.diag(confusion_matrix)
+
+    pixel_accuracy = (tp.sum() / pos.sum()) * 100
+    mean_accuracy = ((tp / np.maximum(1.0, pos)).mean()) * 100
+    IoU_array = (tp / np.maximum(1.0, pos + res - tp))
+    IoU_array = IoU_array * 100
+    mean_IoU = IoU_array.mean()
+    print('Pixel accuracy: %f \n' % pixel_accuracy)
+    print('Mean accuracy: %f \n' % mean_accuracy)
+    print('Mean IU: %f \n' % mean_IoU)
+    name_value = []
+
+    for i, (label, iou) in enumerate(zip(LABELS, IoU_array)):
+        name_value.append((label, iou))
+
+    name_value.append(('Pixel accuracy', pixel_accuracy))
+    name_value.append(('Mean accuracy', mean_accuracy))
+    name_value.append(('Mean IU', mean_IoU))
+    name_value = OrderedDict(name_value)
+    return name_value
+
+
+def compute_mean_ioU_file(preds_dir, num_classes, datadir, dataset='val'):
+    list_path = os.path.join(datadir, dataset + '_id.txt')
+    val_id = [i_id.strip() for i_id in open(list_path)]
+
+    confusion_matrix = np.zeros((num_classes, num_classes))
+
+    for i, im_name in enumerate(val_id):
+        gt_path = os.path.join(datadir, 'segmentations', im_name + '.png')
+        gt = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)
+
+        pred_path = os.path.join(preds_dir, im_name + '.png')
+        pred = np.asarray(PILImage.open(pred_path))
+
+        gt = np.asarray(gt, dtype=np.int32)
+        pred = np.asarray(pred, dtype=np.int32)
+
+        ignore_index = gt != 255
+
+        gt = gt[ignore_index]
+        pred = pred[ignore_index]
+
+        confusion_matrix += get_confusion_matrix(gt, pred, num_classes)
+
+    pos = confusion_matrix.sum(1)
+    res = confusion_matrix.sum(0)
+    tp = np.diag(confusion_matrix)
+
+    pixel_accuracy = (tp.sum() / pos.sum()) * 100
+    mean_accuracy = ((tp / np.maximum(1.0, pos)).mean()) * 100
+    IoU_array = (tp / np.maximum(1.0, pos + res - tp))
+    IoU_array = IoU_array * 100
+    mean_IoU = IoU_array.mean()
+    print('Pixel accuracy: %f \n' % pixel_accuracy)
+    print('Mean accuracy: %f \n' % mean_accuracy)
+    print('Mean IU: %f \n' % mean_IoU)
+    name_value = []
+
+    for i, (label, iou) in enumerate(zip(LABELS, IoU_array)):
+        name_value.append((label, iou))
+
+    name_value.append(('Pixel accuracy', pixel_accuracy))
+    name_value.append(('Mean accuracy', mean_accuracy))
+    name_value.append(('Mean IU', mean_IoU))
+    name_value = OrderedDict(name_value)
+    return name_value
diff --git a/preprocess/humanparsing/utils/schp.py b/preprocess/humanparsing/utils/schp.py
new file mode 100644
index 0000000000000000000000000000000000000000..f57470452fac8183dc5c17156439416c15bd3265
--- /dev/null
+++ b/preprocess/humanparsing/utils/schp.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   schp.py
+@Time    :   4/8/19 2:11 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+
+import os
+import torch
+import modules
+
+def moving_average(net1, net2, alpha=1):
+    for param1, param2 in zip(net1.parameters(), net2.parameters()):
+        param1.data *= (1.0 - alpha)
+        param1.data += param2.data * alpha
+
+
+def _check_bn(module, flag):
+    if issubclass(module.__class__, modules.bn.InPlaceABNSync):
+        flag[0] = True
+
+
+def check_bn(model):
+    flag = [False]
+    model.apply(lambda module: _check_bn(module, flag))
+    return flag[0]
+
+
+def reset_bn(module):
+    if issubclass(module.__class__, modules.bn.InPlaceABNSync):
+        module.running_mean = torch.zeros_like(module.running_mean)
+        module.running_var = torch.ones_like(module.running_var)
+
+
+def _get_momenta(module, momenta):
+    if issubclass(module.__class__, modules.bn.InPlaceABNSync):
+        momenta[module] = module.momentum
+
+
+def _set_momenta(module, momenta):
+    if issubclass(module.__class__, modules.bn.InPlaceABNSync):
+        module.momentum = momenta[module]
+
+
+def bn_re_estimate(loader, model):
+    if not check_bn(model):
+        print('No batch norm layer detected')
+        return
+    model.train()
+    momenta = {}
+    model.apply(reset_bn)
+    model.apply(lambda module: _get_momenta(module, momenta))
+    n = 0
+    for i_iter, batch in enumerate(loader):
+        images, labels, _ = batch
+        b = images.data.size(0)
+        momentum = b / (n + b)
+        for module in momenta.keys():
+            module.momentum = momentum
+        model(images)
+        n += b
+    model.apply(lambda module: _set_momenta(module, momenta))
+
+
+def save_schp_checkpoint(states, is_best_parsing, output_dir, filename='schp_checkpoint.pth.tar'):
+    save_path = os.path.join(output_dir, filename)
+    if os.path.exists(save_path):
+        os.remove(save_path)
+    torch.save(states, save_path)
+    if is_best_parsing and 'state_dict' in states:
+        best_save_path = os.path.join(output_dir, 'model_parsing_best.pth.tar')
+        if os.path.exists(best_save_path):
+            os.remove(best_save_path)
+        torch.save(states, best_save_path)
diff --git a/preprocess/humanparsing/utils/soft_dice_loss.py b/preprocess/humanparsing/utils/soft_dice_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb5895fd37467d36f213f941d1b01d6d6f7f194c
--- /dev/null
+++ b/preprocess/humanparsing/utils/soft_dice_loss.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   soft_dice_loss.py
+@Time    :   8/13/19 5:09 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+
+from __future__ import print_function, division
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+try:
+    from itertools import ifilterfalse
+except ImportError:  # py3k
+    from itertools import filterfalse as ifilterfalse
+
+
+def tversky_loss(probas, labels, alpha=0.5, beta=0.5, epsilon=1e-6):
+    '''
+    Tversky loss function.
+        probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
+        labels: [P] Tensor, ground truth labels (between 0 and C - 1)
+
+    Same as soft dice loss when alpha=beta=0.5.
+    Same as Jaccord loss when alpha=beta=1.0.
+    See `Tversky loss function for image segmentation using 3D fully convolutional deep networks`
+    https://arxiv.org/pdf/1706.05721.pdf
+    '''
+    C = probas.size(1)
+    losses = []
+    for c in list(range(C)):
+        fg = (labels == c).float()
+        if fg.sum() == 0:
+            continue
+        class_pred = probas[:, c]
+        p0 = class_pred
+        p1 = 1 - class_pred
+        g0 = fg
+        g1 = 1 - fg
+        numerator = torch.sum(p0 * g0)
+        denominator = numerator + alpha * torch.sum(p0 * g1) + beta * torch.sum(p1 * g0)
+        losses.append(1 - ((numerator) / (denominator + epsilon)))
+    return mean(losses)
+
+
+def flatten_probas(probas, labels, ignore=255):
+    """
+    Flattens predictions in the batch
+    """
+    B, C, H, W = probas.size()
+    probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C)  # B * H * W, C = P, C
+    labels = labels.view(-1)
+    if ignore is None:
+        return probas, labels
+    valid = (labels != ignore)
+    vprobas = probas[valid.nonzero().squeeze()]
+    vlabels = labels[valid]
+    return vprobas, vlabels
+
+
+def isnan(x):
+    return x != x
+
+
+def mean(l, ignore_nan=False, empty=0):
+    """
+    nanmean compatible with generators.
+    """
+    l = iter(l)
+    if ignore_nan:
+        l = ifilterfalse(isnan, l)
+    try:
+        n = 1
+        acc = next(l)
+    except StopIteration:
+        if empty == 'raise':
+            raise ValueError('Empty mean')
+        return empty
+    for n, v in enumerate(l, 2):
+        acc += v
+    if n == 1:
+        return acc
+    return acc / n
+
+
+class SoftDiceLoss(nn.Module):
+    def __init__(self, ignore_index=255):
+        super(SoftDiceLoss, self).__init__()
+        self.ignore_index = ignore_index
+
+    def forward(self, pred, label):
+        pred = F.softmax(pred, dim=1)
+        return tversky_loss(*flatten_probas(pred, label, ignore=self.ignore_index), alpha=0.5, beta=0.5)
+
+
+class SoftJaccordLoss(nn.Module):
+    def __init__(self, ignore_index=255):
+        super(SoftJaccordLoss, self).__init__()
+        self.ignore_index = ignore_index
+
+    def forward(self, pred, label):
+        pred = F.softmax(pred, dim=1)
+        return tversky_loss(*flatten_probas(pred, label, ignore=self.ignore_index), alpha=1.0, beta=1.0)
diff --git a/preprocess/humanparsing/utils/transforms.py b/preprocess/humanparsing/utils/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..1442a728938ca19fcb4ac21ae6588266df45631c
--- /dev/null
+++ b/preprocess/humanparsing/utils/transforms.py
@@ -0,0 +1,167 @@
+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Written by Bin Xiao (Bin.Xiao@microsoft.com)
+# ------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import cv2
+import torch
+
+class BRG2Tensor_transform(object):
+    def __call__(self, pic):
+        img = torch.from_numpy(pic.transpose((2, 0, 1)))
+        if isinstance(img, torch.ByteTensor):
+            return img.float()
+        else:
+            return img
+
+class BGR2RGB_transform(object):
+    def __call__(self, tensor):
+        return tensor[[2,1,0],:,:]
+
+def flip_back(output_flipped, matched_parts):
+    '''
+    ouput_flipped: numpy.ndarray(batch_size, num_joints, height, width)
+    '''
+    assert output_flipped.ndim == 4,\
+        'output_flipped should be [batch_size, num_joints, height, width]'
+
+    output_flipped = output_flipped[:, :, :, ::-1]
+
+    for pair in matched_parts:
+        tmp = output_flipped[:, pair[0], :, :].copy()
+        output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
+        output_flipped[:, pair[1], :, :] = tmp
+
+    return output_flipped
+
+
+def fliplr_joints(joints, joints_vis, width, matched_parts):
+    """
+    flip coords
+    """
+    # Flip horizontal
+    joints[:, 0] = width - joints[:, 0] - 1
+
+    # Change left-right parts
+    for pair in matched_parts:
+        joints[pair[0], :], joints[pair[1], :] = \
+            joints[pair[1], :], joints[pair[0], :].copy()
+        joints_vis[pair[0], :], joints_vis[pair[1], :] = \
+            joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
+
+    return joints*joints_vis, joints_vis
+
+
+def transform_preds(coords, center, scale, input_size):
+    target_coords = np.zeros(coords.shape)
+    trans = get_affine_transform(center, scale, 0, input_size, inv=1)
+    for p in range(coords.shape[0]):
+        target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
+    return target_coords
+
+def transform_parsing(pred, center, scale, width, height, input_size):
+
+    trans = get_affine_transform(center, scale, 0, input_size, inv=1)
+    target_pred = cv2.warpAffine(
+            pred,
+            trans,
+            (int(width), int(height)), #(int(width), int(height)),
+            flags=cv2.INTER_NEAREST,
+            borderMode=cv2.BORDER_CONSTANT,
+            borderValue=(0))
+
+    return target_pred
+
+def transform_logits(logits, center, scale, width, height, input_size):
+
+    trans = get_affine_transform(center, scale, 0, input_size, inv=1)
+    channel = logits.shape[2]
+    target_logits = []
+    for i in range(channel):
+        target_logit = cv2.warpAffine(
+            logits[:,:,i],
+            trans,
+            (int(width), int(height)), #(int(width), int(height)),
+            flags=cv2.INTER_LINEAR,
+            borderMode=cv2.BORDER_CONSTANT,
+            borderValue=(0))
+        target_logits.append(target_logit)
+    target_logits = np.stack(target_logits,axis=2)
+
+    return target_logits
+
+
+def get_affine_transform(center,
+                         scale,
+                         rot,
+                         output_size,
+                         shift=np.array([0, 0], dtype=np.float32),
+                         inv=0):
+    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
+        print(scale)
+        scale = np.array([scale, scale])
+
+    scale_tmp = scale
+
+    src_w = scale_tmp[0]
+    dst_w = output_size[1]
+    dst_h = output_size[0]
+
+    rot_rad = np.pi * rot / 180
+    src_dir = get_dir([0, src_w * -0.5], rot_rad)
+    dst_dir = np.array([0, (dst_w-1) * -0.5], np.float32)
+
+    src = np.zeros((3, 2), dtype=np.float32)
+    dst = np.zeros((3, 2), dtype=np.float32)
+    src[0, :] = center + scale_tmp * shift
+    src[1, :] = center + src_dir + scale_tmp * shift
+    dst[0, :] = [(dst_w-1) * 0.5, (dst_h-1) * 0.5]
+    dst[1, :] = np.array([(dst_w-1) * 0.5, (dst_h-1) * 0.5]) + dst_dir
+
+    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
+    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
+
+    if inv:
+        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
+    else:
+        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
+
+    return trans
+
+
+def affine_transform(pt, t):
+    new_pt = np.array([pt[0], pt[1], 1.]).T
+    new_pt = np.dot(t, new_pt)
+    return new_pt[:2]
+
+
+def get_3rd_point(a, b):
+    direct = a - b
+    return b + np.array([-direct[1], direct[0]], dtype=np.float32)
+
+
+def get_dir(src_point, rot_rad):
+    sn, cs = np.sin(rot_rad), np.cos(rot_rad)
+
+    src_result = [0, 0]
+    src_result[0] = src_point[0] * cs - src_point[1] * sn
+    src_result[1] = src_point[0] * sn + src_point[1] * cs
+
+    return src_result
+
+
+def crop(img, center, scale, output_size, rot=0):
+    trans = get_affine_transform(center, scale, rot, output_size)
+
+    dst_img = cv2.warpAffine(img,
+                             trans,
+                             (int(output_size[1]), int(output_size[0])),
+                             flags=cv2.INTER_LINEAR)
+
+    return dst_img
diff --git a/preprocess/humanparsing/utils/warmup_scheduler.py b/preprocess/humanparsing/utils/warmup_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2528a9c598d5ee3477d60e2f8591ec37e8afb41d
--- /dev/null
+++ b/preprocess/humanparsing/utils/warmup_scheduler.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author  :   Peike Li
+@Contact :   peike.li@yahoo.com
+@File    :   warmup_scheduler.py
+@Time    :   3/28/19 2:24 PM
+@Desc    :   
+@License :   This source code is licensed under the license found in the 
+             LICENSE file in the root directory of this source tree.
+"""
+
+import math
+from torch.optim.lr_scheduler import _LRScheduler
+
+
+class GradualWarmupScheduler(_LRScheduler):
+    """ Gradually warm-up learning rate with cosine annealing in optimizer.
+    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.
+    """
+
+    def __init__(self, optimizer, total_epoch, eta_min=0, warmup_epoch=10, last_epoch=-1):
+        self.total_epoch = total_epoch
+        self.eta_min = eta_min
+        self.warmup_epoch = warmup_epoch
+        super(GradualWarmupScheduler, self).__init__(optimizer, last_epoch)
+
+    def get_lr(self):
+        if self.last_epoch <= self.warmup_epoch:
+            return [self.eta_min + self.last_epoch*(base_lr - self.eta_min)/self.warmup_epoch for base_lr in self.base_lrs]
+        else:
+            return [self.eta_min + (base_lr-self.eta_min)*(1+math.cos(math.pi*(self.last_epoch-self.warmup_epoch)/(self.total_epoch-self.warmup_epoch))) / 2 for base_lr in self.base_lrs]
+
+
+class SGDRScheduler(_LRScheduler):
+    """ Consine annealing with warm up and restarts.
+    Proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts`.
+    """
+    def __init__(self, optimizer, total_epoch=150, start_cyclical=100, cyclical_base_lr=7e-4, cyclical_epoch=10, eta_min=0, warmup_epoch=10, last_epoch=-1):
+        self.total_epoch = total_epoch
+        self.start_cyclical = start_cyclical
+        self.cyclical_epoch = cyclical_epoch
+        self.cyclical_base_lr = cyclical_base_lr
+        self.eta_min = eta_min
+        self.warmup_epoch = warmup_epoch
+        super(SGDRScheduler, self).__init__(optimizer, last_epoch)
+
+    def get_lr(self):
+        if self.last_epoch < self.warmup_epoch:
+            return [self.eta_min + self.last_epoch*(base_lr - self.eta_min)/self.warmup_epoch for base_lr in self.base_lrs]
+        elif self.last_epoch < self.start_cyclical:
+            return [self.eta_min + (base_lr-self.eta_min)*(1+math.cos(math.pi*(self.last_epoch-self.warmup_epoch)/(self.start_cyclical-self.warmup_epoch))) / 2 for base_lr in self.base_lrs]
+        else:
+            return [self.eta_min + (self.cyclical_base_lr-self.eta_min)*(1+math.cos(math.pi* ((self.last_epoch-self.start_cyclical)% self.cyclical_epoch)/self.cyclical_epoch)) / 2 for base_lr in self.base_lrs]
+
+
+if __name__ == '__main__':
+    import matplotlib.pyplot as plt
+    import torch
+    model = torch.nn.Linear(10, 2)
+    optimizer = torch.optim.SGD(params=model.parameters(), lr=7e-3, momentum=0.9, weight_decay=5e-4)
+    scheduler_warmup = SGDRScheduler(optimizer, total_epoch=150, eta_min=7e-5, warmup_epoch=10, start_cyclical=100, cyclical_base_lr=3.5e-3, cyclical_epoch=10)
+    lr = []
+    for epoch in range(0,150):
+        scheduler_warmup.step(epoch)
+        lr.append(scheduler_warmup.get_lr())
+    plt.style.use('ggplot')
+    plt.plot(list(range(0,150)), lr)
+    plt.show()
+
diff --git a/preprocess/openpose/__pycache__/run_openpose.cpython-310.pyc b/preprocess/openpose/__pycache__/run_openpose.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9de68647f4065e9b66e8c04ba763860fe229faa5
Binary files /dev/null and b/preprocess/openpose/__pycache__/run_openpose.cpython-310.pyc differ
diff --git a/preprocess/openpose/annotator/__pycache__/util.cpython-310.pyc b/preprocess/openpose/annotator/__pycache__/util.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33df1a203e13b3e18c7d7c90f22a12c1daa2e9c7
Binary files /dev/null and b/preprocess/openpose/annotator/__pycache__/util.cpython-310.pyc differ
diff --git a/preprocess/openpose/annotator/openpose/LICENSE b/preprocess/openpose/annotator/openpose/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..6f60b76d35fa1012809985780964a5068adce4fd
--- /dev/null
+++ b/preprocess/openpose/annotator/openpose/LICENSE
@@ -0,0 +1,108 @@
+OPENPOSE: MULTIPERSON KEYPOINT DETECTION
+SOFTWARE LICENSE AGREEMENT
+ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
+
+BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.  IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
+
+This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement).  All rights not specifically granted to you in this Agreement are reserved for Licensor. 
+
+RESERVATION OF OWNERSHIP AND GRANT OF LICENSE: 
+Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive, 
+non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement.  As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor,  including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
+
+CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement.  Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
+
+COPYRIGHT: The Software is owned by Licensor and is protected by United 
+States copyright laws and applicable international treaties and/or conventions.
+
+PERMITTED USES:  The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
+
+DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement.  You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.
+
+BACKUPS:  If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.
+
+USES NOT PERMITTED:  You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor.
+
+You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.
+
+ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.
+
+TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.
+
+The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement.  Licensee may terminate this Agreement by ceasing using the Software.  Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.
+
+FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.
+
+DISCLAIMER OF WARRANTIES:  THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT.  LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
+
+SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.  
+
+EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
+
+EXPORT REGULATION: Licensee agrees to comply with any and all applicable 
+U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
+
+SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
+
+NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.
+
+GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles.  You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania.
+
+ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.
+
+
+
+************************************************************************
+
+THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
+
+This project incorporates material from the project(s) listed below (collectively, "Third Party Code").  This Third Party Code is licensed to you under their original license terms set forth below.  We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.
+ 
+1.	Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)
+
+COPYRIGHT
+
+All contributions by the University of California:
+Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014-2017, the respective contributors
+All rights reserved.
+
+Caffe uses a shared copyright model: each contributor holds copyright over
+their contributions to Caffe. The project versioning records all such
+contribution and copyright details. If a contributor wants to further mark
+their specific copyright on a particular contribution, they should indicate
+their copyright solely in the commit message of the change when it is
+committed.
+
+LICENSE
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met: 
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer. 
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+CONTRIBUTION AGREEMENT
+
+By contributing to the BVLC/caffe repository through pull-request, comment,
+or otherwise, the contributor releases their content to the
+license and copyright terms herein.
+
+************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
\ No newline at end of file
diff --git a/preprocess/openpose/annotator/openpose/__init__.py b/preprocess/openpose/annotator/openpose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf6a81da4affb1b3d0d6101c72a86c7d61188ccb
--- /dev/null
+++ b/preprocess/openpose/annotator/openpose/__init__.py
@@ -0,0 +1,102 @@
+# Openpose
+# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
+# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
+# 3rd Edited by ControlNet
+# 4th Edited by ControlNet (added face and correct hands)
+
+import os
+import pdb
+
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+import torch
+import numpy as np
+from . import util
+from .body import Body
+from .hand import Hand
+from .face import Face
+from annotator.util import annotator_ckpts_path
+
+body_model_path = "https://huggingface.co./lllyasviel/Annotators/resolve/main/body_pose_model.pth"
+hand_model_path = "https://huggingface.co./lllyasviel/Annotators/resolve/main/hand_pose_model.pth"
+face_model_path = "https://huggingface.co./lllyasviel/Annotators/resolve/main/facenet.pth"
+
+
+def draw_pose(pose, H, W, draw_body=True, draw_hand=True, draw_face=True):
+    bodies = pose['bodies']
+    faces = pose['faces']
+    hands = pose['hands']
+    candidate = bodies['candidate']
+    subset = bodies['subset']
+    canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
+
+    if draw_body:
+        canvas = util.draw_bodypose(canvas, candidate, subset)
+
+    if draw_hand:
+        canvas = util.draw_handpose(canvas, hands)
+
+    if draw_face:
+        canvas = util.draw_facepose(canvas, faces)
+
+    return canvas
+
+
+class OpenposeDetector:
+    def __init__(self):
+        body_modelpath = os.path.join(annotator_ckpts_path, "body_pose_model.pth")
+        # hand_modelpath = os.path.join(annotator_ckpts_path, "hand_pose_model.pth")
+        # face_modelpath = os.path.join(annotator_ckpts_path, "facenet.pth")
+
+        if not os.path.exists(body_modelpath):
+            from basicsr.utils.download_util import load_file_from_url
+            load_file_from_url(body_model_path, model_dir=annotator_ckpts_path)
+
+        # if not os.path.exists(hand_modelpath):
+        #     from basicsr.utils.download_util import load_file_from_url
+        #     load_file_from_url(hand_model_path, model_dir=annotator_ckpts_path)
+
+        # if not os.path.exists(face_modelpath):
+        #     from basicsr.utils.download_util import load_file_from_url
+        #     load_file_from_url(face_model_path, model_dir=annotator_ckpts_path)
+
+        self.body_estimation = Body(body_modelpath)
+        # self.hand_estimation = Hand(hand_modelpath)
+        # self.face_estimation = Face(face_modelpath)
+
+
+    def __call__(self, oriImg, hand_and_face=False, return_is_index=False):
+        oriImg = oriImg[:, :, ::-1].copy()
+        H, W, C = oriImg.shape
+        with torch.no_grad():
+            candidate, subset = self.body_estimation(oriImg)
+            hands = []
+            faces = []
+            if hand_and_face:
+                # Hand
+                hands_list = util.handDetect(candidate, subset, oriImg)
+                for x, y, w, is_left in hands_list:
+                    peaks = self.hand_estimation(oriImg[y:y + w, x:x + w, :]).astype(np.float32)
+                    if peaks.ndim == 2 and peaks.shape[1] == 2:
+                        peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
+                        peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
+                        hands.append(peaks.tolist())
+                # Face
+                faces_list = util.faceDetect(candidate, subset, oriImg)
+                for x, y, w in faces_list:
+                    heatmaps = self.face_estimation(oriImg[y:y + w, x:x + w, :])
+                    peaks = self.face_estimation.compute_peaks_from_heatmaps(heatmaps).astype(np.float32)
+                    if peaks.ndim == 2 and peaks.shape[1] == 2:
+                        peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
+                        peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
+                        faces.append(peaks.tolist())
+            if candidate.ndim == 2 and candidate.shape[1] == 4:
+                candidate = candidate[:, :2]
+                candidate[:, 0] /= float(W)
+                candidate[:, 1] /= float(H)
+            bodies = dict(candidate=candidate.tolist(), subset=subset.tolist())
+            pose = dict(bodies=bodies, hands=hands, faces=faces)
+            if return_is_index:
+                return pose
+            else:
+                return pose, draw_pose(pose, H, W)
diff --git a/preprocess/openpose/annotator/openpose/__pycache__/__init__.cpython-310.pyc b/preprocess/openpose/annotator/openpose/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..728e414b06cbf3bf39f485db24d7c0d63c286d1f
Binary files /dev/null and b/preprocess/openpose/annotator/openpose/__pycache__/__init__.cpython-310.pyc differ
diff --git a/preprocess/openpose/annotator/openpose/__pycache__/body.cpython-310.pyc b/preprocess/openpose/annotator/openpose/__pycache__/body.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12c41e0e56328c954e14dd239695f6f820967a69
Binary files /dev/null and b/preprocess/openpose/annotator/openpose/__pycache__/body.cpython-310.pyc differ
diff --git a/preprocess/openpose/annotator/openpose/__pycache__/face.cpython-310.pyc b/preprocess/openpose/annotator/openpose/__pycache__/face.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66e87892a34bc87ffb13b79777fd7fca2d822540
Binary files /dev/null and b/preprocess/openpose/annotator/openpose/__pycache__/face.cpython-310.pyc differ
diff --git a/preprocess/openpose/annotator/openpose/__pycache__/hand.cpython-310.pyc b/preprocess/openpose/annotator/openpose/__pycache__/hand.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a5ecff54a03e5284e62484a003b1147e1c3d31f
Binary files /dev/null and b/preprocess/openpose/annotator/openpose/__pycache__/hand.cpython-310.pyc differ
diff --git a/preprocess/openpose/annotator/openpose/__pycache__/model.cpython-310.pyc b/preprocess/openpose/annotator/openpose/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..812b981c757bfaaaa10b88278049dab6636ec44a
Binary files /dev/null and b/preprocess/openpose/annotator/openpose/__pycache__/model.cpython-310.pyc differ
diff --git a/preprocess/openpose/annotator/openpose/__pycache__/util.cpython-310.pyc b/preprocess/openpose/annotator/openpose/__pycache__/util.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d81ef8d82b2cd2f43c44c5cbc9d0a6410b10fe8
Binary files /dev/null and b/preprocess/openpose/annotator/openpose/__pycache__/util.cpython-310.pyc differ
diff --git a/preprocess/openpose/annotator/openpose/body.py b/preprocess/openpose/annotator/openpose/body.py
new file mode 100644
index 0000000000000000000000000000000000000000..27012f28ad0e736f7e7bd877f35db3b64fe8bd9c
--- /dev/null
+++ b/preprocess/openpose/annotator/openpose/body.py
@@ -0,0 +1,229 @@
+from pathlib import Path
+import sys
+PROJECT_ROOT = Path(__file__).absolute().parents[3].absolute()
+# print(PROJECT_ROOT)
+
+import cv2
+import numpy as np
+import math
+import time
+from scipy.ndimage.filters import gaussian_filter
+import matplotlib.pyplot as plt
+import matplotlib
+import torch
+from torchvision import transforms
+
+from . import util
+from .model import bodypose_model
+
+
+class Body(object):
+    def __init__(self, model_path):
+        self.model = bodypose_model()
+        # if torch.cuda.is_available():
+        #     self.model = self.model.cuda()
+        #     print('cuda')
+        model_dict = util.transfer(self.model, torch.load(model_path))
+        self.model.load_state_dict(model_dict)
+        self.model.eval()
+
+
+    def __call__(self, oriImg):
+        # scale_search = [0.5, 1.0, 1.5, 2.0]
+        scale_search = [0.5]
+        boxsize = 368
+        stride = 8
+        padValue = 128
+        thre1 = 0.1
+        thre2 = 0.05
+        multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
+        heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
+        paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
+
+        for m in range(len(multiplier)):
+            scale = multiplier[m]
+            imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale)
+            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
+            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
+            im = np.ascontiguousarray(im)
+
+            data = torch.from_numpy(im).float()
+            if torch.cuda.is_available():
+                data = data.cuda()
+            # data = data.permute([2, 0, 1]).unsqueeze(0).float()
+            with torch.no_grad():
+                Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
+
+            Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
+            Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
+
+            # extract outputs, resize, and remove padding
+            # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0))  # output 1 is heatmaps
+            heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0))  # output 1 is heatmaps
+            heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
+            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+            heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1]))
+
+            # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0))  # output 0 is PAFs
+            paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0))  # output 0 is PAFs
+            paf = util.smart_resize_k(paf, fx=stride, fy=stride)
+            paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+            paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1]))
+
+            heatmap_avg += heatmap_avg + heatmap / len(multiplier)
+            paf_avg += + paf / len(multiplier)
+
+        all_peaks = []
+        peak_counter = 0
+
+        for part in range(18):
+            map_ori = heatmap_avg[:, :, part]
+            one_heatmap = gaussian_filter(map_ori, sigma=3)
+
+            map_left = np.zeros(one_heatmap.shape)
+            map_left[1:, :] = one_heatmap[:-1, :]
+            map_right = np.zeros(one_heatmap.shape)
+            map_right[:-1, :] = one_heatmap[1:, :]
+            map_up = np.zeros(one_heatmap.shape)
+            map_up[:, 1:] = one_heatmap[:, :-1]
+            map_down = np.zeros(one_heatmap.shape)
+            map_down[:, :-1] = one_heatmap[:, 1:]
+
+            peaks_binary = np.logical_and.reduce(
+                (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down,
+                 one_heatmap > thre1))
+            peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0]))  # note reverse
+            peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
+            peak_id = range(peak_counter, peak_counter + len(peaks))
+            peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
+
+            all_peaks.append(peaks_with_score_and_id)
+            peak_counter += len(peaks)
+
+        # find connection in the specified sequence, center 29 is in the position 15
+        limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
+                   [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
+                   [1, 16], [16, 18], [3, 17], [6, 18]]
+        # the middle joints heatmap correpondence
+        mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
+                  [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
+                  [55, 56], [37, 38], [45, 46]]
+
+        connection_all = []
+        special_k = []
+        mid_num = 10
+
+        for k in range(len(mapIdx)):
+            score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
+            candA = all_peaks[limbSeq[k][0] - 1]
+            candB = all_peaks[limbSeq[k][1] - 1]
+            nA = len(candA)
+            nB = len(candB)
+            indexA, indexB = limbSeq[k]
+            if (nA != 0 and nB != 0):
+                connection_candidate = []
+                for i in range(nA):
+                    for j in range(nB):
+                        vec = np.subtract(candB[j][:2], candA[i][:2])
+                        norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
+                        norm = max(0.001, norm)
+                        vec = np.divide(vec, norm)
+
+                        startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
+                                            np.linspace(candA[i][1], candB[j][1], num=mid_num)))
+
+                        vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
+                                          for I in range(len(startend))])
+                        vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
+                                          for I in range(len(startend))])
+
+                        score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
+                        score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
+                            0.5 * oriImg.shape[0] / norm - 1, 0)
+                        criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
+                        criterion2 = score_with_dist_prior > 0
+                        if criterion1 and criterion2:
+                            connection_candidate.append(
+                                [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
+
+                connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
+                connection = np.zeros((0, 5))
+                for c in range(len(connection_candidate)):
+                    i, j, s = connection_candidate[c][0:3]
+                    if (i not in connection[:, 3] and j not in connection[:, 4]):
+                        connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
+                        if (len(connection) >= min(nA, nB)):
+                            break
+
+                connection_all.append(connection)
+            else:
+                special_k.append(k)
+                connection_all.append([])
+
+        # last number in each row is the total parts number of that person
+        # the second last number in each row is the score of the overall configuration
+        subset = -1 * np.ones((0, 20))
+        candidate = np.array([item for sublist in all_peaks for item in sublist])
+
+        for k in range(len(mapIdx)):
+            if k not in special_k:
+                partAs = connection_all[k][:, 0]
+                partBs = connection_all[k][:, 1]
+                indexA, indexB = np.array(limbSeq[k]) - 1
+
+                for i in range(len(connection_all[k])):  # = 1:size(temp,1)
+                    found = 0
+                    subset_idx = [-1, -1]
+                    for j in range(len(subset)):  # 1:size(subset,1):
+                        if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
+                            subset_idx[found] = j
+                            found += 1
+
+                    if found == 1:
+                        j = subset_idx[0]
+                        if subset[j][indexB] != partBs[i]:
+                            subset[j][indexB] = partBs[i]
+                            subset[j][-1] += 1
+                            subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
+                    elif found == 2:  # if found 2 and disjoint, merge them
+                        j1, j2 = subset_idx
+                        membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
+                        if len(np.nonzero(membership == 2)[0]) == 0:  # merge
+                            subset[j1][:-2] += (subset[j2][:-2] + 1)
+                            subset[j1][-2:] += subset[j2][-2:]
+                            subset[j1][-2] += connection_all[k][i][2]
+                            subset = np.delete(subset, j2, 0)
+                        else:  # as like found == 1
+                            subset[j1][indexB] = partBs[i]
+                            subset[j1][-1] += 1
+                            subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
+
+                    # if find no partA in the subset, create a new subset
+                    elif not found and k < 17:
+                        row = -1 * np.ones(20)
+                        row[indexA] = partAs[i]
+                        row[indexB] = partBs[i]
+                        row[-1] = 2
+                        row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
+                        subset = np.vstack([subset, row])
+        # delete some rows of subset which has few parts occur
+        deleteIdx = []
+        for i in range(len(subset)):
+            if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
+                deleteIdx.append(i)
+        subset = np.delete(subset, deleteIdx, axis=0)
+
+        # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
+        # candidate: x, y, score, id
+        return candidate, subset
+
+
+# if __name__ == "__main__":
+#     body_estimation = Body('../model/body_pose_model.pth')
+
+#     test_image = '../images/ski.jpg'
+#     oriImg = cv2.imread(test_image)  # B,G,R order
+#     candidate, subset = body_estimation(oriImg)
+#     canvas = util.draw_bodypose(oriImg, candidate, subset)
+#     plt.imshow(canvas[:, :, [2, 1, 0]])
+#     plt.show()
diff --git a/preprocess/openpose/annotator/openpose/face.py b/preprocess/openpose/annotator/openpose/face.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cf6241830325abdadcd699361992fb30ffdd694
--- /dev/null
+++ b/preprocess/openpose/annotator/openpose/face.py
@@ -0,0 +1,368 @@
+import logging
+import numpy as np
+from torchvision.transforms import ToTensor, ToPILImage
+import torch
+import torch.nn.functional as F
+import cv2
+
+from . import util
+from torch.nn import Conv2d, Module, ReLU, MaxPool2d, init
+
+
+class FaceNet(Module):
+    """Model the cascading heatmaps. """
+    def __init__(self):
+        super(FaceNet, self).__init__()
+        # cnn to make feature map
+        self.relu = ReLU()
+        self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2)
+        self.conv1_1 = Conv2d(in_channels=3, out_channels=64,
+                              kernel_size=3, stride=1, padding=1)
+        self.conv1_2 = Conv2d(
+            in_channels=64, out_channels=64, kernel_size=3, stride=1,
+            padding=1)
+        self.conv2_1 = Conv2d(
+            in_channels=64, out_channels=128, kernel_size=3, stride=1,
+            padding=1)
+        self.conv2_2 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=3, stride=1,
+            padding=1)
+        self.conv3_1 = Conv2d(
+            in_channels=128, out_channels=256, kernel_size=3, stride=1,
+            padding=1)
+        self.conv3_2 = Conv2d(
+            in_channels=256, out_channels=256, kernel_size=3, stride=1,
+            padding=1)
+        self.conv3_3 = Conv2d(
+            in_channels=256, out_channels=256, kernel_size=3, stride=1,
+            padding=1)
+        self.conv3_4 = Conv2d(
+            in_channels=256, out_channels=256, kernel_size=3, stride=1,
+            padding=1)
+        self.conv4_1 = Conv2d(
+            in_channels=256, out_channels=512, kernel_size=3, stride=1,
+            padding=1)
+        self.conv4_2 = Conv2d(
+            in_channels=512, out_channels=512, kernel_size=3, stride=1,
+            padding=1)
+        self.conv4_3 = Conv2d(
+            in_channels=512, out_channels=512, kernel_size=3, stride=1,
+            padding=1)
+        self.conv4_4 = Conv2d(
+            in_channels=512, out_channels=512, kernel_size=3, stride=1,
+            padding=1)
+        self.conv5_1 = Conv2d(
+            in_channels=512, out_channels=512, kernel_size=3, stride=1,
+            padding=1)
+        self.conv5_2 = Conv2d(
+            in_channels=512, out_channels=512, kernel_size=3, stride=1,
+            padding=1)
+        self.conv5_3_CPM = Conv2d(
+            in_channels=512, out_channels=128, kernel_size=3, stride=1,
+            padding=1)
+
+        # stage1
+        self.conv6_1_CPM = Conv2d(
+            in_channels=128, out_channels=512, kernel_size=1, stride=1,
+            padding=0)
+        self.conv6_2_CPM = Conv2d(
+            in_channels=512, out_channels=71, kernel_size=1, stride=1,
+            padding=0)
+
+        # stage2
+        self.Mconv1_stage2 = Conv2d(
+            in_channels=199, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv2_stage2 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv3_stage2 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv4_stage2 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv5_stage2 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv6_stage2 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=1, stride=1,
+            padding=0)
+        self.Mconv7_stage2 = Conv2d(
+            in_channels=128, out_channels=71, kernel_size=1, stride=1,
+            padding=0)
+
+        # stage3
+        self.Mconv1_stage3 = Conv2d(
+            in_channels=199, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv2_stage3 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv3_stage3 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv4_stage3 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv5_stage3 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv6_stage3 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=1, stride=1,
+            padding=0)
+        self.Mconv7_stage3 = Conv2d(
+            in_channels=128, out_channels=71, kernel_size=1, stride=1,
+            padding=0)
+
+        # stage4
+        self.Mconv1_stage4 = Conv2d(
+            in_channels=199, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv2_stage4 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv3_stage4 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv4_stage4 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv5_stage4 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv6_stage4 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=1, stride=1,
+            padding=0)
+        self.Mconv7_stage4 = Conv2d(
+            in_channels=128, out_channels=71, kernel_size=1, stride=1,
+            padding=0)
+
+        # stage5
+        self.Mconv1_stage5 = Conv2d(
+            in_channels=199, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv2_stage5 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv3_stage5 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv4_stage5 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv5_stage5 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv6_stage5 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=1, stride=1,
+            padding=0)
+        self.Mconv7_stage5 = Conv2d(
+            in_channels=128, out_channels=71, kernel_size=1, stride=1,
+            padding=0)
+
+        # stage6
+        self.Mconv1_stage6 = Conv2d(
+            in_channels=199, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv2_stage6 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv3_stage6 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv4_stage6 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv5_stage6 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=7, stride=1,
+            padding=3)
+        self.Mconv6_stage6 = Conv2d(
+            in_channels=128, out_channels=128, kernel_size=1, stride=1,
+            padding=0)
+        self.Mconv7_stage6 = Conv2d(
+            in_channels=128, out_channels=71, kernel_size=1, stride=1,
+            padding=0)
+
+        for m in self.modules():
+            if isinstance(m, Conv2d):
+                init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        """Return a list of heatmaps."""
+        heatmaps = []
+
+        h = self.relu(self.conv1_1(x))
+        h = self.relu(self.conv1_2(h))
+        h = self.max_pooling_2d(h)
+        h = self.relu(self.conv2_1(h))
+        h = self.relu(self.conv2_2(h))
+        h = self.max_pooling_2d(h)
+        h = self.relu(self.conv3_1(h))
+        h = self.relu(self.conv3_2(h))
+        h = self.relu(self.conv3_3(h))
+        h = self.relu(self.conv3_4(h))
+        h = self.max_pooling_2d(h)
+        h = self.relu(self.conv4_1(h))
+        h = self.relu(self.conv4_2(h))
+        h = self.relu(self.conv4_3(h))
+        h = self.relu(self.conv4_4(h))
+        h = self.relu(self.conv5_1(h))
+        h = self.relu(self.conv5_2(h))
+        h = self.relu(self.conv5_3_CPM(h))
+        feature_map = h
+
+        # stage1
+        h = self.relu(self.conv6_1_CPM(h))
+        h = self.conv6_2_CPM(h)
+        heatmaps.append(h)
+
+        # stage2
+        h = torch.cat([h, feature_map], dim=1)  # channel concat
+        h = self.relu(self.Mconv1_stage2(h))
+        h = self.relu(self.Mconv2_stage2(h))
+        h = self.relu(self.Mconv3_stage2(h))
+        h = self.relu(self.Mconv4_stage2(h))
+        h = self.relu(self.Mconv5_stage2(h))
+        h = self.relu(self.Mconv6_stage2(h))
+        h = self.Mconv7_stage2(h)
+        heatmaps.append(h)
+
+        # stage3
+        h = torch.cat([h, feature_map], dim=1)  # channel concat
+        h = self.relu(self.Mconv1_stage3(h))
+        h = self.relu(self.Mconv2_stage3(h))
+        h = self.relu(self.Mconv3_stage3(h))
+        h = self.relu(self.Mconv4_stage3(h))
+        h = self.relu(self.Mconv5_stage3(h))
+        h = self.relu(self.Mconv6_stage3(h))
+        h = self.Mconv7_stage3(h)
+        heatmaps.append(h)
+
+        # stage4
+        h = torch.cat([h, feature_map], dim=1)  # channel concat
+        h = self.relu(self.Mconv1_stage4(h))
+        h = self.relu(self.Mconv2_stage4(h))
+        h = self.relu(self.Mconv3_stage4(h))
+        h = self.relu(self.Mconv4_stage4(h))
+        h = self.relu(self.Mconv5_stage4(h))
+        h = self.relu(self.Mconv6_stage4(h))
+        h = self.Mconv7_stage4(h)
+        heatmaps.append(h)
+
+        # stage5
+        h = torch.cat([h, feature_map], dim=1)  # channel concat
+        h = self.relu(self.Mconv1_stage5(h))
+        h = self.relu(self.Mconv2_stage5(h))
+        h = self.relu(self.Mconv3_stage5(h))
+        h = self.relu(self.Mconv4_stage5(h))
+        h = self.relu(self.Mconv5_stage5(h))
+        h = self.relu(self.Mconv6_stage5(h))
+        h = self.Mconv7_stage5(h)
+        heatmaps.append(h)
+
+        # stage6
+        h = torch.cat([h, feature_map], dim=1)  # channel concat
+        h = self.relu(self.Mconv1_stage6(h))
+        h = self.relu(self.Mconv2_stage6(h))
+        h = self.relu(self.Mconv3_stage6(h))
+        h = self.relu(self.Mconv4_stage6(h))
+        h = self.relu(self.Mconv5_stage6(h))
+        h = self.relu(self.Mconv6_stage6(h))
+        h = self.Mconv7_stage6(h)
+        heatmaps.append(h)
+
+        return heatmaps
+
+
+LOG = logging.getLogger(__name__)
+TOTEN = ToTensor()
+TOPIL = ToPILImage()
+
+
+params = {
+    'gaussian_sigma': 2.5,
+    'inference_img_size': 736,  # 368, 736, 1312
+    'heatmap_peak_thresh': 0.1,
+    'crop_scale': 1.5,
+    'line_indices': [
+        [0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
+        [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13],
+        [13, 14], [14, 15], [15, 16],
+        [17, 18], [18, 19], [19, 20], [20, 21],
+        [22, 23], [23, 24], [24, 25], [25, 26],
+        [27, 28], [28, 29], [29, 30],
+        [31, 32], [32, 33], [33, 34], [34, 35],
+        [36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36],
+        [42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42],
+        [48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54],
+        [54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48],
+        [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
+        [66, 67], [67, 60]
+    ],
+}
+
+
+class Face(object):
+    """
+    The OpenPose face landmark detector model.
+
+    Args:
+        inference_size: set the size of the inference image size, suggested:
+            368, 736, 1312, default 736
+        gaussian_sigma: blur the heatmaps, default 2.5
+        heatmap_peak_thresh: return landmark if over threshold, default 0.1
+
+    """
+    def __init__(self, face_model_path,
+                 inference_size=None,
+                 gaussian_sigma=None,
+                 heatmap_peak_thresh=None):
+        self.inference_size = inference_size or params["inference_img_size"]
+        self.sigma = gaussian_sigma or params['gaussian_sigma']
+        self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"]
+        self.model = FaceNet()
+        self.model.load_state_dict(torch.load(face_model_path))
+        if torch.cuda.is_available():
+            self.model = self.model.cuda()
+            print('cuda')
+        self.model.eval()
+
+    def __call__(self, face_img):
+        H, W, C = face_img.shape
+
+        w_size = 384
+        x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5
+
+        if torch.cuda.is_available():
+            x_data = x_data.cuda()
+
+        with torch.no_grad():
+            hs = self.model(x_data[None, ...])
+
+            # output_path = "/home/aigc/ProjectVTON/WebDemo/onnx_models/face_estimation.onnx"
+            # torch.onnx.export(self.model, x_data[None, ...], output_path, export_params=True,
+            #                   opset_version=11,
+            #                   do_constant_folding=True)
+            heatmaps = F.interpolate(
+                hs[-1],
+                (H, W),
+                mode='bilinear', align_corners=True).cpu().numpy()[0]
+        return heatmaps
+
+    def compute_peaks_from_heatmaps(self, heatmaps):
+        all_peaks = []
+        for part in range(heatmaps.shape[0]):
+            map_ori = heatmaps[part].copy()
+            binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8)
+
+            if np.sum(binary) == 0:
+                continue
+
+            positions = np.where(binary > 0.5)
+            intensities = map_ori[positions]
+            mi = np.argmax(intensities)
+            y, x = positions[0][mi], positions[1][mi]
+            all_peaks.append([x, y])
+
+        return np.array(all_peaks)
diff --git a/preprocess/openpose/annotator/openpose/hand.py b/preprocess/openpose/annotator/openpose/hand.py
new file mode 100644
index 0000000000000000000000000000000000000000..eefe0af97b6e82d4664d2e477a1f2ac45489a419
--- /dev/null
+++ b/preprocess/openpose/annotator/openpose/hand.py
@@ -0,0 +1,98 @@
+import cv2
+import json
+import numpy as np
+import math
+import time
+from scipy.ndimage.filters import gaussian_filter
+import matplotlib.pyplot as plt
+import matplotlib
+import torch
+from skimage.measure import label
+
+from .model import handpose_model
+from . import util
+
+class Hand(object):
+    def __init__(self, model_path):
+        self.model = handpose_model()
+        if torch.cuda.is_available():
+            self.model = self.model.cuda()
+            print('cuda')
+        model_dict = util.transfer(self.model, torch.load(model_path))
+        self.model.load_state_dict(model_dict)
+        self.model.eval()
+
+    def __call__(self, oriImgRaw):
+        scale_search = [0.5, 1.0, 1.5, 2.0]
+        # scale_search = [0.5]
+        boxsize = 368
+        stride = 8
+        padValue = 128
+        thre = 0.05
+        multiplier = [x * boxsize for x in scale_search]
+
+        wsize = 128
+        heatmap_avg = np.zeros((wsize, wsize, 22))
+
+        Hr, Wr, Cr = oriImgRaw.shape
+
+        oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8)
+
+        for m in range(len(multiplier)):
+            scale = multiplier[m]
+            imageToTest = util.smart_resize(oriImg, (scale, scale))
+
+            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
+            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
+            im = np.ascontiguousarray(im)
+
+            data = torch.from_numpy(im).float()
+            if torch.cuda.is_available():
+                data = data.cuda()
+
+            with torch.no_grad():
+                output = self.model(data).cpu().numpy()
+
+                # output_path = "/home/aigc/ProjectVTON/WebDemo/onnx_models/hand_estimation.onnx"
+                # torch.onnx.export(self.model, data, output_path, export_params=True,
+                #                   opset_version=11,
+                #                   do_constant_folding=True)
+
+            # extract outputs, resize, and remove padding
+            heatmap = np.transpose(np.squeeze(output), (1, 2, 0))  # output 1 is heatmaps
+            heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
+            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+            heatmap = util.smart_resize(heatmap, (wsize, wsize))
+
+            heatmap_avg += heatmap / len(multiplier)
+
+        all_peaks = []
+        for part in range(21):
+            map_ori = heatmap_avg[:, :, part]
+            one_heatmap = gaussian_filter(map_ori, sigma=3)
+            binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
+
+            if np.sum(binary) == 0:
+                all_peaks.append([0, 0])
+                continue
+            label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
+            max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
+            label_img[label_img != max_index] = 0
+            map_ori[label_img == 0] = 0
+
+            y, x = util.npmax(map_ori)
+            y = int(float(y) * float(Hr) / float(wsize))
+            x = int(float(x) * float(Wr) / float(wsize))
+            all_peaks.append([x, y])
+        return np.array(all_peaks)
+
+if __name__ == "__main__":
+    hand_estimation = Hand('../model/hand_pose_model.pth')
+
+    # test_image = '../images/hand.jpg'
+    test_image = '../images/hand.jpg'
+    oriImg = cv2.imread(test_image)  # B,G,R order
+    peaks = hand_estimation(oriImg)
+    canvas = util.draw_handpose(oriImg, peaks, True)
+    cv2.imshow('', canvas)
+    cv2.waitKey(0)
\ No newline at end of file
diff --git a/preprocess/openpose/annotator/openpose/model.py b/preprocess/openpose/annotator/openpose/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dfc80de827a17beccb9b0f3f7588545be78c9de
--- /dev/null
+++ b/preprocess/openpose/annotator/openpose/model.py
@@ -0,0 +1,219 @@
+import torch
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+
+def make_layers(block, no_relu_layers):
+    layers = []
+    for layer_name, v in block.items():
+        if 'pool' in layer_name:
+            layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
+                                    padding=v[2])
+            layers.append((layer_name, layer))
+        else:
+            conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
+                               kernel_size=v[2], stride=v[3],
+                               padding=v[4])
+            layers.append((layer_name, conv2d))
+            if layer_name not in no_relu_layers:
+                layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
+
+    return nn.Sequential(OrderedDict(layers))
+
+class bodypose_model(nn.Module):
+    def __init__(self):
+        super(bodypose_model, self).__init__()
+
+        # these layers have no relu layer
+        no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
+                          'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
+                          'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
+                          'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1']
+        blocks = {}
+        block0 = OrderedDict([
+                      ('conv1_1', [3, 64, 3, 1, 1]),
+                      ('conv1_2', [64, 64, 3, 1, 1]),
+                      ('pool1_stage1', [2, 2, 0]),
+                      ('conv2_1', [64, 128, 3, 1, 1]),
+                      ('conv2_2', [128, 128, 3, 1, 1]),
+                      ('pool2_stage1', [2, 2, 0]),
+                      ('conv3_1', [128, 256, 3, 1, 1]),
+                      ('conv3_2', [256, 256, 3, 1, 1]),
+                      ('conv3_3', [256, 256, 3, 1, 1]),
+                      ('conv3_4', [256, 256, 3, 1, 1]),
+                      ('pool3_stage1', [2, 2, 0]),
+                      ('conv4_1', [256, 512, 3, 1, 1]),
+                      ('conv4_2', [512, 512, 3, 1, 1]),
+                      ('conv4_3_CPM', [512, 256, 3, 1, 1]),
+                      ('conv4_4_CPM', [256, 128, 3, 1, 1])
+                  ])
+
+
+        # Stage 1
+        block1_1 = OrderedDict([
+                        ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
+                        ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
+                        ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
+                        ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
+                        ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
+                    ])
+
+        block1_2 = OrderedDict([
+                        ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
+                        ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
+                        ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
+                        ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
+                        ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
+                    ])
+        blocks['block1_1'] = block1_1
+        blocks['block1_2'] = block1_2
+
+        self.model0 = make_layers(block0, no_relu_layers)
+
+        # Stages 2 - 6
+        for i in range(2, 7):
+            blocks['block%d_1' % i] = OrderedDict([
+                    ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
+                    ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
+                    ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
+                ])
+
+            blocks['block%d_2' % i] = OrderedDict([
+                    ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
+                    ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
+                    ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
+                ])
+
+        for k in blocks.keys():
+            blocks[k] = make_layers(blocks[k], no_relu_layers)
+
+        self.model1_1 = blocks['block1_1']
+        self.model2_1 = blocks['block2_1']
+        self.model3_1 = blocks['block3_1']
+        self.model4_1 = blocks['block4_1']
+        self.model5_1 = blocks['block5_1']
+        self.model6_1 = blocks['block6_1']
+
+        self.model1_2 = blocks['block1_2']
+        self.model2_2 = blocks['block2_2']
+        self.model3_2 = blocks['block3_2']
+        self.model4_2 = blocks['block4_2']
+        self.model5_2 = blocks['block5_2']
+        self.model6_2 = blocks['block6_2']
+
+
+    def forward(self, x):
+
+        out1 = self.model0(x)
+
+        out1_1 = self.model1_1(out1)
+        out1_2 = self.model1_2(out1)
+        out2 = torch.cat([out1_1, out1_2, out1], 1)
+
+        out2_1 = self.model2_1(out2)
+        out2_2 = self.model2_2(out2)
+        out3 = torch.cat([out2_1, out2_2, out1], 1)
+
+        out3_1 = self.model3_1(out3)
+        out3_2 = self.model3_2(out3)
+        out4 = torch.cat([out3_1, out3_2, out1], 1)
+
+        out4_1 = self.model4_1(out4)
+        out4_2 = self.model4_2(out4)
+        out5 = torch.cat([out4_1, out4_2, out1], 1)
+
+        out5_1 = self.model5_1(out5)
+        out5_2 = self.model5_2(out5)
+        out6 = torch.cat([out5_1, out5_2, out1], 1)
+
+        out6_1 = self.model6_1(out6)
+        out6_2 = self.model6_2(out6)
+
+        return out6_1, out6_2
+
+class handpose_model(nn.Module):
+    def __init__(self):
+        super(handpose_model, self).__init__()
+
+        # these layers have no relu layer
+        no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
+                          'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
+        # stage 1
+        block1_0 = OrderedDict([
+                ('conv1_1', [3, 64, 3, 1, 1]),
+                ('conv1_2', [64, 64, 3, 1, 1]),
+                ('pool1_stage1', [2, 2, 0]),
+                ('conv2_1', [64, 128, 3, 1, 1]),
+                ('conv2_2', [128, 128, 3, 1, 1]),
+                ('pool2_stage1', [2, 2, 0]),
+                ('conv3_1', [128, 256, 3, 1, 1]),
+                ('conv3_2', [256, 256, 3, 1, 1]),
+                ('conv3_3', [256, 256, 3, 1, 1]),
+                ('conv3_4', [256, 256, 3, 1, 1]),
+                ('pool3_stage1', [2, 2, 0]),
+                ('conv4_1', [256, 512, 3, 1, 1]),
+                ('conv4_2', [512, 512, 3, 1, 1]),
+                ('conv4_3', [512, 512, 3, 1, 1]),
+                ('conv4_4', [512, 512, 3, 1, 1]),
+                ('conv5_1', [512, 512, 3, 1, 1]),
+                ('conv5_2', [512, 512, 3, 1, 1]),
+                ('conv5_3_CPM', [512, 128, 3, 1, 1])
+            ])
+
+        block1_1 = OrderedDict([
+            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
+            ('conv6_2_CPM', [512, 22, 1, 1, 0])
+        ])
+
+        blocks = {}
+        blocks['block1_0'] = block1_0
+        blocks['block1_1'] = block1_1
+
+        # stage 2-6
+        for i in range(2, 7):
+            blocks['block%d' % i] = OrderedDict([
+                    ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
+                    ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
+                    ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
+                    ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
+                ])
+
+        for k in blocks.keys():
+            blocks[k] = make_layers(blocks[k], no_relu_layers)
+
+        self.model1_0 = blocks['block1_0']
+        self.model1_1 = blocks['block1_1']
+        self.model2 = blocks['block2']
+        self.model3 = blocks['block3']
+        self.model4 = blocks['block4']
+        self.model5 = blocks['block5']
+        self.model6 = blocks['block6']
+
+    def forward(self, x):
+        out1_0 = self.model1_0(x)
+        out1_1 = self.model1_1(out1_0)
+        concat_stage2 = torch.cat([out1_1, out1_0], 1)
+        out_stage2 = self.model2(concat_stage2)
+        concat_stage3 = torch.cat([out_stage2, out1_0], 1)
+        out_stage3 = self.model3(concat_stage3)
+        concat_stage4 = torch.cat([out_stage3, out1_0], 1)
+        out_stage4 = self.model4(concat_stage4)
+        concat_stage5 = torch.cat([out_stage4, out1_0], 1)
+        out_stage5 = self.model5(concat_stage5)
+        concat_stage6 = torch.cat([out_stage5, out1_0], 1)
+        out_stage6 = self.model6(concat_stage6)
+        return out_stage6
+
+
diff --git a/preprocess/openpose/annotator/openpose/util.py b/preprocess/openpose/annotator/openpose/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..73d7d0153b38d143eb8090e07a9784a274b619ed
--- /dev/null
+++ b/preprocess/openpose/annotator/openpose/util.py
@@ -0,0 +1,297 @@
+import math
+import numpy as np
+import matplotlib
+import cv2
+
+
+eps = 0.01
+
+
+def smart_resize(x, s):
+    Ht, Wt = s
+    if x.ndim == 2:
+        Ho, Wo = x.shape
+        Co = 1
+    else:
+        Ho, Wo, Co = x.shape
+    if Co == 3 or Co == 1:
+        k = float(Ht + Wt) / float(Ho + Wo)
+        return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
+    else:
+        return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
+
+
+def smart_resize_k(x, fx, fy):
+    if x.ndim == 2:
+        Ho, Wo = x.shape
+        Co = 1
+    else:
+        Ho, Wo, Co = x.shape
+    Ht, Wt = Ho * fy, Wo * fx
+    if Co == 3 or Co == 1:
+        k = float(Ht + Wt) / float(Ho + Wo)
+        return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
+    else:
+        return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
+
+
+def padRightDownCorner(img, stride, padValue):
+    h = img.shape[0]
+    w = img.shape[1]
+
+    pad = 4 * [None]
+    pad[0] = 0 # up
+    pad[1] = 0 # left
+    pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
+    pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
+
+    img_padded = img
+    pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
+    img_padded = np.concatenate((pad_up, img_padded), axis=0)
+    pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
+    img_padded = np.concatenate((pad_left, img_padded), axis=1)
+    pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
+    img_padded = np.concatenate((img_padded, pad_down), axis=0)
+    pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
+    img_padded = np.concatenate((img_padded, pad_right), axis=1)
+
+    return img_padded, pad
+
+
+def transfer(model, model_weights):
+    transfered_model_weights = {}
+    for weights_name in model.state_dict().keys():
+        transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
+    return transfered_model_weights
+
+
+def draw_bodypose(canvas, candidate, subset):
+    H, W, C = canvas.shape
+    candidate = np.array(candidate)
+    subset = np.array(subset)
+
+    stickwidth = 4
+
+    limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
+               [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
+               [1, 16], [16, 18], [3, 17], [6, 18]]
+
+    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
+              [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
+              [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
+
+    for i in range(17):
+        for n in range(len(subset)):
+            index = subset[n][np.array(limbSeq[i]) - 1]
+            if -1 in index:
+                continue
+            Y = candidate[index.astype(int), 0] * float(W)
+            X = candidate[index.astype(int), 1] * float(H)
+            mX = np.mean(X)
+            mY = np.mean(Y)
+            length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
+            angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
+            polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
+            cv2.fillConvexPoly(canvas, polygon, colors[i])
+
+    canvas = (canvas * 0.6).astype(np.uint8)
+
+    for i in range(18):
+        for n in range(len(subset)):
+            index = int(subset[n][i])
+            if index == -1:
+                continue
+            x, y = candidate[index][0:2]
+            x = int(x * W)
+            y = int(y * H)
+            cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
+
+    return canvas
+
+
+def draw_handpose(canvas, all_hand_peaks):
+    H, W, C = canvas.shape
+
+    edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
+             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
+
+    for peaks in all_hand_peaks:
+        peaks = np.array(peaks)
+
+        for ie, e in enumerate(edges):
+            x1, y1 = peaks[e[0]]
+            x2, y2 = peaks[e[1]]
+            x1 = int(x1 * W)
+            y1 = int(y1 * H)
+            x2 = int(x2 * W)
+            y2 = int(y2 * H)
+            if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
+                cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)
+
+        for i, keyponit in enumerate(peaks):
+            x, y = keyponit
+            x = int(x * W)
+            y = int(y * H)
+            if x > eps and y > eps:
+                cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
+    return canvas
+
+
+def draw_facepose(canvas, all_lmks):
+    H, W, C = canvas.shape
+    for lmks in all_lmks:
+        lmks = np.array(lmks)
+        for lmk in lmks:
+            x, y = lmk
+            x = int(x * W)
+            y = int(y * H)
+            if x > eps and y > eps:
+                cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
+    return canvas
+
+
+# detect hand according to body pose keypoints
+# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
+def handDetect(candidate, subset, oriImg):
+    # right hand: wrist 4, elbow 3, shoulder 2
+    # left hand: wrist 7, elbow 6, shoulder 5
+    ratioWristElbow = 0.33
+    detect_result = []
+    image_height, image_width = oriImg.shape[0:2]
+    for person in subset.astype(int):
+        # if any of three not detected
+        has_left = np.sum(person[[5, 6, 7]] == -1) == 0
+        has_right = np.sum(person[[2, 3, 4]] == -1) == 0
+        if not (has_left or has_right):
+            continue
+        hands = []
+        #left hand
+        if has_left:
+            left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
+            x1, y1 = candidate[left_shoulder_index][:2]
+            x2, y2 = candidate[left_elbow_index][:2]
+            x3, y3 = candidate[left_wrist_index][:2]
+            hands.append([x1, y1, x2, y2, x3, y3, True])
+        # right hand
+        if has_right:
+            right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
+            x1, y1 = candidate[right_shoulder_index][:2]
+            x2, y2 = candidate[right_elbow_index][:2]
+            x3, y3 = candidate[right_wrist_index][:2]
+            hands.append([x1, y1, x2, y2, x3, y3, False])
+
+        for x1, y1, x2, y2, x3, y3, is_left in hands:
+            # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
+            # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
+            # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
+            # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
+            # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
+            # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
+            x = x3 + ratioWristElbow * (x3 - x2)
+            y = y3 + ratioWristElbow * (y3 - y2)
+            distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
+            distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
+            width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
+            # x-y refers to the center --> offset to topLeft point
+            # handRectangle.x -= handRectangle.width / 2.f;
+            # handRectangle.y -= handRectangle.height / 2.f;
+            x -= width / 2
+            y -= width / 2  # width = height
+            # overflow the image
+            if x < 0: x = 0
+            if y < 0: y = 0
+            width1 = width
+            width2 = width
+            if x + width > image_width: width1 = image_width - x
+            if y + width > image_height: width2 = image_height - y
+            width = min(width1, width2)
+            # the max hand box value is 20 pixels
+            if width >= 20:
+                detect_result.append([int(x), int(y), int(width), is_left])
+
+    '''
+    return value: [[x, y, w, True if left hand else False]].
+    width=height since the network require squared input.
+    x, y is the coordinate of top left 
+    '''
+    return detect_result
+
+
+# Written by Lvmin
+def faceDetect(candidate, subset, oriImg):
+    # left right eye ear 14 15 16 17
+    detect_result = []
+    image_height, image_width = oriImg.shape[0:2]
+    for person in subset.astype(int):
+        has_head = person[0] > -1
+        if not has_head:
+            continue
+
+        has_left_eye = person[14] > -1
+        has_right_eye = person[15] > -1
+        has_left_ear = person[16] > -1
+        has_right_ear = person[17] > -1
+
+        if not (has_left_eye or has_right_eye or has_left_ear or has_right_ear):
+            continue
+
+        head, left_eye, right_eye, left_ear, right_ear = person[[0, 14, 15, 16, 17]]
+
+        width = 0.0
+        x0, y0 = candidate[head][:2]
+
+        if has_left_eye:
+            x1, y1 = candidate[left_eye][:2]
+            d = max(abs(x0 - x1), abs(y0 - y1))
+            width = max(width, d * 3.0)
+
+        if has_right_eye:
+            x1, y1 = candidate[right_eye][:2]
+            d = max(abs(x0 - x1), abs(y0 - y1))
+            width = max(width, d * 3.0)
+
+        if has_left_ear:
+            x1, y1 = candidate[left_ear][:2]
+            d = max(abs(x0 - x1), abs(y0 - y1))
+            width = max(width, d * 1.5)
+
+        if has_right_ear:
+            x1, y1 = candidate[right_ear][:2]
+            d = max(abs(x0 - x1), abs(y0 - y1))
+            width = max(width, d * 1.5)
+
+        x, y = x0, y0
+
+        x -= width
+        y -= width
+
+        if x < 0:
+            x = 0
+
+        if y < 0:
+            y = 0
+
+        width1 = width * 2
+        width2 = width * 2
+
+        if x + width > image_width:
+            width1 = image_width - x
+
+        if y + width > image_height:
+            width2 = image_height - y
+
+        width = min(width1, width2)
+
+        if width >= 20:
+            detect_result.append([int(x), int(y), int(width)])
+
+    return detect_result
+
+
+# get max index of 2d array
+def npmax(array):
+    arrayindex = array.argmax(1)
+    arrayvalue = array.max(1)
+    i = arrayvalue.argmax()
+    j = arrayindex[i]
+    return i, j
diff --git a/preprocess/openpose/annotator/util.py b/preprocess/openpose/annotator/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..67473bd263da26ffb8b9a757bcf56c19c7bd2cbc
--- /dev/null
+++ b/preprocess/openpose/annotator/util.py
@@ -0,0 +1,100 @@
+import random
+
+import numpy as np
+import cv2
+import os
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).absolute().parents[3].absolute()
+
+annotator_ckpts_path = os.path.join(PROJECT_ROOT, 'ckpt/openpose/ckpts')
+# print(annotator_ckpts_path)
+
+def HWC3(x):
+    assert x.dtype == np.uint8
+    if x.ndim == 2:
+        x = x[:, :, None]
+    assert x.ndim == 3
+    H, W, C = x.shape
+    assert C == 1 or C == 3 or C == 4
+    if C == 3:
+        return x
+    if C == 1:
+        return np.concatenate([x, x, x], axis=2)
+    if C == 4:
+        color = x[:, :, 0:3].astype(np.float32)
+        alpha = x[:, :, 3:4].astype(np.float32) / 255.0
+        y = color * alpha + 255.0 * (1.0 - alpha)
+        y = y.clip(0, 255).astype(np.uint8)
+        return y
+
+
+def resize_image(input_image, resolution):
+    H, W, C = input_image.shape
+    H = float(H)
+    W = float(W)
+    k = float(resolution) / min(H, W)
+    H *= k
+    W *= k
+    H = int(np.round(H / 64.0)) * 64
+    W = int(np.round(W / 64.0)) * 64
+    img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
+    return img
+
+
+def nms(x, t, s):
+    x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
+
+    f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
+    f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
+    f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
+    f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
+
+    y = np.zeros_like(x)
+
+    for f in [f1, f2, f3, f4]:
+        np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
+
+    z = np.zeros_like(y, dtype=np.uint8)
+    z[y > t] = 255
+    return z
+
+
+def make_noise_disk(H, W, C, F):
+    noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
+    noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
+    noise = noise[F: F + H, F: F + W]
+    noise -= np.min(noise)
+    noise /= np.max(noise)
+    if C == 1:
+        noise = noise[:, :, None]
+    return noise
+
+
+def min_max_norm(x):
+    x -= np.min(x)
+    x /= np.maximum(np.max(x), 1e-5)
+    return x
+
+
+def safe_step(x, step=2):
+    y = x.astype(np.float32) * float(step + 1)
+    y = y.astype(np.int32).astype(np.float32) / float(step)
+    return y
+
+
+def img2mask(img, H, W, low=10, high=90):
+    assert img.ndim == 3 or img.ndim == 2
+    assert img.dtype == np.uint8
+
+    if img.ndim == 3:
+        y = img[:, :, random.randrange(0, img.shape[2])]
+    else:
+        y = img
+
+    y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)
+
+    if random.uniform(0, 1) < 0.5:
+        y = 255 - y
+
+    return y < np.percentile(y, random.randrange(low, high))
diff --git a/preprocess/openpose/run_openpose.py b/preprocess/openpose/run_openpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa0ed1fe39e8c871726555f184f1dcc3c8dd51bc
--- /dev/null
+++ b/preprocess/openpose/run_openpose.py
@@ -0,0 +1,84 @@
+import pdb
+
+import config
+from pathlib import Path
+import sys
+
+PROJECT_ROOT = Path(__file__).absolute().parents[0].absolute()
+sys.path.insert(0, str(PROJECT_ROOT))
+import os
+
+import cv2
+import einops
+import numpy as np
+import random
+import time
+import json
+
+# from pytorch_lightning import seed_everything
+from preprocess.openpose.annotator.util import resize_image, HWC3
+from preprocess.openpose.annotator.openpose import OpenposeDetector
+
+import argparse
+from PIL import Image
+import torch
+import pdb
+
+# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
+
+class OpenPose:
+    def __init__(self, gpu_id: int):
+        # self.gpu_id = gpu_id
+        # torch.cuda.set_device(gpu_id)
+        self.preprocessor = OpenposeDetector()
+
+    def __call__(self, input_image, resolution=384):
+        # torch.cuda.set_device(self.gpu_id)
+        if isinstance(input_image, Image.Image):
+            input_image = np.asarray(input_image)
+        elif type(input_image) == str:
+            input_image = np.asarray(Image.open(input_image))
+        else:
+            raise ValueError
+        with torch.no_grad():
+            input_image = HWC3(input_image)
+            input_image = resize_image(input_image, resolution)
+            H, W, C = input_image.shape
+            assert (H == 512 and W == 384), 'Incorrect input image shape'
+            pose, detected_map = self.preprocessor(input_image, hand_and_face=False)
+
+            candidate = pose['bodies']['candidate']
+            subset = pose['bodies']['subset'][0][:18]
+            for i in range(18):
+                if subset[i] == -1:
+                    candidate.insert(i, [0, 0])
+                    for j in range(i, 18):
+                        if(subset[j]) != -1:
+                            subset[j] += 1
+                elif subset[i] != i:
+                    candidate.pop(i)
+                    for j in range(i, 18):
+                        if(subset[j]) != -1:
+                            subset[j] -= 1
+
+            candidate = candidate[:18]
+
+            for i in range(18):
+                candidate[i][0] *= 384
+                candidate[i][1] *= 512
+
+            keypoints = {"pose_keypoints_2d": candidate}
+            # with open("/home/aigc/ProjectVTON/OpenPose/keypoints/keypoints.json", "w") as f:
+            #     json.dump(keypoints, f)
+            #
+            # # print(candidate)
+            # output_image = cv2.resize(cv2.cvtColor(detected_map, cv2.COLOR_BGR2RGB), (768, 1024))
+            # cv2.imwrite('/home/aigc/ProjectVTON/OpenPose/keypoints/out_pose.jpg', output_image)
+
+        return keypoints
+
+
+if __name__ == '__main__':
+
+    model = OpenPose()
+    model('./images/bad_model.jpg')
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a9243fb4b897c3cc398fda901f58f4978a0eba28
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,23 @@
+transformers==4.36.2
+torch==2.0.1
+torchvision==0.15.2
+torchaudio==2.0.2
+numpy==1.24.4
+scipy==1.10.1
+scikit-image==0.21.0
+opencv-python==4.7.0.72
+pillow==9.4.0
+diffusers==0.25.0
+transformers==4.36.2
+accelerate==0.26.1
+matplotlib==3.7.4
+tqdm==4.64.1
+config==0.5.1
+einops==0.7.0
+onnxruntime==1.16.2
+basicsr
+av
+fvcore
+cloudpickle
+omegaconf
+pycocotools
\ No newline at end of file
diff --git a/src/__pycache__/attentionhacked_animate.cpython-310.pyc b/src/__pycache__/attentionhacked_animate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5ed75a37e8baeee66fa4654054338fcce6d2ffc
Binary files /dev/null and b/src/__pycache__/attentionhacked_animate.cpython-310.pyc differ
diff --git a/src/__pycache__/attentionhacked_animate_ref.cpython-310.pyc b/src/__pycache__/attentionhacked_animate_ref.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f12e8d3669fd743deb4bf4d8b1e1adcdc377cee
Binary files /dev/null and b/src/__pycache__/attentionhacked_animate_ref.cpython-310.pyc differ
diff --git a/src/__pycache__/controlnet_hacked_light_animate.cpython-310.pyc b/src/__pycache__/controlnet_hacked_light_animate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e64f32f6d027a4b86d7c5b4aad039136db62a903
Binary files /dev/null and b/src/__pycache__/controlnet_hacked_light_animate.cpython-310.pyc differ
diff --git a/src/__pycache__/pose_hacked_unet_encoder_animate.cpython-310.pyc b/src/__pycache__/pose_hacked_unet_encoder_animate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ad0cf8de453c8006f28dfc5751521eba57234c6
Binary files /dev/null and b/src/__pycache__/pose_hacked_unet_encoder_animate.cpython-310.pyc differ
diff --git a/src/__pycache__/pose_hacked_unet_encoder_animate_cfg.cpython-310.pyc b/src/__pycache__/pose_hacked_unet_encoder_animate_cfg.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3364c00fdad9274ce99eee5db6afb70d8fc953c
Binary files /dev/null and b/src/__pycache__/pose_hacked_unet_encoder_animate_cfg.cpython-310.pyc differ
diff --git a/src/__pycache__/pose_hacked_unet_encoder_animate_zero.cpython-310.pyc b/src/__pycache__/pose_hacked_unet_encoder_animate_zero.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02826ef1b8624b6d51bf35abb0892d522e9d45aa
Binary files /dev/null and b/src/__pycache__/pose_hacked_unet_encoder_animate_zero.cpython-310.pyc differ
diff --git a/src/__pycache__/transformerhacked_animate.cpython-310.pyc b/src/__pycache__/transformerhacked_animate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a6b4171109244777d97136136b3f74d672243a7
Binary files /dev/null and b/src/__pycache__/transformerhacked_animate.cpython-310.pyc differ
diff --git a/src/__pycache__/transformerhacked_animate_ref.cpython-310.pyc b/src/__pycache__/transformerhacked_animate_ref.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59a7cb4c4a3ccb015de9acfb1e14116c8108debb
Binary files /dev/null and b/src/__pycache__/transformerhacked_animate_ref.cpython-310.pyc differ
diff --git a/src/__pycache__/unet_block_hacked_animate.cpython-310.pyc b/src/__pycache__/unet_block_hacked_animate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a752371dc152b2f0b33a7b85fe9fe6189ad28fcf
Binary files /dev/null and b/src/__pycache__/unet_block_hacked_animate.cpython-310.pyc differ
diff --git a/src/__pycache__/unet_block_hacked_animate_ref.cpython-310.pyc b/src/__pycache__/unet_block_hacked_animate_ref.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38f90beabbf9ce2575a57b1c34a6bc0035a6fc5e
Binary files /dev/null and b/src/__pycache__/unet_block_hacked_animate_ref.cpython-310.pyc differ
diff --git a/src/__pycache__/unet_hacked_animate.cpython-310.pyc b/src/__pycache__/unet_hacked_animate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bec02f2968fed69e17e2dbf09f10bb30022f76f
Binary files /dev/null and b/src/__pycache__/unet_hacked_animate.cpython-310.pyc differ
diff --git a/src/__pycache__/unet_hacked_animate_ref.cpython-310.pyc b/src/__pycache__/unet_hacked_animate_ref.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e01e064c40df16767b4dbaeb857487dd84c2011
Binary files /dev/null and b/src/__pycache__/unet_hacked_animate_ref.cpython-310.pyc differ
diff --git a/src/attentionhacked_garmnet.py b/src/attentionhacked_garmnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..66885c4778c01ae34512c752a05b537577defaec
--- /dev/null
+++ b/src/attentionhacked_garmnet.py
@@ -0,0 +1,670 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from diffusers.utils import USE_PEFT_BACKEND
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
+from diffusers.models.attention_processor import Attention
+from diffusers.models.embeddings import SinusoidalPositionalEmbedding
+from diffusers.models.lora import LoRACompatibleLinear
+from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
+
+
+def _chunked_feed_forward(
+    ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int, lora_scale: Optional[float] = None
+):
+    # "feed_forward_chunk_size" can be used to save memory
+    if hidden_states.shape[chunk_dim] % chunk_size != 0:
+        raise ValueError(
+            f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
+        )
+
+    num_chunks = hidden_states.shape[chunk_dim] // chunk_size
+    if lora_scale is None:
+        ff_output = torch.cat(
+            [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
+            dim=chunk_dim,
+        )
+    else:
+        # TOOD(Patrick): LoRA scale can be removed once PEFT refactor is complete
+        ff_output = torch.cat(
+            [ff(hid_slice, scale=lora_scale) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
+            dim=chunk_dim,
+        )
+
+    return ff_output
+
+
+@maybe_allow_in_graph
+class GatedSelfAttentionDense(nn.Module):
+    r"""
+    A gated self-attention dense layer that combines visual features and object features.
+
+    Parameters:
+        query_dim (`int`): The number of channels in the query.
+        context_dim (`int`): The number of channels in the context.
+        n_heads (`int`): The number of heads to use for attention.
+        d_head (`int`): The number of channels in each head.
+    """
+
+    def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
+        super().__init__()
+
+        # we need a linear projection since we need cat visual feature and obj feature
+        self.linear = nn.Linear(context_dim, query_dim)
+
+        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
+        self.ff = FeedForward(query_dim, activation_fn="geglu")
+
+        self.norm1 = nn.LayerNorm(query_dim)
+        self.norm2 = nn.LayerNorm(query_dim)
+
+        self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
+        self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
+
+        self.enabled = True
+
+    def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
+        if not self.enabled:
+            return x
+
+        n_visual = x.shape[1]
+        objs = self.linear(objs)
+
+        x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
+        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
+
+        return x
+
+
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (:
+            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (:
+            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+        attention_type (`str`, *optional*, defaults to `"default"`):
+            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+        positional_embeddings (`str`, *optional*, defaults to `None`):
+            The type of positional embeddings to apply to.
+        num_positional_embeddings (`int`, *optional*, defaults to `None`):
+            The maximum number of positional embeddings to apply.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
+        attention_type: str = "default",
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+        ada_norm_bias: Optional[int] = None,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        self.only_cross_attention = only_cross_attention
+
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+        self.use_layer_norm = norm_type == "layer_norm"
+        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+            )
+
+        if positional_embeddings == "sinusoidal":
+            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+        else:
+            self.pos_embed = None
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        if self.use_ada_layer_norm:
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        elif self.use_ada_layer_norm_zero:
+            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+        elif self.use_ada_layer_norm_continuous:
+            self.norm1 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "rms_norm",
+            )
+        else:
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+            out_bias=attention_out_bias,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            if self.use_ada_layer_norm:
+                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+            elif self.use_ada_layer_norm_continuous:
+                self.norm2 = AdaLayerNormContinuous(
+                    dim,
+                    ada_norm_continous_conditioning_embedding_dim,
+                    norm_elementwise_affine,
+                    norm_eps,
+                    ada_norm_bias,
+                    "rms_norm",
+                )
+            else:
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+                out_bias=attention_out_bias,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        if self.use_ada_layer_norm_continuous:
+            self.norm3 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "layer_norm",
+            )
+        elif not self.use_ada_layer_norm_single:
+            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+
+        # 4. Fuser
+        if attention_type == "gated" or attention_type == "gated-text-image":
+            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+        # 5. Scale-shift for PixArt-Alpha.
+        if self.use_ada_layer_norm_single:
+            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.FloatTensor:
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+        if self.use_ada_layer_norm:
+            norm_hidden_states = self.norm1(hidden_states, timestep)
+        elif self.use_ada_layer_norm_zero:
+            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+            )
+        elif self.use_layer_norm:
+            norm_hidden_states = self.norm1(hidden_states)
+        elif self.use_ada_layer_norm_continuous:
+            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif self.use_ada_layer_norm_single:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+            ).chunk(6, dim=1)
+            norm_hidden_states = self.norm1(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+            norm_hidden_states = norm_hidden_states.squeeze(1)
+        else:
+            raise ValueError("Incorrect norm used")
+
+        if self.pos_embed is not None:
+            norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+        garment_features = []
+        garment_features.append(norm_hidden_states)
+
+        # 1. Retrieve lora scale.
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        # 2. Prepare GLIGEN inputs
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+        if self.use_ada_layer_norm_zero:
+            attn_output = gate_msa.unsqueeze(1) * attn_output
+        elif self.use_ada_layer_norm_single:
+            attn_output = gate_msa * attn_output
+
+        hidden_states = attn_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
+        # 2.5 GLIGEN Control
+        if gligen_kwargs is not None:
+            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            if self.use_ada_layer_norm:
+                norm_hidden_states = self.norm2(hidden_states, timestep)
+            elif self.use_ada_layer_norm_zero or self.use_layer_norm:
+                norm_hidden_states = self.norm2(hidden_states)
+            elif self.use_ada_layer_norm_single:
+                # For PixArt norm2 isn't applied here:
+                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+                norm_hidden_states = hidden_states
+            elif self.use_ada_layer_norm_continuous:
+                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+            else:
+                raise ValueError("Incorrect norm")
+
+            if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
+                norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+            attn_output = self.attn2(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                **cross_attention_kwargs,
+            )
+            hidden_states = attn_output + hidden_states
+
+        # 4. Feed-forward
+        if self.use_ada_layer_norm_continuous:
+            norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif not self.use_ada_layer_norm_single:
+            norm_hidden_states = self.norm3(hidden_states)
+
+        if self.use_ada_layer_norm_zero:
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+        if self.use_ada_layer_norm_single:
+            norm_hidden_states = self.norm2(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            ff_output = _chunked_feed_forward(
+                self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
+            )
+        else:
+            ff_output = self.ff(norm_hidden_states, scale=lora_scale)
+
+        if self.use_ada_layer_norm_zero:
+            ff_output = gate_mlp.unsqueeze(1) * ff_output
+        elif self.use_ada_layer_norm_single:
+            ff_output = gate_mlp * ff_output
+
+        hidden_states = ff_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
+        return hidden_states, garment_features
+
+
+@maybe_allow_in_graph
+class TemporalBasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block for video like data.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        time_mix_inner_dim (`int`): The number of channels for temporal attention.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        time_mix_inner_dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        cross_attention_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.is_res = dim == time_mix_inner_dim
+
+        self.norm_in = nn.LayerNorm(dim)
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        self.norm_in = nn.LayerNorm(dim)
+        self.ff_in = FeedForward(
+            dim,
+            dim_out=time_mix_inner_dim,
+            activation_fn="geglu",
+        )
+
+        self.norm1 = nn.LayerNorm(time_mix_inner_dim)
+        self.attn1 = Attention(
+            query_dim=time_mix_inner_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            cross_attention_dim=None,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            self.norm2 = nn.LayerNorm(time_mix_inner_dim)
+            self.attn2 = Attention(
+                query_dim=time_mix_inner_dim,
+                cross_attention_dim=cross_attention_dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        self.norm3 = nn.LayerNorm(time_mix_inner_dim)
+        self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = None
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        # chunk dim should be hardcoded to 1 to have better speed vs. memory trade-off
+        self._chunk_dim = 1
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        num_frames: int,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+
+        batch_frames, seq_length, channels = hidden_states.shape
+        batch_size = batch_frames // num_frames
+
+        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, seq_length, channels)
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        hidden_states = hidden_states.reshape(batch_size * seq_length, num_frames, channels)
+
+        residual = hidden_states
+        hidden_states = self.norm_in(hidden_states)
+
+        if self._chunk_size is not None:
+            hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            hidden_states = self.ff_in(hidden_states)
+
+        if self.is_res:
+            hidden_states = hidden_states + residual
+
+        norm_hidden_states = self.norm1(hidden_states)
+        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
+        hidden_states = attn_output + hidden_states
+
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            norm_hidden_states = self.norm2(hidden_states)
+            attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+            hidden_states = attn_output + hidden_states
+
+        # 4. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states)
+
+        if self._chunk_size is not None:
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states)
+
+        if self.is_res:
+            hidden_states = ff_output + hidden_states
+        else:
+            hidden_states = ff_output
+
+        hidden_states = hidden_states[None, :].reshape(batch_size, seq_length, num_frames, channels)
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        hidden_states = hidden_states.reshape(batch_size * num_frames, seq_length, channels)
+
+        return hidden_states
+
+
+class SkipFFTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        kv_input_dim: int,
+        kv_input_dim_proj_use_bias: bool,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        if kv_input_dim != dim:
+            self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias)
+        else:
+            self.kv_mapper = None
+
+        self.norm1 = RMSNorm(dim, 1e-06)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim,
+            out_bias=attention_out_bias,
+        )
+
+        self.norm2 = RMSNorm(dim, 1e-06)
+
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            out_bias=attention_out_bias,
+        )
+
+    def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs):
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+
+        if self.kv_mapper is not None:
+            encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states))
+
+        norm_hidden_states = self.norm1(hidden_states)
+
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            **cross_attention_kwargs,
+        )
+
+        hidden_states = attn_output + hidden_states
+
+        norm_hidden_states = self.norm2(hidden_states)
+
+        attn_output = self.attn2(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            **cross_attention_kwargs,
+        )
+
+        hidden_states = attn_output + hidden_states
+
+        return hidden_states
+
+
+class FeedForward(nn.Module):
+    r"""
+    A feed-forward layer.
+
+    Parameters:
+        dim (`int`): The number of channels in the input.
+        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
+        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+        activation_fn: str = "geglu",
+        final_dropout: bool = False,
+        inner_dim=None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        if inner_dim is None:
+            inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+        linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear
+
+        if activation_fn == "gelu":
+            act_fn = GELU(dim, inner_dim, bias=bias)
+        if activation_fn == "gelu-approximate":
+            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
+        elif activation_fn == "geglu":
+            act_fn = GEGLU(dim, inner_dim, bias=bias)
+        elif activation_fn == "geglu-approximate":
+            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
+
+        self.net = nn.ModuleList([])
+        # project in
+        self.net.append(act_fn)
+        # project dropout
+        self.net.append(nn.Dropout(dropout))
+        # project out
+        self.net.append(linear_cls(inner_dim, dim_out, bias=bias))
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            self.net.append(nn.Dropout(dropout))
+
+    def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+        compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear)
+        for module in self.net:
+            if isinstance(module, compatible_cls):
+                hidden_states = module(hidden_states, scale)
+            else:
+                hidden_states = module(hidden_states)
+        return hidden_states
diff --git a/src/attentionhacked_tryon.py b/src/attentionhacked_tryon.py
new file mode 100644
index 0000000000000000000000000000000000000000..9947e7c878f9f1fd3701b939dfc243628fa54652
--- /dev/null
+++ b/src/attentionhacked_tryon.py
@@ -0,0 +1,682 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from diffusers.utils import USE_PEFT_BACKEND
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
+from diffusers.models.attention_processor import Attention
+from diffusers.models.embeddings import SinusoidalPositionalEmbedding
+from diffusers.models.lora import LoRACompatibleLinear
+from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
+
+
+def _chunked_feed_forward(
+    ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int, lora_scale: Optional[float] = None
+):
+    # "feed_forward_chunk_size" can be used to save memory
+    if hidden_states.shape[chunk_dim] % chunk_size != 0:
+        raise ValueError(
+            f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
+        )
+
+    num_chunks = hidden_states.shape[chunk_dim] // chunk_size
+    if lora_scale is None:
+        ff_output = torch.cat(
+            [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
+            dim=chunk_dim,
+        )
+    else:
+        # TOOD(Patrick): LoRA scale can be removed once PEFT refactor is complete
+        ff_output = torch.cat(
+            [ff(hid_slice, scale=lora_scale) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
+            dim=chunk_dim,
+        )
+
+    return ff_output
+
+
+@maybe_allow_in_graph
+class GatedSelfAttentionDense(nn.Module):
+    r"""
+    A gated self-attention dense layer that combines visual features and object features.
+
+    Parameters:
+        query_dim (`int`): The number of channels in the query.
+        context_dim (`int`): The number of channels in the context.
+        n_heads (`int`): The number of heads to use for attention.
+        d_head (`int`): The number of channels in each head.
+    """
+
+    def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
+        super().__init__()
+
+        # we need a linear projection since we need cat visual feature and obj feature
+        self.linear = nn.Linear(context_dim, query_dim)
+
+        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
+        self.ff = FeedForward(query_dim, activation_fn="geglu")
+
+        self.norm1 = nn.LayerNorm(query_dim)
+        self.norm2 = nn.LayerNorm(query_dim)
+
+        self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
+        self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
+
+        self.enabled = True
+
+    def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
+        if not self.enabled:
+            return x
+
+        n_visual = x.shape[1]
+        objs = self.linear(objs)
+
+        x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
+        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
+
+        return x
+
+
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (:
+            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (:
+            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+        attention_type (`str`, *optional*, defaults to `"default"`):
+            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+        positional_embeddings (`str`, *optional*, defaults to `None`):
+            The type of positional embeddings to apply to.
+        num_positional_embeddings (`int`, *optional*, defaults to `None`):
+            The maximum number of positional embeddings to apply.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
+        attention_type: str = "default",
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
+        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
+        ada_norm_bias: Optional[int] = None,
+        ff_inner_dim: Optional[int] = None,
+        ff_bias: bool = True,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        self.only_cross_attention = only_cross_attention
+
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+        self.use_layer_norm = norm_type == "layer_norm"
+        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
+
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+            )
+
+        if positional_embeddings == "sinusoidal":
+            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+        else:
+            self.pos_embed = None
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        if self.use_ada_layer_norm:
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        elif self.use_ada_layer_norm_zero:
+            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+        elif self.use_ada_layer_norm_continuous:
+            self.norm1 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "rms_norm",
+            )
+        else:
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+            out_bias=attention_out_bias,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            if self.use_ada_layer_norm:
+                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+            elif self.use_ada_layer_norm_continuous:
+                self.norm2 = AdaLayerNormContinuous(
+                    dim,
+                    ada_norm_continous_conditioning_embedding_dim,
+                    norm_elementwise_affine,
+                    norm_eps,
+                    ada_norm_bias,
+                    "rms_norm",
+                )
+            else:
+                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+                out_bias=attention_out_bias,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        if self.use_ada_layer_norm_continuous:
+            self.norm3 = AdaLayerNormContinuous(
+                dim,
+                ada_norm_continous_conditioning_embedding_dim,
+                norm_elementwise_affine,
+                norm_eps,
+                ada_norm_bias,
+                "layer_norm",
+            )
+        elif not self.use_ada_layer_norm_single:
+            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
+
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+            inner_dim=ff_inner_dim,
+            bias=ff_bias,
+        )
+
+        # 4. Fuser
+        if attention_type == "gated" or attention_type == "gated-text-image":
+            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+        # 5. Scale-shift for PixArt-Alpha.
+        if self.use_ada_layer_norm_single:
+            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        garment_features=None,
+        curr_garment_feat_idx=0,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.FloatTensor:
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+
+        
+        
+        if self.use_ada_layer_norm:
+            norm_hidden_states = self.norm1(hidden_states, timestep)
+        elif self.use_ada_layer_norm_zero:
+            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+            )
+        elif self.use_layer_norm:
+            norm_hidden_states = self.norm1(hidden_states)
+        elif self.use_ada_layer_norm_continuous:
+            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif self.use_ada_layer_norm_single:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
+            ).chunk(6, dim=1)
+            norm_hidden_states = self.norm1(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+            norm_hidden_states = norm_hidden_states.squeeze(1)
+        else:
+            raise ValueError("Incorrect norm used")
+
+        if self.pos_embed is not None:
+            norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+        # 1. Retrieve lora scale.
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        # 2. Prepare GLIGEN inputs
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+
+
+        #type2
+        modify_norm_hidden_states = torch.cat([norm_hidden_states,garment_features[curr_garment_feat_idx]], dim=1)
+        curr_garment_feat_idx +=1
+        attn_output = self.attn1(
+            #norm_hidden_states,
+            modify_norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+        if self.use_ada_layer_norm_zero:
+            attn_output = gate_msa.unsqueeze(1) * attn_output
+        elif self.use_ada_layer_norm_single:
+            attn_output = gate_msa * attn_output
+
+
+        #type2
+        hidden_states = attn_output[:,:hidden_states.shape[-2],:] + hidden_states
+
+
+
+
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+
+        # 2.5 GLIGEN Control
+        if gligen_kwargs is not None:
+            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            if self.use_ada_layer_norm:
+                norm_hidden_states = self.norm2(hidden_states, timestep)
+            elif self.use_ada_layer_norm_zero or self.use_layer_norm:
+                norm_hidden_states = self.norm2(hidden_states)
+            elif self.use_ada_layer_norm_single:
+                # For PixArt norm2 isn't applied here:
+                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+                norm_hidden_states = hidden_states
+            elif self.use_ada_layer_norm_continuous:
+                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
+            else:
+                raise ValueError("Incorrect norm")
+
+            if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
+                norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+            attn_output = self.attn2(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                **cross_attention_kwargs,
+            )
+            hidden_states = attn_output + hidden_states
+
+        # 4. Feed-forward
+        if self.use_ada_layer_norm_continuous:
+            norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
+        elif not self.use_ada_layer_norm_single:
+            norm_hidden_states = self.norm3(hidden_states)
+
+        if self.use_ada_layer_norm_zero:
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+        if self.use_ada_layer_norm_single:
+            norm_hidden_states = self.norm2(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            ff_output = _chunked_feed_forward(
+                self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
+            )
+        else:
+            ff_output = self.ff(norm_hidden_states, scale=lora_scale)
+
+        if self.use_ada_layer_norm_zero:
+            ff_output = gate_mlp.unsqueeze(1) * ff_output
+        elif self.use_ada_layer_norm_single:
+            ff_output = gate_mlp * ff_output
+
+        hidden_states = ff_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
+        return hidden_states,curr_garment_feat_idx
+
+
+@maybe_allow_in_graph
+class TemporalBasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block for video like data.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        time_mix_inner_dim (`int`): The number of channels for temporal attention.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        time_mix_inner_dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        cross_attention_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.is_res = dim == time_mix_inner_dim
+
+        self.norm_in = nn.LayerNorm(dim)
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        self.norm_in = nn.LayerNorm(dim)
+        self.ff_in = FeedForward(
+            dim,
+            dim_out=time_mix_inner_dim,
+            activation_fn="geglu",
+        )
+
+        self.norm1 = nn.LayerNorm(time_mix_inner_dim)
+        self.attn1 = Attention(
+            query_dim=time_mix_inner_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            cross_attention_dim=None,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            self.norm2 = nn.LayerNorm(time_mix_inner_dim)
+            self.attn2 = Attention(
+                query_dim=time_mix_inner_dim,
+                cross_attention_dim=cross_attention_dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        self.norm3 = nn.LayerNorm(time_mix_inner_dim)
+        self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = None
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        # chunk dim should be hardcoded to 1 to have better speed vs. memory trade-off
+        self._chunk_dim = 1
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        num_frames: int,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+
+        batch_frames, seq_length, channels = hidden_states.shape
+        batch_size = batch_frames // num_frames
+
+        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, seq_length, channels)
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        hidden_states = hidden_states.reshape(batch_size * seq_length, num_frames, channels)
+
+        residual = hidden_states
+        hidden_states = self.norm_in(hidden_states)
+
+        if self._chunk_size is not None:
+            hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            hidden_states = self.ff_in(hidden_states)
+
+        if self.is_res:
+            hidden_states = hidden_states + residual
+
+        norm_hidden_states = self.norm1(hidden_states)
+        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
+        hidden_states = attn_output + hidden_states
+
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            norm_hidden_states = self.norm2(hidden_states)
+            attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+            hidden_states = attn_output + hidden_states
+
+        # 4. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states)
+
+        if self._chunk_size is not None:
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states)
+
+        if self.is_res:
+            hidden_states = ff_output + hidden_states
+        else:
+            hidden_states = ff_output
+
+        hidden_states = hidden_states[None, :].reshape(batch_size, seq_length, num_frames, channels)
+        hidden_states = hidden_states.permute(0, 2, 1, 3)
+        hidden_states = hidden_states.reshape(batch_size * num_frames, seq_length, channels)
+
+        return hidden_states
+
+
+class SkipFFTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        kv_input_dim: int,
+        kv_input_dim_proj_use_bias: bool,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        attention_out_bias: bool = True,
+    ):
+        super().__init__()
+        if kv_input_dim != dim:
+            self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias)
+        else:
+            self.kv_mapper = None
+
+        self.norm1 = RMSNorm(dim, 1e-06)
+
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim,
+            out_bias=attention_out_bias,
+        )
+
+        self.norm2 = RMSNorm(dim, 1e-06)
+
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            out_bias=attention_out_bias,
+        )
+
+    def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs):
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+
+        if self.kv_mapper is not None:
+            encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states))
+
+        norm_hidden_states = self.norm1(hidden_states)
+
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            **cross_attention_kwargs,
+        )
+
+        hidden_states = attn_output + hidden_states
+
+        norm_hidden_states = self.norm2(hidden_states)
+
+        attn_output = self.attn2(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            **cross_attention_kwargs,
+        )
+
+        hidden_states = attn_output + hidden_states
+
+        return hidden_states
+
+
+class FeedForward(nn.Module):
+    r"""
+    A feed-forward layer.
+
+    Parameters:
+        dim (`int`): The number of channels in the input.
+        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
+        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+        activation_fn: str = "geglu",
+        final_dropout: bool = False,
+        inner_dim=None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        if inner_dim is None:
+            inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+        linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear
+
+        if activation_fn == "gelu":
+            act_fn = GELU(dim, inner_dim, bias=bias)
+        if activation_fn == "gelu-approximate":
+            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
+        elif activation_fn == "geglu":
+            act_fn = GEGLU(dim, inner_dim, bias=bias)
+        elif activation_fn == "geglu-approximate":
+            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
+
+        self.net = nn.ModuleList([])
+        # project in
+        self.net.append(act_fn)
+        # project dropout
+        self.net.append(nn.Dropout(dropout))
+        # project out
+        self.net.append(linear_cls(inner_dim, dim_out, bias=bias))
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            self.net.append(nn.Dropout(dropout))
+
+    def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+        compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear)
+        for module in self.net:
+            if isinstance(module, compatible_cls):
+                hidden_states = module(hidden_states, scale)
+            else:
+                hidden_states = module(hidden_states)
+        return hidden_states
diff --git a/src/transformerhacked_garmnet.py b/src/transformerhacked_garmnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..488ceb24d2d088b999641aca91383ad9d223d652
--- /dev/null
+++ b/src/transformerhacked_garmnet.py
@@ -0,0 +1,460 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import ImagePositionalEmbeddings
+from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, is_torch_version
+from src.attentionhacked_garmnet import BasicTransformerBlock
+from diffusers.models.embeddings import PatchEmbed, PixArtAlphaTextProjection
+from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormSingle
+
+
+@dataclass
+class Transformer2DModelOutput(BaseOutput):
+    """
+    The output of [`Transformer2DModel`].
+
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+            The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
+            distributions for the unnoised latent pixels.
+    """
+
+    sample: torch.FloatTensor
+
+
+class Transformer2DModel(ModelMixin, ConfigMixin):
+    """
+    A 2D Transformer model for image-like data.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            The number of channels in the input and output (specify if the input is **continuous**).
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+            This is fixed during training since it is used to learn a number of position embeddings.
+        num_vector_embeds (`int`, *optional*):
+            The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*):
+            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+            added to the hidden states.
+
+            During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the `TransformerBlocks` attention should contain a bias parameter.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        patch_size: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_type: str = "layer_norm",
+        norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-5,
+        attention_type: str = "default",
+        caption_channels: int = None,
+    ):
+        super().__init__()
+        self.use_linear_projection = use_linear_projection
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+
+        conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
+        linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear
+
+        # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+        # Define whether input is continuous or discrete depending on configuration
+        self.is_input_continuous = (in_channels is not None) and (patch_size is None)
+        self.is_input_vectorized = num_vector_embeds is not None
+        self.is_input_patches = in_channels is not None and patch_size is not None
+
+        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+            deprecation_message = (
+                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+                " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
+                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+            )
+            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+            norm_type = "ada_norm"
+
+        if self.is_input_continuous and self.is_input_vectorized:
+            raise ValueError(
+                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is None."
+            )
+        elif self.is_input_vectorized and self.is_input_patches:
+            raise ValueError(
+                f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
+                " sure that either `num_vector_embeds` or `num_patches` is None."
+            )
+        elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
+            raise ValueError(
+                f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
+                f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
+            )
+
+        # 2. Define input layers
+        if self.is_input_continuous:
+            self.in_channels = in_channels
+
+            self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+            if use_linear_projection:
+                self.proj_in = linear_cls(in_channels, inner_dim)
+            else:
+                self.proj_in = conv_cls(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
+            assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
+
+            self.height = sample_size
+            self.width = sample_size
+            self.num_vector_embeds = num_vector_embeds
+            self.num_latent_pixels = self.height * self.width
+
+            self.latent_image_embedding = ImagePositionalEmbeddings(
+                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
+            )
+        elif self.is_input_patches:
+            assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
+
+            self.height = sample_size
+            self.width = sample_size
+
+            self.patch_size = patch_size
+            interpolation_scale = self.config.sample_size // 64  # => 64 (= 512 pixart) has interpolation scale 1
+            interpolation_scale = max(interpolation_scale, 1)
+            self.pos_embed = PatchEmbed(
+                height=sample_size,
+                width=sample_size,
+                patch_size=patch_size,
+                in_channels=in_channels,
+                embed_dim=inner_dim,
+                interpolation_scale=interpolation_scale,
+            )
+
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                    double_self_attention=double_self_attention,
+                    upcast_attention=upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    norm_eps=norm_eps,
+                    attention_type=attention_type,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        # 4. Define output layers
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if self.is_input_continuous:
+            # TODO: should use out_channels for continuous projections
+            if use_linear_projection:
+                self.proj_out = linear_cls(inner_dim, in_channels)
+            else:
+                self.proj_out = conv_cls(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            self.norm_out = nn.LayerNorm(inner_dim)
+            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
+        elif self.is_input_patches and norm_type != "ada_norm_single":
+            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+            self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
+            self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+        elif self.is_input_patches and norm_type == "ada_norm_single":
+            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+            self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
+            self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+
+        # 5. PixArt-Alpha blocks.
+        self.adaln_single = None
+        self.use_additional_conditions = False
+        if norm_type == "ada_norm_single":
+            self.use_additional_conditions = self.config.sample_size == 128
+            # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
+            # additional conditions until we find better name
+            self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)
+
+        self.caption_projection = None
+        if caption_channels is not None:
+            self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
+
+        self.gradient_checkpointing = False
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Dict[str, torch.Tensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ):
+        """
+        The [`Transformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+                Input `hidden_states`.
+            encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerZeroNorm`.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # Retrieve lora scale.
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        # 1. Input
+        if self.is_input_continuous:
+            batch, _, height, width = hidden_states.shape
+            residual = hidden_states
+
+            hidden_states = self.norm(hidden_states)
+            if not self.use_linear_projection:
+                hidden_states = (
+                    self.proj_in(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_in(hidden_states)
+                )
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+            else:
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+                hidden_states = (
+                    self.proj_in(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_in(hidden_states)
+                )
+
+        elif self.is_input_vectorized:
+            hidden_states = self.latent_image_embedding(hidden_states)
+        elif self.is_input_patches:
+            height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
+            hidden_states = self.pos_embed(hidden_states)
+
+            if self.adaln_single is not None:
+                if self.use_additional_conditions and added_cond_kwargs is None:
+                    raise ValueError(
+                        "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+                    )
+                batch_size = hidden_states.shape[0]
+                timestep, embedded_timestep = self.adaln_single(
+                    timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+                )
+
+        # 2. Blocks
+        if self.caption_projection is not None:
+            batch_size = hidden_states.shape[0]
+            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+        garment_features = []
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states,out_garment_feat = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    cross_attention_kwargs,
+                    class_labels,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states,out_garment_feat = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=class_labels,
+                )
+            garment_features += out_garment_feat
+        # 3. Output
+        if self.is_input_continuous:
+            if not self.use_linear_projection:
+                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+                hidden_states = (
+                    self.proj_out(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_out(hidden_states)
+                )
+            else:
+                hidden_states = (
+                    self.proj_out(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_out(hidden_states)
+                )
+                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+
+            output = hidden_states + residual
+        elif self.is_input_vectorized:
+            hidden_states = self.norm_out(hidden_states)
+            logits = self.out(hidden_states)
+            # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
+            logits = logits.permute(0, 2, 1)
+
+            # log(p(x_0))
+            output = F.log_softmax(logits.double(), dim=1).float()
+
+        if self.is_input_patches:
+            if self.config.norm_type != "ada_norm_single":
+                conditioning = self.transformer_blocks[0].norm1.emb(
+                    timestep, class_labels, hidden_dtype=hidden_states.dtype
+                )
+                shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+                hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+                hidden_states = self.proj_out_2(hidden_states)
+            elif self.config.norm_type == "ada_norm_single":
+                shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+                hidden_states = self.norm_out(hidden_states)
+                # Modulation
+                hidden_states = hidden_states * (1 + scale) + shift
+                hidden_states = self.proj_out(hidden_states)
+                hidden_states = hidden_states.squeeze(1)
+
+            # unpatchify
+            if self.adaln_single is None:
+                height = width = int(hidden_states.shape[1] ** 0.5)
+            hidden_states = hidden_states.reshape(
+                shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+            )
+            hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+            output = hidden_states.reshape(
+                shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
+            )
+
+        if not return_dict:
+            return (output,) ,garment_features
+
+        return Transformer2DModelOutput(sample=output),garment_features
diff --git a/src/transformerhacked_tryon.py b/src/transformerhacked_tryon.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a4ac5daf7c5e2f894b2ca0763d5950bd4845aae
--- /dev/null
+++ b/src/transformerhacked_tryon.py
@@ -0,0 +1,467 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import ImagePositionalEmbeddings
+from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, is_torch_version
+from src.attentionhacked_tryon import BasicTransformerBlock
+from diffusers.models.embeddings import PatchEmbed, PixArtAlphaTextProjection
+from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormSingle
+
+
+@dataclass
+class Transformer2DModelOutput(BaseOutput):
+    """
+    The output of [`Transformer2DModel`].
+
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+            The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
+            distributions for the unnoised latent pixels.
+    """
+
+    sample: torch.FloatTensor
+
+
+class Transformer2DModel(ModelMixin, ConfigMixin):
+    """
+    A 2D Transformer model for image-like data.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            The number of channels in the input and output (specify if the input is **continuous**).
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+            This is fixed during training since it is used to learn a number of position embeddings.
+        num_vector_embeds (`int`, *optional*):
+            The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*):
+            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
+            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
+            added to the hidden states.
+
+            During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the `TransformerBlocks` attention should contain a bias parameter.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        patch_size: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_type: str = "layer_norm",
+        norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-5,
+        attention_type: str = "default",
+        caption_channels: int = None,
+    ):
+        super().__init__()
+        self.use_linear_projection = use_linear_projection
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+
+        conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
+        linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear
+
+        # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+        # Define whether input is continuous or discrete depending on configuration
+        self.is_input_continuous = (in_channels is not None) and (patch_size is None)
+        self.is_input_vectorized = num_vector_embeds is not None
+        self.is_input_patches = in_channels is not None and patch_size is not None
+
+        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+            deprecation_message = (
+                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+                " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
+                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+            )
+            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+            norm_type = "ada_norm"
+
+        if self.is_input_continuous and self.is_input_vectorized:
+            raise ValueError(
+                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is None."
+            )
+        elif self.is_input_vectorized and self.is_input_patches:
+            raise ValueError(
+                f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
+                " sure that either `num_vector_embeds` or `num_patches` is None."
+            )
+        elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
+            raise ValueError(
+                f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
+                f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
+            )
+
+        # 2. Define input layers
+        if self.is_input_continuous:
+            self.in_channels = in_channels
+
+            self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+            if use_linear_projection:
+                self.proj_in = linear_cls(in_channels, inner_dim)
+            else:
+                self.proj_in = conv_cls(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
+            assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
+
+            self.height = sample_size
+            self.width = sample_size
+            self.num_vector_embeds = num_vector_embeds
+            self.num_latent_pixels = self.height * self.width
+
+            self.latent_image_embedding = ImagePositionalEmbeddings(
+                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
+            )
+        elif self.is_input_patches:
+            assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
+
+            self.height = sample_size
+            self.width = sample_size
+
+            self.patch_size = patch_size
+            interpolation_scale = self.config.sample_size // 64  # => 64 (= 512 pixart) has interpolation scale 1
+            interpolation_scale = max(interpolation_scale, 1)
+            self.pos_embed = PatchEmbed(
+                height=sample_size,
+                width=sample_size,
+                patch_size=patch_size,
+                in_channels=in_channels,
+                embed_dim=inner_dim,
+                interpolation_scale=interpolation_scale,
+            )
+
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                    double_self_attention=double_self_attention,
+                    upcast_attention=upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    norm_eps=norm_eps,
+                    attention_type=attention_type,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        # 4. Define output layers
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if self.is_input_continuous:
+            # TODO: should use out_channels for continuous projections
+            if use_linear_projection:
+                self.proj_out = linear_cls(inner_dim, in_channels)
+            else:
+                self.proj_out = conv_cls(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            self.norm_out = nn.LayerNorm(inner_dim)
+            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
+        elif self.is_input_patches and norm_type != "ada_norm_single":
+            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+            self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
+            self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+        elif self.is_input_patches and norm_type == "ada_norm_single":
+            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+            self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
+            self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+
+        # 5. PixArt-Alpha blocks.
+        self.adaln_single = None
+        self.use_additional_conditions = False
+        if norm_type == "ada_norm_single":
+            self.use_additional_conditions = self.config.sample_size == 128
+            # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
+            # additional conditions until we find better name
+            self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)
+
+        self.caption_projection = None
+        if caption_channels is not None:
+            self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
+
+        self.gradient_checkpointing = False
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Dict[str, torch.Tensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        garment_features=None,
+        curr_garment_feat_idx=0,
+        return_dict: bool = True,
+    ):
+        """
+        The [`Transformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+                Input `hidden_states`.
+            encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerZeroNorm`.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # Retrieve lora scale.
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        # 1. Input
+        if self.is_input_continuous:
+            batch, _, height, width = hidden_states.shape
+            residual = hidden_states
+
+            hidden_states = self.norm(hidden_states)
+            if not self.use_linear_projection:
+                hidden_states = (
+                    self.proj_in(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_in(hidden_states)
+                )
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+            else:
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+                hidden_states = (
+                    self.proj_in(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_in(hidden_states)
+                )
+
+        elif self.is_input_vectorized:
+            hidden_states = self.latent_image_embedding(hidden_states)
+        elif self.is_input_patches:
+            height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
+            hidden_states = self.pos_embed(hidden_states)
+
+            if self.adaln_single is not None:
+                if self.use_additional_conditions and added_cond_kwargs is None:
+                    raise ValueError(
+                        "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+                    )
+                batch_size = hidden_states.shape[0]
+                timestep, embedded_timestep = self.adaln_single(
+                    timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+                )
+
+        # 2. Blocks
+        if self.caption_projection is not None:
+            batch_size = hidden_states.shape[0]
+            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states,curr_garment_feat_idx = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    cross_attention_kwargs,
+                    class_labels,
+                    garment_features,
+                    curr_garment_feat_idx,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states,curr_garment_feat_idx = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=class_labels,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                )
+
+ 
+        # 3. Output
+        if self.is_input_continuous:
+            if not self.use_linear_projection:
+                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+                hidden_states = (
+                    self.proj_out(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_out(hidden_states)
+                )
+            else:
+                hidden_states = (
+                    self.proj_out(hidden_states, scale=lora_scale)
+                    if not USE_PEFT_BACKEND
+                    else self.proj_out(hidden_states)
+                )
+                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+
+            output = hidden_states + residual
+        elif self.is_input_vectorized:
+            hidden_states = self.norm_out(hidden_states)
+            logits = self.out(hidden_states)
+            # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
+            logits = logits.permute(0, 2, 1)
+
+            # log(p(x_0))
+            output = F.log_softmax(logits.double(), dim=1).float()
+
+        if self.is_input_patches:
+            if self.config.norm_type != "ada_norm_single":
+                conditioning = self.transformer_blocks[0].norm1.emb(
+                    timestep, class_labels, hidden_dtype=hidden_states.dtype
+                )
+                shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+                hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+                hidden_states = self.proj_out_2(hidden_states)
+            elif self.config.norm_type == "ada_norm_single":
+                shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
+                hidden_states = self.norm_out(hidden_states)
+                # Modulation
+                hidden_states = hidden_states * (1 + scale) + shift
+                hidden_states = self.proj_out(hidden_states)
+                hidden_states = hidden_states.squeeze(1)
+
+            # unpatchify
+            if self.adaln_single is None:
+                height = width = int(hidden_states.shape[1] ** 0.5)
+            hidden_states = hidden_states.reshape(
+                shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+            )
+            hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+            output = hidden_states.reshape(
+                shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
+            )
+
+        if not return_dict:
+            return (output,),curr_garment_feat_idx
+
+        return Transformer2DModelOutput(sample=output),curr_garment_feat_idx
diff --git a/src/tryon_pipeline.py b/src/tryon_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..628cedc2e26c38c77971aa798af323c8518ec68a
--- /dev/null
+++ b/src/tryon_pipeline.py
@@ -0,0 +1,1895 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    LoRAAttnProcessor2_0,
+    LoRAXFormersAttnProcessor,
+    XFormersAttnProcessor,
+)
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionXLInpaintPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0",
+        ...     torch_dtype=torch.float16,
+        ...     variant="fp16",
+        ...     use_safetensors=True,
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+        >>> init_image = load_image(img_url).convert("RGB")
+        >>> mask_image = load_image(mask_url).convert("RGB")
+
+        >>> prompt = "A majestic tiger sitting on a bench"
+        >>> image = pipe(
+        ...     prompt=prompt, image=init_image, mask_image=mask_image, num_inference_steps=50, strength=0.80
+        ... ).images[0]
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+def mask_pil_to_torch(mask, height, width):
+    # preprocess mask
+    if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+        mask = [mask]
+
+    if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+        mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
+        mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+        mask = mask.astype(np.float32) / 255.0
+    elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+        mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool = False):
+    """
+    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
+    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
+    ``image`` and ``1`` for the ``mask``.
+
+    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+
+    Args:
+        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+
+
+    Raises:
+        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+            (ot the other way around).
+
+    Returns:
+        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
+            dimensions: ``batch x channels x height x width``.
+    """
+
+    # checkpoint. TOD(Yiyi) - need to clean this up later
+    deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead"
+    deprecate(
+        "prepare_mask_and_masked_image",
+        "0.30.0",
+        deprecation_message,
+    )
+    if image is None:
+        raise ValueError("`image` input cannot be undefined.")
+
+    if mask is None:
+        raise ValueError("`mask_image` input cannot be undefined.")
+
+    if isinstance(image, torch.Tensor):
+        if not isinstance(mask, torch.Tensor):
+            mask = mask_pil_to_torch(mask, height, width)
+
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+
+        # Batch and add channel dim for single mask
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0).unsqueeze(0)
+
+        # Batch single mask or add channel dim
+        if mask.ndim == 3:
+            # Single batched mask, no channel dim or single mask not batched but channel dim
+            if mask.shape[0] == 1:
+                mask = mask.unsqueeze(0)
+
+            # Batched masks no channel dim
+            else:
+                mask = mask.unsqueeze(1)
+
+        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+        # assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+
+        # Check image is in [-1, 1]
+        # if image.min() < -1 or image.max() > 1:
+        #    raise ValueError("Image should be in [-1, 1] range")
+
+        # Check mask is in [0, 1]
+        if mask.min() < 0 or mask.max() > 1:
+            raise ValueError("Mask should be in [0, 1] range")
+
+        # Binarize mask
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+
+        # Image as float32
+        image = image.to(dtype=torch.float32)
+    elif isinstance(mask, torch.Tensor):
+        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            # resize all images w.r.t passed height an width
+            image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+        mask = mask_pil_to_torch(mask, height, width)
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+
+    if image.shape[1] == 4:
+        # images are in latent space and thus can't
+        # be masked set masked_image to None
+        # we assume that the checkpoint is not an inpainting
+        # checkpoint. TOD(Yiyi) - need to clean this up later
+        masked_image = None
+    else:
+        masked_image = image * (mask < 0.5)
+
+    # n.b. ensure backwards compatibility as old function does not return image
+    if return_image:
+        return mask, masked_image, image
+
+    return mask, masked_image
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class StableDiffusionXLInpaintPipeline(
+    DiffusionPipeline,
+    TextualInversionLoaderMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    FromSingleFileMixin,
+    IPAdapterMixin,
+):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion XL.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion XL uses the text portion of
+            [CLIP](https://huggingface.co./docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co./openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([` CLIPTextModelWithProjection`]):
+            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
+            [CLIP](https://huggingface.co./docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
+            specifically the
+            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co./laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
+            variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co./docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`CLIPTokenizer`):
+            Second Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co./docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
+            Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config
+            of `stabilityai/stable-diffusion-xl-refiner-1-0`.
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
+
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+        "mask",
+        "masked_image_latents",
+    ]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        requires_aesthetics_score: bool = False,
+        force_zeros_for_empty_prompt: bool = True,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            scheduler=scheduler,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+        )
+
+
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+        # print(image.shape)
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt):
+        if not isinstance(ip_adapter_image, list):
+            ip_adapter_image = [ip_adapter_image]
+
+        # if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+        #     raise ValueError(
+        #         f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+        #     )
+
+        image_embeds = []
+        # print(ip_adapter_image.shape)
+        for single_ip_adapter_image in ip_adapter_image:
+            # print(single_ip_adapter_image.shape)
+            # ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            output_hidden_state = not isinstance(self.unet.encoder_hid_proj, ImageProjection)
+            # print(output_hidden_state)
+            single_image_embeds, single_negative_image_embeds = self.encode_image(
+                single_ip_adapter_image, device, 1, output_hidden_state
+            )
+            # print(single_image_embeds.shape)
+            # single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+            # single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            # print(single_image_embeds.shape)
+            if self.do_classifier_free_guidance:
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                single_image_embeds = single_image_embeds.to(device)
+
+            image_embeds.append(single_image_embeds)
+
+        return image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str,
+        prompt_2: Optional[str] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+        negative_prompt_2: Optional[str] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder, lora_scale)
+
+            if self.text_encoder_2 is not None:
+                if not USE_PEFT_BACKEND:
+                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+                else:
+                    scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # Define tokenizers and text encoders
+        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+        )
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # textual inversion: procecss multi-vector tokens if necessary
+            prompt_embeds_list = []
+            prompts = [prompt, prompt_2]
+            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+                text_inputs = tokenizer(
+                    prompt,
+                    padding="max_length",
+                    max_length=tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                text_input_ids = text_inputs.input_ids
+                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                    text_input_ids, untruncated_ids
+                ):
+                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+                    logger.warning(
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
+                    )
+
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
+
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                pooled_prompt_embeds = prompt_embeds[0]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+                prompt_embeds_list.append(prompt_embeds)
+
+            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+
+        # get unconditional embeddings for classifier free guidance
+        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+        elif do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+            # normalize str to list
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_2 = (
+                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+            )
+
+            uncond_tokens: List[str]
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = [negative_prompt, negative_prompt_2]
+
+            negative_prompt_embeds_list = []
+            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+                if isinstance(self, TextualInversionLoaderMixin):
+                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+                max_length = prompt_embeds.shape[1]
+                uncond_input = tokenizer(
+                    negative_prompt,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # We are only ALWAYS interested in the pooled output of the final text encoder
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+
+        if self.text_encoder_2 is not None:
+            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        else:
+            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            if self.text_encoder_2 is not None:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
+                bs_embed * num_images_per_prompt, -1
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        mask_image,
+        height,
+        width,
+        strength,
+        callback_steps,
+        output_type,
+        negative_prompt=None,
+        negative_prompt_2=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        padding_mask_crop=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+        if padding_mask_crop is not None:
+            if not isinstance(image, PIL.Image.Image):
+                raise ValueError(
+                    f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
+                )
+            if not isinstance(mask_image, PIL.Image.Image):
+                raise ValueError(
+                    f"The mask image should be a PIL image when inpainting mask crop, but is of type"
+                    f" {type(mask_image)}."
+                )
+            if output_type != "pil":
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        add_noise=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if image.shape[1] == 4:
+            image_latents = image.to(device=device, dtype=dtype)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+        elif return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+            image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None and add_noise:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
+            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+        elif add_noise:
+            noise = latents.to(device)
+            latents = noise * self.scheduler.init_noise_sigma
+        else:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = image_latents.to(device)
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        dtype = image.dtype
+        if self.vae.config.force_upcast:
+            image = image.float()
+            self.vae.to(dtype=torch.float32)
+
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        if self.vae.config.force_upcast:
+            self.vae.to(dtype)
+
+        image_latents = image_latents.to(dtype)
+        image_latents = self.vae.config.scaling_factor * image_latents
+
+        return image_latents
+
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        if masked_image is not None and masked_image.shape[1] == 4:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = None
+
+        if masked_image is not None:
+            if masked_image_latents is None:
+                masked_image = masked_image.to(device=device, dtype=dtype)
+                masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
+
+            if masked_image_latents.shape[0] < batch_size:
+                if not batch_size % masked_image_latents.shape[0] == 0:
+                    raise ValueError(
+                        "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                        f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                        " Make sure the number of images that you pass is divisible by the total requested batch size."
+                    )
+                masked_image_latents = masked_image_latents.repeat(
+                    batch_size // masked_image_latents.shape[0], 1, 1, 1
+                )
+
+            masked_image_latents = (
+                torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+            )
+
+            # aligning device to prevent device errors when concating it with the latent model input
+            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
+        # get the original timestep using init_timestep
+        if denoising_start is None:
+            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+            t_start = max(num_inference_steps - init_timestep, 0)
+        else:
+            t_start = 0
+
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        # Strength is irrelevant if we directly request a timestep to start at;
+        # that is, strength is determined by the denoising_start instead.
+        if denoising_start is not None:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_start * self.scheduler.config.num_train_timesteps)
+                )
+            )
+
+            num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
+            if self.scheduler.order == 2 and num_inference_steps % 2 == 0:
+                # if the scheduler is a 2nd order scheduler we might have to do +1
+                # because `num_inference_steps` might be even given that every timestep
+                # (except the highest one) is duplicated. If `num_inference_steps` is even it would
+                # mean that we cut the timesteps in the middle of the denoising step
+                # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
+                # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
+                num_inference_steps = num_inference_steps + 1
+
+            # because t_n+1 >= t_n, we slice the timesteps starting from the end
+            timesteps = timesteps[-num_inference_steps:]
+            return timesteps, num_inference_steps
+
+        return timesteps, num_inference_steps - t_start
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+        passed_add_embed_dim = (
+            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+        )
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+        if (
+            expected_add_embed_dim > passed_add_embed_dim
+            and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
+            )
+        elif (
+            expected_add_embed_dim < passed_add_embed_dim
+            and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
+        ):
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
+            )
+        elif expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
+
+        return add_time_ids, add_neg_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_0,
+            ),
+        )
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stages where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        if not hasattr(self, "unet"):
+            raise ValueError("The pipeline must have `unet` for using FreeU.")
+        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
+    def disable_freeu(self):
+        """Disables the FreeU mechanism if enabled."""
+        self.unet.disable_freeu()
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
+    def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        Args:
+            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
+            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
+        """
+        self.fusing_unet = False
+        self.fusing_vae = False
+
+        if unet:
+            self.fusing_unet = True
+            self.unet.fuse_qkv_projections()
+            self.unet.set_attn_processor(FusedAttnProcessor2_0())
+
+        if vae:
+            if not isinstance(self.vae, AutoencoderKL):
+                raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
+
+            self.fusing_vae = True
+            self.vae.fuse_qkv_projections()
+            self.vae.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
+    def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+        """Disable QKV projection fusion if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        Args:
+            unet (`bool`, defaults to `True`): To apply fusion on the UNet.
+            vae (`bool`, defaults to `True`): To apply fusion on the VAE.
+
+        """
+        if unet:
+            if not self.fusing_unet:
+                logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.unet.unfuse_qkv_projections()
+                self.fusing_unet = False
+
+        if vae:
+            if not self.fusing_vae:
+                logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
+            else:
+                self.vae.unfuse_qkv_projections()
+                self.fusing_vae = False
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            timesteps (`torch.Tensor`):
+                generate embedding vectors at these timesteps
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+    @property
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs
+
+    @property
+    def denoising_end(self):
+        return self._denoising_end
+
+    @property
+    def denoising_start(self):
+        return self._denoising_start
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: torch.FloatTensor = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        padding_mask_crop: Optional[int] = None,
+        strength: float = 0.9999,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_start: Optional[float] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        output_type: Optional[str] = "pil",
+        cloth =None,
+        pose_img = None,
+        text_embeds_cloth=None,
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Tuple[int, int] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Tuple[int, int] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        aesthetic_score: float = 6.0,
+        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
+        pooled_prompt_embeds_c=None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co./stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co./stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
+                `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
+                contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
+                the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
+                and contain information inreleant for inpainging, such as background.
+            strength (`float`, *optional*, defaults to 0.9999):
+                Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
+                between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
+                `strength`. The number of denoising steps depends on the amount of noise initially added. When
+                `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
+                iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
+                portion of the reference `image`. Note that in the case of `denoising_start` being declared as an
+                integer, the value of `strength` will be ignored.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            denoising_start (`float`, *optional*):
+                When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
+                bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
+                it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
+                strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
+                is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
+                Output**](https://huggingface.co./docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
+                denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
+                final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
+                forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co./docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            aesthetic_score (`float`, *optional*, defaults to 6.0):
+                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952).
+            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
+                Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co./papers/2307.01952](https://huggingface.co./papers/2307.01952). Can be used to
+                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple. `tuple. When returning a tuple, the first element is a list with the generated images.
+        """
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            mask_image,
+            height,
+            width,
+            strength,
+            callback_steps,
+            output_type,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+            padding_mask_crop,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._denoising_start = denoising_start
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # 4. set timesteps
+        def denoising_value_valid(dnv):
+            return isinstance(self.denoising_end, float) and 0 < dnv < 1
+
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=self.denoising_start if denoising_value_valid else None,
+        )
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+
+        # 5. Preprocess mask and image
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        mask = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+        if masked_image_latents is not None:
+            masked_image = masked_image_latents
+        elif init_image.shape[1] == 4:
+            # if images are in latent space, we can't mask it
+            masked_image = None
+        else:
+            masked_image = init_image * (mask < 0.5)
+
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_unet = self.unet.config.in_channels
+        return_image_latents = num_channels_unet == 4
+
+        add_noise = True if self.denoising_start is None else False
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            add_noise=add_noise,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 7. Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            self.do_classifier_free_guidance,
+        )
+        pose_img = pose_img.to(device=device, dtype=prompt_embeds.dtype)
+
+        pose_img = self.vae.encode(pose_img).latent_dist.sample()
+        pose_img = pose_img * self.vae.config.scaling_factor
+
+        # pose_img = self._encode_vae_image(pose_img, generator=generator)
+
+        pose_img = (
+                torch.cat([pose_img] * 2) if self.do_classifier_free_guidance else pose_img
+        )
+        cloth = self._encode_vae_image(cloth, generator=generator)
+
+        # # 8. Check that sizes of mask, masked image and latents match
+        # if num_channels_unet == 9:
+        #     # default case for runwayml/stable-diffusion-inpainting
+        #     num_channels_mask = mask.shape[1]
+        #     num_channels_masked_image = masked_image_latents.shape[1]
+        #     if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+        #         raise ValueError(
+        #             f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+        #             f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+        #             f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+        #             f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+        #             " `pipeline.unet` or your `mask_image` or `image` input."
+        #         )
+        # elif num_channels_unet != 4:
+        #     raise ValueError(
+        #         f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
+        #     )
+        # 8.1 Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        height, width = latents.shape[-2:]
+        height = height * self.vae_scale_factor
+        width = width * self.vae_scale_factor
+
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+
+        # 10. Prepare added time ids & embeddings
+        if negative_original_size is None:
+            negative_original_size = original_size
+        if negative_target_size is None:
+            negative_target_size = target_size
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids, add_neg_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            aesthetic_score,
+            negative_aesthetic_score,
+            negative_original_size,
+            negative_crops_coords_top_left,
+            negative_target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
+            add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
+
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device)
+
+        if ip_adapter_image is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image, device, batch_size * num_images_per_prompt
+            )
+            # print("a")
+            # print(image_embeds[0].shape)
+
+        # 11. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+        if (
+            self.denoising_end is not None
+            and self.denoising_start is not None
+            and denoising_value_valid(self.denoising_end)
+            and denoising_value_valid(self.denoising_start)
+            and self.denoising_start >= self.denoising_end
+        ):
+            raise ValueError(
+                f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
+                + f" {self.denoising_end} when using type float."
+            )
+        elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+
+        # 11.1 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                # concat latents, mask, masked_image_latents in the channel dimension
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+
+                # bsz = mask.shape[0]
+                if num_channels_unet == 13:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents,pose_img], dim=1)
+
+                # if num_channels_unet == 9:
+                #     latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds[0]
+                # down,reference_features = self.UNet_Encoder(cloth,t, text_embeds_cloth,added_cond_kwargs= {"text_embeds": pooled_prompt_embeds_c, "time_ids": add_time_ids},return_dict=False)
+                down,reference_features = self.unet_encoder(cloth,t, text_embeds_cloth,return_dict=False)
+                # print(type(reference_features))
+                # print(reference_features)
+                reference_features = list(reference_features)
+                # print(len(reference_features))
+                # for elem in reference_features:
+                #     print(elem.shape)
+                # exit(1)
+                if self.do_classifier_free_guidance:
+                    reference_features = [torch.cat([torch.zeros_like(d), d]) for d in reference_features]
+
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                    garment_features=reference_features,
+                )[0]
+                # noise_pred = self.unet(latent_model_input, t, 
+                #                             prompt_embeds,timestep_cond=timestep_cond,cross_attention_kwargs=self.cross_attention_kwargs,added_cond_kwargs=added_cond_kwargs,down_block_additional_attn=down ).sample
+
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
+                    mask = callback_outputs.pop("mask", mask)
+                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        # else:
+        #     return StableDiffusionXLPipelineOutput(images=latents)
+
+
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        # if not return_dict:
+        return (image,)
+
+        # return StableDiffusionXLPipelineOutput(images=image)
\ No newline at end of file
diff --git a/src/unet_block_hacked_garmnet.py b/src/unet_block_hacked_garmnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6e80c058014b83617fa3fd4e26daeb5a0a11ec3
--- /dev/null
+++ b/src/unet_block_hacked_garmnet.py
@@ -0,0 +1,3579 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from diffusers.utils import is_torch_version, logging
+from diffusers.utils.torch_utils import apply_freeu
+from diffusers.models.activations import get_activation
+from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0
+from diffusers.models.dual_transformer_2d import DualTransformer2DModel
+from diffusers.models.normalization import AdaGroupNorm
+from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D
+from src.transformerhacked_garmnet import Transformer2DModel
+from einops import rearrange
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def get_down_block(
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    downsample_padding: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    downsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+):
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        logger.warn(
+            f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+    if down_block_type == "DownBlock2D":
+        return DownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "ResnetDownsampleBlock2D":
+        return ResnetDownsampleBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+        )
+    elif down_block_type == "AttnDownBlock2D":
+        if add_downsample is False:
+            downsample_type = None
+        else:
+            downsample_type = downsample_type or "conv"  # default to 'conv'
+        return AttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            downsample_type=downsample_type,
+        )
+    elif down_block_type == "CrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
+        return CrossAttnDownBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
+        )
+    elif down_block_type == "SimpleCrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D")
+        return SimpleCrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+            only_cross_attention=only_cross_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+    elif down_block_type == "SkipDownBlock2D":
+        return SkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "AttnSkipDownBlock2D":
+        return AttnSkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "DownEncoderBlock2D":
+        return DownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "AttnDownEncoderBlock2D":
+        return AttnDownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "KDownBlock2D":
+        return KDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif down_block_type == "KCrossAttnDownBlock2D":
+        return KCrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            add_self_attention=True if not add_downsample else False,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(
+    up_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    prev_output_channel: int,
+    temb_channels: int,
+    add_upsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    resolution_idx: Optional[int] = None,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    upsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+) -> nn.Module:
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        logger.warn(
+            f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    if up_block_type == "UpBlock2D":
+        return UpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "ResnetUpsampleBlock2D":
+        return ResnetUpsampleBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+        )
+    elif up_block_type == "CrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
+        return CrossAttnUpBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
+        )
+    elif up_block_type == "SimpleCrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D")
+        return SimpleCrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+            only_cross_attention=only_cross_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+    elif up_block_type == "AttnUpBlock2D":
+        if add_upsample is False:
+            upsample_type = None
+        else:
+            upsample_type = upsample_type or "conv"  # default to 'conv'
+
+        return AttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            upsample_type=upsample_type,
+        )
+    elif up_block_type == "SkipUpBlock2D":
+        return SkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "AttnSkipUpBlock2D":
+        return AttnSkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "UpDecoderBlock2D":
+        return UpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temb_channels=temb_channels,
+        )
+    elif up_block_type == "AttnUpDecoderBlock2D":
+        return AttnUpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temb_channels=temb_channels,
+        )
+    elif up_block_type == "KUpBlock2D":
+        return KUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif up_block_type == "KCrossAttnUpBlock2D":
+        return KCrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+        )
+
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class AutoencoderTinyBlock(nn.Module):
+    """
+    Tiny Autoencoder block used in [`AutoencoderTiny`]. It is a mini residual module consisting of plain conv + ReLU
+    blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        out_channels (`int`): The number of output channels.
+        act_fn (`str`):
+            ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`.
+
+    Returns:
+        `torch.FloatTensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to
+        `out_channels`.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, act_fn: str):
+        super().__init__()
+        act_fn = get_activation(act_fn)
+        self.conv = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
+            act_fn,
+            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+            act_fn,
+            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+        )
+        self.skip = (
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+        self.fuse = nn.ReLU()
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        return self.fuse(self.conv(x) + self.skip(x))
+
+
+class UNetMidBlock2D(nn.Module):
+    """
+    A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        temb_channels (`int`): The number of temporal embedding channels.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
+        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
+        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
+        resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
+            The type of normalization to apply to the time embeddings. This can help to improve the performance of the
+            model on tasks with long-range temporal dependencies.
+        resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
+        resnet_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use in the group normalization layers of the resnet blocks.
+        attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
+        resnet_pre_norm (`bool`, *optional*, defaults to `True`):
+            Whether to use pre-normalization for the resnet blocks.
+        add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
+        attention_head_dim (`int`, *optional*, defaults to 1):
+            Dimension of a single attention head. The number of attention heads is determined based on this value and
+            the number of input channels.
+        output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
+
+    Returns:
+        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
+        in_channels, height, width)`.
+
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        attn_groups: Optional[int] = None,
+        resnet_pre_norm: bool = True,
+        add_attention: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+    ):
+        super().__init__()
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        self.add_attention = add_attention
+
+        if attn_groups is None:
+            attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
+            )
+            attention_head_dim = in_channels
+
+        for _ in range(num_layers):
+            if self.add_attention:
+                attentions.append(
+                    Attention(
+                        in_channels,
+                        heads=in_channels // attention_head_dim,
+                        dim_head=attention_head_dim,
+                        rescale_output_factor=output_scale_factor,
+                        eps=resnet_eps,
+                        norm_num_groups=attn_groups,
+                        spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                        residual_connection=True,
+                        bias=True,
+                        upcast_softmax=True,
+                        _from_deprecated_attn_block=True,
+                    )
+                )
+            else:
+                attentions.append(None)
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if attn is not None:
+                hidden_states = attn(hidden_states, temb=temb)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+class UNetMidBlock2DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for i in range(num_layers):
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+        garment_features = []
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                # hidden_states = attn(
+                hidden_states,out_garment_feat = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )
+                hidden_states=hidden_states[0]
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                # hidden_states= attn(
+                hidden_states,out_garment_feat = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )
+                hidden_states=hidden_states[0]
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+            garment_features += out_garment_feat
+        return hidden_states,garment_features
+        # return hidden_states 
+
+
+class UNetMidBlock2DSimpleCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+
+        self.attention_head_dim = attention_head_dim
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        self.num_heads = in_channels // self.attention_head_dim
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                skip_time_act=skip_time_act,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=in_channels,
+                    cross_attention_dim=in_channels,
+                    heads=self.num_heads,
+                    dim_head=self.attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            # attn
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=mask,
+                **cross_attention_kwargs,
+            )
+
+            # resnet
+            hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states
+
+
+class AttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        downsample_type: str = "conv",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        self.downsample_type = downsample_type
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if downsample_type == "conv":
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        elif downsample_type == "resnet":
+            self.downsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        down=True,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            cross_attention_kwargs.update({"scale": lora_scale})
+            hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                if self.downsample_type == "resnet":
+                    hidden_states = downsampler(hidden_states, temb=temb, scale=lora_scale)
+                else:
+                    hidden_states = downsampler(hidden_states, scale=lora_scale)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        add_downsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        additional_residuals: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        blocks = list(zip(self.resnets, self.attentions))
+        garment_features = []
+        for i, (resnet, attn) in enumerate(blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states,out_garment_feat = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )
+                hidden_states=hidden_states[0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states,out_garment_feat = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )
+                hidden_states=hidden_states[0]
+            garment_features += out_garment_feat
+            # apply additional residuals to the output of the last pair of resnet and attention blocks
+            if i == len(blocks) - 1 and additional_residuals is not None:
+                hidden_states = hidden_states + additional_residuals
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale=lora_scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states,garment_features
+
+
+class DownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale=scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class DownEncoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=None, scale=scale)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale)
+
+        return hidden_states
+
+
+class AttnDownEncoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb=None, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale)
+
+        return hidden_states
+
+
+class AttnSkipDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_downsample: bool = True,
+    ):
+        super().__init__()
+        self.attentions = nn.ModuleList([])
+        self.resnets = nn.ModuleList([])
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min(in_channels // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            self.attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=32,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        if add_downsample:
+            self.resnet_down = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                down=True,
+                kernel="fir",
+            )
+            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
+            self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+        else:
+            self.resnet_down = None
+            self.downsamplers = None
+            self.skip_conv = None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]:
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            hidden_states = self.resnet_down(hidden_states, temb, scale=scale)
+            for downsampler in self.downsamplers:
+                skip_sample = downsampler(skip_sample)
+
+            hidden_states = self.skip_conv(skip_sample) + hidden_states
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states, skip_sample
+
+
+class SkipDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        self.resnets = nn.ModuleList([])
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min(in_channels // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        if add_downsample:
+            self.resnet_down = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                down=True,
+                kernel="fir",
+            )
+            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
+            self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+        else:
+            self.resnet_down = None
+            self.downsamplers = None
+            self.skip_conv = None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb, scale)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            hidden_states = self.resnet_down(hidden_states, temb, scale)
+            for downsampler in self.downsamplers:
+                skip_sample = downsampler(skip_sample)
+
+            hidden_states = self.skip_conv(skip_sample) + hidden_states
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states, skip_sample
+
+
+class ResnetDownsampleBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        skip_time_act: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        down=True,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale)
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, temb, scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class SimpleCrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+
+        resnets = []
+        attentions = []
+
+        self.attention_head_dim = attention_head_dim
+        self.num_heads = out_channels // self.attention_head_dim
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=out_channels,
+                    cross_attention_dim=out_channels,
+                    heads=self.num_heads,
+                    dim_head=attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        down=True,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, temb, scale=lora_scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class KDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 4,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+        resnet_group_size: int = 32,
+        add_downsample: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    dropout=dropout,
+                    temb_channels=temb_channels,
+                    groups=groups,
+                    groups_out=groups_out,
+                    eps=resnet_eps,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            # YiYi's comments- might be able to use FirDownsample2D, look into details later
+            self.downsamplers = nn.ModuleList([KDownsample2D()])
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale)
+
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+        return hidden_states, output_states
+
+
+class KCrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        cross_attention_dim: int,
+        dropout: float = 0.0,
+        num_layers: int = 4,
+        resnet_group_size: int = 32,
+        add_downsample: bool = True,
+        attention_head_dim: int = 64,
+        add_self_attention: bool = False,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    dropout=dropout,
+                    temb_channels=temb_channels,
+                    groups=groups,
+                    groups_out=groups_out,
+                    eps=resnet_eps,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+            attentions.append(
+                KAttentionBlock(
+                    out_channels,
+                    out_channels // attention_head_dim,
+                    attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                    temb_channels=temb_channels,
+                    attention_bias=True,
+                    add_self_attention=add_self_attention,
+                    cross_attention_norm="layer_norm",
+                    group_size=resnet_group_size,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.attentions = nn.ModuleList(attentions)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList([KDownsample2D()])
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+
+            if self.downsamplers is None:
+                output_states += (None,)
+            else:
+                output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+        return hidden_states, output_states
+
+
+class AttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        upsample_type: str = "conv",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.upsample_type = upsample_type
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if upsample_type == "conv":
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        elif upsample_type == "resnet":
+            self.upsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        up=True,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                if self.upsample_type == "resnet":
+                    hidden_states = upsampler(hidden_states, temb=temb, scale=scale)
+                else:
+                    hidden_states = upsampler(hidden_states, scale=scale)
+
+        return hidden_states
+
+
+class CrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+        garment_features = []
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            # print("h.shape")
+            # print(h.shape)
+            # print("hidden_states.shape)
+            # print(hidden_states.shape)
+            # print("attn_block")
+            # print(attn)
+
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            # print(hidden_states.shape)
+            # print(encoder_hidden_states.shape)
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states,out_garment_feat = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )
+                hidden_states=hidden_states[0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states,out_garment_feat = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )
+                hidden_states=hidden_states[0]
+            garment_features += out_garment_feat
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
+
+        return hidden_states,garment_features
+
+
+class UpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
+
+        return hidden_states
+    # def forward(
+    #     self,
+    #     hidden_states: torch.FloatTensor,
+    #     res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+    #     temb: Optional[torch.FloatTensor] = None,
+    #     upsample_size: Optional[int] = None,
+    #     scale: float = 1.0,
+    #     zero_block=None,
+    #     hint=None,
+    # ) -> torch.FloatTensor:
+    #     is_freeu_enabled = (
+    #         getattr(self, "s1", None)
+    #         and getattr(self, "s2", None)
+    #         and getattr(self, "b1", None)
+    #         and getattr(self, "b2", None)
+    #     )
+
+    #     # print(len(self.resnets))
+    #     # print(len(zero_block))
+    #     # print(len(hint))
+    #     # for resnet in self.resnets:
+    #     for resnet, zero,h in zip(self.resnets,zero_block,hint):
+
+    #         # pop res hidden states
+    #         res_hidden_states = res_hidden_states_tuple[-1]
+    #         res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+    #         res_hidden_states = res_hidden_states + zero(h)
+    #         # FreeU: Only operate on the first two stages
+    #         if is_freeu_enabled:
+    #             hidden_states, res_hidden_states = apply_freeu(
+    #                 self.resolution_idx,
+    #                 hidden_states,
+    #                 res_hidden_states,
+    #                 s1=self.s1,
+    #                 s2=self.s2,
+    #                 b1=self.b1,
+    #                 b2=self.b2,
+    #             )
+
+    #         # print(hidden_states.shape)
+    #         # # print(h.shape)
+    #         # print(res_hidden_states.shape)
+    #         hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+    #         # print(hidden_states.shape)
+
+    #         if self.training and self.gradient_checkpointing:
+
+    #             def create_custom_forward(module):
+    #                 def custom_forward(*inputs):
+    #                     return module(*inputs)
+
+    #                 return custom_forward
+
+    #             if is_torch_version(">=", "1.11.0"):
+    #                 hidden_states = torch.utils.checkpoint.checkpoint(
+    #                     create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+    #                 )
+    #             else:
+    #                 hidden_states = torch.utils.checkpoint.checkpoint(
+    #                     create_custom_forward(resnet), hidden_states, temb
+    #                 )
+    #         else:
+    #             hidden_states = resnet(hidden_states, temb, scale=scale)
+
+    #     if self.upsamplers is not None:
+    #         for upsampler in self.upsamplers:
+    #             hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
+
+    #     return hidden_states
+
+
+class UpDecoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        temb_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> torch.FloatTensor:
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class AttnUpDecoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        temb_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups if resnet_time_scale_shift != "spatial" else None,
+                    spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> torch.FloatTensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb=temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, temb=temb, **cross_attention_kwargs)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, scale=scale)
+
+        return hidden_states
+
+
+class AttnSkipUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        self.attentions = nn.ModuleList([])
+        self.resnets = nn.ModuleList([])
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min(resnet_in_channels + res_skip_channels // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        self.attentions.append(
+            Attention(
+                out_channels,
+                heads=out_channels // attention_head_dim,
+                dim_head=attention_head_dim,
+                rescale_output_factor=output_scale_factor,
+                eps=resnet_eps,
+                norm_num_groups=32,
+                residual_connection=True,
+                bias=True,
+                upcast_softmax=True,
+                _from_deprecated_attn_block=True,
+            )
+        )
+
+        self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
+        if add_upsample:
+            self.resnet_up = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                groups_out=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                up=True,
+                kernel="fir",
+            )
+            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+            self.skip_norm = torch.nn.GroupNorm(
+                num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
+            )
+            self.act = nn.SiLU()
+        else:
+            self.resnet_up = None
+            self.skip_conv = None
+            self.skip_norm = None
+            self.act = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample=None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        cross_attention_kwargs = {"scale": scale}
+        hidden_states = self.attentions[0](hidden_states, **cross_attention_kwargs)
+
+        if skip_sample is not None:
+            skip_sample = self.upsampler(skip_sample)
+        else:
+            skip_sample = 0
+
+        if self.resnet_up is not None:
+            skip_sample_states = self.skip_norm(hidden_states)
+            skip_sample_states = self.act(skip_sample_states)
+            skip_sample_states = self.skip_conv(skip_sample_states)
+
+            skip_sample = skip_sample + skip_sample_states
+
+            hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
+
+        return hidden_states, skip_sample
+
+
+class SkipUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_upsample: bool = True,
+        upsample_padding: int = 1,
+    ):
+        super().__init__()
+        self.resnets = nn.ModuleList([])
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
+        if add_upsample:
+            self.resnet_up = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                groups_out=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                up=True,
+                kernel="fir",
+            )
+            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+            self.skip_norm = torch.nn.GroupNorm(
+                num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
+            )
+            self.act = nn.SiLU()
+        else:
+            self.resnet_up = None
+            self.skip_conv = None
+            self.skip_norm = None
+            self.act = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample=None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if skip_sample is not None:
+            skip_sample = self.upsampler(skip_sample)
+        else:
+            skip_sample = 0
+
+        if self.resnet_up is not None:
+            skip_sample_states = self.skip_norm(hidden_states)
+            skip_sample_states = self.act(skip_sample_states)
+            skip_sample_states = self.skip_conv(skip_sample_states)
+
+            skip_sample = skip_sample + skip_sample_states
+
+            hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
+
+        return hidden_states, skip_sample
+
+
+class ResnetUpsampleBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        skip_time_act: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        up=True,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, temb, scale=scale)
+
+        return hidden_states
+
+
+class SimpleCrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.attention_head_dim = attention_head_dim
+
+        self.num_heads = out_channels // self.attention_head_dim
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=out_channels,
+                    cross_attention_dim=out_channels,
+                    heads=self.num_heads,
+                    dim_head=self.attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        up=True,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # resnet
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states
+
+
+class KUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int,
+        dropout: float = 0.0,
+        num_layers: int = 5,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+        resnet_group_size: Optional[int] = 32,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+        k_in_channels = 2 * out_channels
+        k_out_channels = in_channels
+        num_layers = num_layers - 1
+
+        for i in range(num_layers):
+            in_channels = k_in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=k_out_channels if (i == num_layers - 1) else out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=groups,
+                    groups_out=groups_out,
+                    dropout=dropout,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([KUpsample2D()])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        res_hidden_states_tuple = res_hidden_states_tuple[-1]
+        if res_hidden_states_tuple is not None:
+            hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1)
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class KCrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int,
+        dropout: float = 0.0,
+        num_layers: int = 4,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+        resnet_group_size: int = 32,
+        attention_head_dim: int = 1,  # attention dim_head
+        cross_attention_dim: int = 768,
+        add_upsample: bool = True,
+        upcast_attention: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        is_first_block = in_channels == out_channels == temb_channels
+        is_middle_block = in_channels != out_channels
+        add_self_attention = True if is_first_block else False
+
+        self.has_cross_attention = True
+        self.attention_head_dim = attention_head_dim
+
+        # in_channels, and out_channels for the block (k-unet)
+        k_in_channels = out_channels if is_first_block else 2 * out_channels
+        k_out_channels = in_channels
+
+        num_layers = num_layers - 1
+
+        for i in range(num_layers):
+            in_channels = k_in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            if is_middle_block and (i == num_layers - 1):
+                conv_2d_out_channels = k_out_channels
+            else:
+                conv_2d_out_channels = None
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    conv_2d_out_channels=conv_2d_out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=groups,
+                    groups_out=groups_out,
+                    dropout=dropout,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+            attentions.append(
+                KAttentionBlock(
+                    k_out_channels if (i == num_layers - 1) else out_channels,
+                    k_out_channels // attention_head_dim
+                    if (i == num_layers - 1)
+                    else out_channels // attention_head_dim,
+                    attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                    temb_channels=temb_channels,
+                    attention_bias=True,
+                    add_self_attention=add_self_attention,
+                    cross_attention_norm="layer_norm",
+                    upcast_attention=upcast_attention,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.attentions = nn.ModuleList(attentions)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([KUpsample2D()])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        res_hidden_states_tuple = res_hidden_states_tuple[-1]
+        if res_hidden_states_tuple is not None:
+            hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1)
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+# can potentially later be renamed to `No-feed-forward` attention
+class KAttentionBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Configure if the attention layers should contain a bias parameter.
+        upcast_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to upcast the attention computation to `float32`.
+        temb_channels (`int`, *optional*, defaults to 768):
+            The number of channels in the token embedding.
+        add_self_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to add self-attention to the block.
+        cross_attention_norm (`str`, *optional*, defaults to `None`):
+            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+        group_size (`int`, *optional*, defaults to 32):
+            The number of groups to separate the channels into for group normalization.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout: float = 0.0,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        upcast_attention: bool = False,
+        temb_channels: int = 768,  # for ada_group_norm
+        add_self_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+        group_size: int = 32,
+    ):
+        super().__init__()
+        self.add_self_attention = add_self_attention
+
+        # 1. Self-Attn
+        if add_self_attention:
+            self.norm1 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size))
+            self.attn1 = Attention(
+                query_dim=dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                cross_attention_dim=None,
+                cross_attention_norm=None,
+            )
+
+        # 2. Cross-Attn
+        self.norm2 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size))
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            upcast_attention=upcast_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+
+    def _to_3d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor:
+        return hidden_states.permute(0, 2, 3, 1).reshape(hidden_states.shape[0], height * weight, -1)
+
+    def _to_4d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor:
+        return hidden_states.permute(0, 2, 1).reshape(hidden_states.shape[0], -1, height, weight)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        # TODO: mark emb as non-optional (self.norm2 requires it).
+        #       requires assessing impact of change to positional param interface.
+        emb: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        # 1. Self-Attention
+        if self.add_self_attention:
+            norm_hidden_states = self.norm1(hidden_states, emb)
+
+            height, weight = norm_hidden_states.shape[2:]
+            norm_hidden_states = self._to_3d(norm_hidden_states, height, weight)
+
+            attn_output = self.attn1(
+                norm_hidden_states,
+                encoder_hidden_states=None,
+                attention_mask=attention_mask,
+                **cross_attention_kwargs,
+            )
+            attn_output = self._to_4d(attn_output, height, weight)
+
+            hidden_states = attn_output + hidden_states
+
+        # 2. Cross-Attention/None
+        norm_hidden_states = self.norm2(hidden_states, emb)
+
+        height, weight = norm_hidden_states.shape[2:]
+        norm_hidden_states = self._to_3d(norm_hidden_states, height, weight)
+        attn_output = self.attn2(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask if encoder_hidden_states is None else encoder_attention_mask,
+            **cross_attention_kwargs,
+        )
+        attn_output = self._to_4d(attn_output, height, weight)
+
+        hidden_states = attn_output + hidden_states
+
+        return hidden_states
diff --git a/src/unet_block_hacked_tryon.py b/src/unet_block_hacked_tryon.py
new file mode 100644
index 0000000000000000000000000000000000000000..d46728a7501764dbb710ba04c43e63e8b5b84146
--- /dev/null
+++ b/src/unet_block_hacked_tryon.py
@@ -0,0 +1,3522 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from diffusers.utils import is_torch_version, logging
+from diffusers.utils.torch_utils import apply_freeu
+from diffusers.models.activations import get_activation
+from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0
+from diffusers.models.dual_transformer_2d import DualTransformer2DModel
+from diffusers.models.normalization import AdaGroupNorm
+from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D
+from src.transformerhacked_tryon import Transformer2DModel
+from einops import rearrange
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def get_down_block(
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    downsample_padding: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    downsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+):
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        logger.warn(
+            f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+    if down_block_type == "DownBlock2D":
+        return DownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "ResnetDownsampleBlock2D":
+        return ResnetDownsampleBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+        )
+    elif down_block_type == "AttnDownBlock2D":
+        if add_downsample is False:
+            downsample_type = None
+        else:
+            downsample_type = downsample_type or "conv"  # default to 'conv'
+        return AttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            downsample_type=downsample_type,
+        )
+    elif down_block_type == "CrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
+        return CrossAttnDownBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
+        )
+    elif down_block_type == "SimpleCrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D")
+        return SimpleCrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+            only_cross_attention=only_cross_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+    elif down_block_type == "SkipDownBlock2D":
+        return SkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "AttnSkipDownBlock2D":
+        return AttnSkipDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "DownEncoderBlock2D":
+        return DownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "AttnDownEncoderBlock2D":
+        return AttnDownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "KDownBlock2D":
+        return KDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif down_block_type == "KCrossAttnDownBlock2D":
+        return KCrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            add_self_attention=True if not add_downsample else False,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(
+    up_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    prev_output_channel: int,
+    temb_channels: int,
+    add_upsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    resolution_idx: Optional[int] = None,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    upsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+) -> nn.Module:
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        logger.warn(
+            f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    if up_block_type == "UpBlock2D":
+        return UpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "ResnetUpsampleBlock2D":
+        return ResnetUpsampleBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+        )
+    elif up_block_type == "CrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
+        return CrossAttnUpBlock2D(
+            num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
+        )
+    elif up_block_type == "SimpleCrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D")
+        return SimpleCrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            skip_time_act=resnet_skip_time_act,
+            output_scale_factor=resnet_out_scale_factor,
+            only_cross_attention=only_cross_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+    elif up_block_type == "AttnUpBlock2D":
+        if add_upsample is False:
+            upsample_type = None
+        else:
+            upsample_type = upsample_type or "conv"  # default to 'conv'
+
+        return AttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            upsample_type=upsample_type,
+        )
+    elif up_block_type == "SkipUpBlock2D":
+        return SkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "AttnSkipUpBlock2D":
+        return AttnSkipUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "UpDecoderBlock2D":
+        return UpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temb_channels=temb_channels,
+        )
+    elif up_block_type == "AttnUpDecoderBlock2D":
+        return AttnUpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            attention_head_dim=attention_head_dim,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temb_channels=temb_channels,
+        )
+    elif up_block_type == "KUpBlock2D":
+        return KUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+        )
+    elif up_block_type == "KCrossAttnUpBlock2D":
+        return KCrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            cross_attention_dim=cross_attention_dim,
+            attention_head_dim=attention_head_dim,
+        )
+
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class AutoencoderTinyBlock(nn.Module):
+    """
+    Tiny Autoencoder block used in [`AutoencoderTiny`]. It is a mini residual module consisting of plain conv + ReLU
+    blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        out_channels (`int`): The number of output channels.
+        act_fn (`str`):
+            ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`.
+
+    Returns:
+        `torch.FloatTensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to
+        `out_channels`.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, act_fn: str):
+        super().__init__()
+        act_fn = get_activation(act_fn)
+        self.conv = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
+            act_fn,
+            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+            act_fn,
+            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+        )
+        self.skip = (
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+        self.fuse = nn.ReLU()
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        return self.fuse(self.conv(x) + self.skip(x))
+
+
+class UNetMidBlock2D(nn.Module):
+    """
+    A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        temb_channels (`int`): The number of temporal embedding channels.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
+        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
+        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
+        resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
+            The type of normalization to apply to the time embeddings. This can help to improve the performance of the
+            model on tasks with long-range temporal dependencies.
+        resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
+        resnet_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use in the group normalization layers of the resnet blocks.
+        attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
+        resnet_pre_norm (`bool`, *optional*, defaults to `True`):
+            Whether to use pre-normalization for the resnet blocks.
+        add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
+        attention_head_dim (`int`, *optional*, defaults to 1):
+            Dimension of a single attention head. The number of attention heads is determined based on this value and
+            the number of input channels.
+        output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
+
+    Returns:
+        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
+        in_channels, height, width)`.
+
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        attn_groups: Optional[int] = None,
+        resnet_pre_norm: bool = True,
+        add_attention: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+    ):
+        super().__init__()
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        self.add_attention = add_attention
+
+        if attn_groups is None:
+            attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
+            )
+            attention_head_dim = in_channels
+
+        for _ in range(num_layers):
+            if self.add_attention:
+                attentions.append(
+                    Attention(
+                        in_channels,
+                        heads=in_channels // attention_head_dim,
+                        dim_head=attention_head_dim,
+                        rescale_output_factor=output_scale_factor,
+                        eps=resnet_eps,
+                        norm_num_groups=attn_groups,
+                        spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                        residual_connection=True,
+                        bias=True,
+                        upcast_softmax=True,
+                        _from_deprecated_attn_block=True,
+                    )
+                )
+            else:
+                attentions.append(None)
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if attn is not None:
+                hidden_states = attn(hidden_states, temb=temb)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+class UNetMidBlock2DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for i in range(num_layers):
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        garment_features=None,
+        curr_garment_feat_idx=0,
+    ) -> torch.FloatTensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states,curr_garment_feat_idx = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                )
+                hidden_states=hidden_states[0]
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states,curr_garment_feat_idx = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                )
+                hidden_states=hidden_states[0]
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states,curr_garment_feat_idx
+
+
+class UNetMidBlock2DSimpleCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+
+        self.attention_head_dim = attention_head_dim
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        self.num_heads = in_channels // self.attention_head_dim
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                skip_time_act=skip_time_act,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=in_channels,
+                    cross_attention_dim=in_channels,
+                    heads=self.num_heads,
+                    dim_head=self.attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            # attn
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=mask,
+                **cross_attention_kwargs,
+            )
+
+            # resnet
+            hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states
+
+
+class AttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        downsample_type: str = "conv",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        self.downsample_type = downsample_type
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if downsample_type == "conv":
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        elif downsample_type == "resnet":
+            self.downsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        down=True,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            cross_attention_kwargs.update({"scale": lora_scale})
+            hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                if self.downsample_type == "resnet":
+                    hidden_states = downsampler(hidden_states, temb=temb, scale=lora_scale)
+                else:
+                    hidden_states = downsampler(hidden_states, scale=lora_scale)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        add_downsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        additional_residuals: Optional[torch.FloatTensor] = None,
+        garment_features=None,
+        curr_garment_feat_idx=0,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        blocks = list(zip(self.resnets, self.attentions))
+        # print("len(self.resnets)")
+        # print(len(self.resnets))
+        # print("len(self.attentions)")
+        # print(len(self.attentions))
+        for i, (resnet, attn) in enumerate(blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states,curr_garment_feat_idx = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                )
+                hidden_states=hidden_states[0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states,curr_garment_feat_idx = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                )
+                hidden_states=hidden_states[0]
+
+
+            # apply additional residuals to the output of the last pair of resnet and attention blocks
+            if i == len(blocks) - 1 and additional_residuals is not None:
+                hidden_states = hidden_states + additional_residuals
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale=lora_scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states,curr_garment_feat_idx
+
+
+class DownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale=scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class DownEncoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=None, scale=scale)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale)
+
+        return hidden_states
+
+
+class AttnDownEncoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb=None, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale)
+
+        return hidden_states
+
+
+class AttnSkipDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_downsample: bool = True,
+    ):
+        super().__init__()
+        self.attentions = nn.ModuleList([])
+        self.resnets = nn.ModuleList([])
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min(in_channels // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            self.attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=32,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        if add_downsample:
+            self.resnet_down = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                down=True,
+                kernel="fir",
+            )
+            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
+            self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+        else:
+            self.resnet_down = None
+            self.downsamplers = None
+            self.skip_conv = None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]:
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            hidden_states = self.resnet_down(hidden_states, temb, scale=scale)
+            for downsampler in self.downsamplers:
+                skip_sample = downsampler(skip_sample)
+
+            hidden_states = self.skip_conv(skip_sample) + hidden_states
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states, skip_sample
+
+
+class SkipDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+        self.resnets = nn.ModuleList([])
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min(in_channels // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        if add_downsample:
+            self.resnet_down = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                down=True,
+                kernel="fir",
+            )
+            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
+            self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+        else:
+            self.resnet_down = None
+            self.downsamplers = None
+            self.skip_conv = None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb, scale)
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            hidden_states = self.resnet_down(hidden_states, temb, scale)
+            for downsampler in self.downsamplers:
+                skip_sample = downsampler(skip_sample)
+
+            hidden_states = self.skip_conv(skip_sample) + hidden_states
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states, skip_sample
+
+
+class ResnetDownsampleBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        skip_time_act: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        down=True,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale)
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, temb, scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class SimpleCrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+
+        resnets = []
+        attentions = []
+
+        self.attention_head_dim = attention_head_dim
+        self.num_heads = out_channels // self.attention_head_dim
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=out_channels,
+                    cross_attention_dim=out_channels,
+                    heads=self.num_heads,
+                    dim_head=attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        down=True,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, temb, scale=lora_scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class KDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 4,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+        resnet_group_size: int = 32,
+        add_downsample: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    dropout=dropout,
+                    temb_channels=temb_channels,
+                    groups=groups,
+                    groups_out=groups_out,
+                    eps=resnet_eps,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            # YiYi's comments- might be able to use FirDownsample2D, look into details later
+            self.downsamplers = nn.ModuleList([KDownsample2D()])
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale)
+
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+        return hidden_states, output_states
+
+
+class KCrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        cross_attention_dim: int,
+        dropout: float = 0.0,
+        num_layers: int = 4,
+        resnet_group_size: int = 32,
+        add_downsample: bool = True,
+        attention_head_dim: int = 64,
+        add_self_attention: bool = False,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    dropout=dropout,
+                    temb_channels=temb_channels,
+                    groups=groups,
+                    groups_out=groups_out,
+                    eps=resnet_eps,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+            attentions.append(
+                KAttentionBlock(
+                    out_channels,
+                    out_channels // attention_head_dim,
+                    attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                    temb_channels=temb_channels,
+                    attention_bias=True,
+                    add_self_attention=add_self_attention,
+                    cross_attention_norm="layer_norm",
+                    group_size=resnet_group_size,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.attentions = nn.ModuleList(attentions)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList([KDownsample2D()])
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        output_states = ()
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+
+            if self.downsamplers is None:
+                output_states += (None,)
+            else:
+                output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+        return hidden_states, output_states
+
+
+class AttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        upsample_type: str = "conv",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.upsample_type = upsample_type
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if upsample_type == "conv":
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        elif upsample_type == "resnet":
+            self.upsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        up=True,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                if self.upsample_type == "resnet":
+                    hidden_states = upsampler(hidden_states, temb=temb, scale=scale)
+                else:
+                    hidden_states = upsampler(hidden_states, scale=scale)
+
+        return hidden_states
+
+
+class CrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        attention_type: str = "default",
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block[i],
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        garment_features=None,
+        curr_garment_feat_idx=0,
+    ) -> torch.FloatTensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            # print(hidden_states.shape)
+            # print(encoder_hidden_states.shape)
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states,curr_garment_feat_idx = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                )
+                hidden_states=hidden_states[0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states,curr_garment_feat_idx = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                )
+                hidden_states=hidden_states[0]
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
+
+
+
+        return hidden_states,curr_garment_feat_idx
+
+
+class UpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
+
+        return hidden_states
+
+
+
+class UpDecoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        temb_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> torch.FloatTensor:
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states, temb=temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class AttnUpDecoderBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        temb_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            attentions.append(
+                Attention(
+                    out_channels,
+                    heads=out_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    rescale_output_factor=output_scale_factor,
+                    eps=resnet_eps,
+                    norm_num_groups=resnet_groups if resnet_time_scale_shift != "spatial" else None,
+                    spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                    residual_connection=True,
+                    bias=True,
+                    upcast_softmax=True,
+                    _from_deprecated_attn_block=True,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> torch.FloatTensor:
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb=temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, temb=temb, **cross_attention_kwargs)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, scale=scale)
+
+        return hidden_states
+
+
+class AttnSkipUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        self.attentions = nn.ModuleList([])
+        self.resnets = nn.ModuleList([])
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min(resnet_in_channels + res_skip_channels // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        if attention_head_dim is None:
+            logger.warn(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}."
+            )
+            attention_head_dim = out_channels
+
+        self.attentions.append(
+            Attention(
+                out_channels,
+                heads=out_channels // attention_head_dim,
+                dim_head=attention_head_dim,
+                rescale_output_factor=output_scale_factor,
+                eps=resnet_eps,
+                norm_num_groups=32,
+                residual_connection=True,
+                bias=True,
+                upcast_softmax=True,
+                _from_deprecated_attn_block=True,
+            )
+        )
+
+        self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
+        if add_upsample:
+            self.resnet_up = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                groups_out=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                up=True,
+                kernel="fir",
+            )
+            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+            self.skip_norm = torch.nn.GroupNorm(
+                num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
+            )
+            self.act = nn.SiLU()
+        else:
+            self.resnet_up = None
+            self.skip_conv = None
+            self.skip_norm = None
+            self.act = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample=None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        cross_attention_kwargs = {"scale": scale}
+        hidden_states = self.attentions[0](hidden_states, **cross_attention_kwargs)
+
+        if skip_sample is not None:
+            skip_sample = self.upsampler(skip_sample)
+        else:
+            skip_sample = 0
+
+        if self.resnet_up is not None:
+            skip_sample_states = self.skip_norm(hidden_states)
+            skip_sample_states = self.act(skip_sample_states)
+            skip_sample_states = self.skip_conv(skip_sample_states)
+
+            skip_sample = skip_sample + skip_sample_states
+
+            hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
+
+        return hidden_states, skip_sample
+
+
+class SkipUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = np.sqrt(2.0),
+        add_upsample: bool = True,
+        upsample_padding: int = 1,
+    ):
+        super().__init__()
+        self.resnets = nn.ModuleList([])
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            self.resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
+                    groups_out=min(out_channels // 4, 32),
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
+        if add_upsample:
+            self.resnet_up = ResnetBlock2D(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=min(out_channels // 4, 32),
+                groups_out=min(out_channels // 4, 32),
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_in_shortcut=True,
+                up=True,
+                kernel="fir",
+            )
+            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+            self.skip_norm = torch.nn.GroupNorm(
+                num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
+            )
+            self.act = nn.SiLU()
+        else:
+            self.resnet_up = None
+            self.skip_conv = None
+            self.skip_norm = None
+            self.act = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        skip_sample=None,
+        scale: float = 1.0,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if skip_sample is not None:
+            skip_sample = self.upsampler(skip_sample)
+        else:
+            skip_sample = 0
+
+        if self.resnet_up is not None:
+            skip_sample_states = self.skip_norm(hidden_states)
+            skip_sample_states = self.act(skip_sample_states)
+            skip_sample_states = self.skip_conv(skip_sample_states)
+
+            skip_sample = skip_sample + skip_sample_states
+
+            hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
+
+        return hidden_states, skip_sample
+
+
+class ResnetUpsampleBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        skip_time_act: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        up=True,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, temb, scale=scale)
+
+        return hidden_states
+
+
+class SimpleCrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attention_head_dim: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.has_cross_attention = True
+        self.attention_head_dim = attention_head_dim
+
+        self.num_heads = out_channels // self.attention_head_dim
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    skip_time_act=skip_time_act,
+                )
+            )
+
+            processor = (
+                AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor()
+            )
+
+            attentions.append(
+                Attention(
+                    query_dim=out_channels,
+                    cross_attention_dim=out_channels,
+                    heads=self.num_heads,
+                    dim_head=self.attention_head_dim,
+                    added_kv_proj_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    bias=True,
+                    upcast_softmax=True,
+                    only_cross_attention=only_cross_attention,
+                    cross_attention_norm=cross_attention_norm,
+                    processor=processor,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [
+                    ResnetBlock2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                        skip_time_act=skip_time_act,
+                        up=True,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+        if attention_mask is None:
+            # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
+            mask = None if encoder_hidden_states is None else encoder_attention_mask
+        else:
+            # when attention_mask is defined: we don't even check for encoder_attention_mask.
+            # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
+            # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
+            #       then we can simplify this whole if/else block to:
+            #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
+            mask = attention_mask
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # resnet
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=mask,
+                    **cross_attention_kwargs,
+                )
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, temb, scale=lora_scale)
+
+        return hidden_states
+
+
+class KUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int,
+        dropout: float = 0.0,
+        num_layers: int = 5,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+        resnet_group_size: Optional[int] = 32,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        resnets = []
+        k_in_channels = 2 * out_channels
+        k_out_channels = in_channels
+        num_layers = num_layers - 1
+
+        for i in range(num_layers):
+            in_channels = k_in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=k_out_channels if (i == num_layers - 1) else out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=groups,
+                    groups_out=groups_out,
+                    dropout=dropout,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([KUpsample2D()])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        res_hidden_states_tuple = res_hidden_states_tuple[-1]
+        if res_hidden_states_tuple is not None:
+            hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1)
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class KCrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int,
+        dropout: float = 0.0,
+        num_layers: int = 4,
+        resnet_eps: float = 1e-5,
+        resnet_act_fn: str = "gelu",
+        resnet_group_size: int = 32,
+        attention_head_dim: int = 1,  # attention dim_head
+        cross_attention_dim: int = 768,
+        add_upsample: bool = True,
+        upcast_attention: bool = False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        is_first_block = in_channels == out_channels == temb_channels
+        is_middle_block = in_channels != out_channels
+        add_self_attention = True if is_first_block else False
+
+        self.has_cross_attention = True
+        self.attention_head_dim = attention_head_dim
+
+        # in_channels, and out_channels for the block (k-unet)
+        k_in_channels = out_channels if is_first_block else 2 * out_channels
+        k_out_channels = in_channels
+
+        num_layers = num_layers - 1
+
+        for i in range(num_layers):
+            in_channels = k_in_channels if i == 0 else out_channels
+            groups = in_channels // resnet_group_size
+            groups_out = out_channels // resnet_group_size
+
+            if is_middle_block and (i == num_layers - 1):
+                conv_2d_out_channels = k_out_channels
+            else:
+                conv_2d_out_channels = None
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    conv_2d_out_channels=conv_2d_out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=groups,
+                    groups_out=groups_out,
+                    dropout=dropout,
+                    non_linearity=resnet_act_fn,
+                    time_embedding_norm="ada_group",
+                    conv_shortcut_bias=False,
+                )
+            )
+            attentions.append(
+                KAttentionBlock(
+                    k_out_channels if (i == num_layers - 1) else out_channels,
+                    k_out_channels // attention_head_dim
+                    if (i == num_layers - 1)
+                    else out_channels // attention_head_dim,
+                    attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                    temb_channels=temb_channels,
+                    attention_bias=True,
+                    add_self_attention=add_self_attention,
+                    cross_attention_norm="layer_norm",
+                    upcast_attention=upcast_attention,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+        self.attentions = nn.ModuleList(attentions)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([KUpsample2D()])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        res_hidden_states_tuple = res_hidden_states_tuple[-1]
+        if res_hidden_states_tuple is not None:
+            hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1)
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    emb=temb,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+# can potentially later be renamed to `No-feed-forward` attention
+class KAttentionBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Configure if the attention layers should contain a bias parameter.
+        upcast_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to upcast the attention computation to `float32`.
+        temb_channels (`int`, *optional*, defaults to 768):
+            The number of channels in the token embedding.
+        add_self_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to add self-attention to the block.
+        cross_attention_norm (`str`, *optional*, defaults to `None`):
+            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+        group_size (`int`, *optional*, defaults to 32):
+            The number of groups to separate the channels into for group normalization.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout: float = 0.0,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        upcast_attention: bool = False,
+        temb_channels: int = 768,  # for ada_group_norm
+        add_self_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
+        group_size: int = 32,
+    ):
+        super().__init__()
+        self.add_self_attention = add_self_attention
+
+        # 1. Self-Attn
+        if add_self_attention:
+            self.norm1 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size))
+            self.attn1 = Attention(
+                query_dim=dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                cross_attention_dim=None,
+                cross_attention_norm=None,
+            )
+
+        # 2. Cross-Attn
+        self.norm2 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size))
+        self.attn2 = Attention(
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            upcast_attention=upcast_attention,
+            cross_attention_norm=cross_attention_norm,
+        )
+
+    def _to_3d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor:
+        return hidden_states.permute(0, 2, 3, 1).reshape(hidden_states.shape[0], height * weight, -1)
+
+    def _to_4d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor:
+        return hidden_states.permute(0, 2, 1).reshape(hidden_states.shape[0], -1, height, weight)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        # TODO: mark emb as non-optional (self.norm2 requires it).
+        #       requires assessing impact of change to positional param interface.
+        emb: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        # 1. Self-Attention
+        if self.add_self_attention:
+            norm_hidden_states = self.norm1(hidden_states, emb)
+
+            height, weight = norm_hidden_states.shape[2:]
+            norm_hidden_states = self._to_3d(norm_hidden_states, height, weight)
+
+            attn_output = self.attn1(
+                norm_hidden_states,
+                encoder_hidden_states=None,
+                attention_mask=attention_mask,
+                **cross_attention_kwargs,
+            )
+            attn_output = self._to_4d(attn_output, height, weight)
+
+            hidden_states = attn_output + hidden_states
+
+        # 2. Cross-Attention/None
+        norm_hidden_states = self.norm2(hidden_states, emb)
+
+        height, weight = norm_hidden_states.shape[2:]
+        norm_hidden_states = self._to_3d(norm_hidden_states, height, weight)
+        attn_output = self.attn2(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask if encoder_hidden_states is None else encoder_attention_mask,
+            **cross_attention_kwargs,
+        )
+        attn_output = self._to_4d(attn_output, height, weight)
+
+        hidden_states = attn_output + hidden_states
+
+        return hidden_states
diff --git a/src/unet_hacked_garmnet.py b/src/unet_hacked_garmnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..927f49cc5f0cc924c3dcdfbae35df97f62479279
--- /dev/null
+++ b/src/unet_hacked_garmnet.py
@@ -0,0 +1,1284 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import UNet2DConditionLoadersMixin
+from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.models.activations import get_activation
+from diffusers.models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from einops import rearrange
+
+from diffusers.models.embeddings import (
+    GaussianFourierProjection,
+    ImageHintTimeEmbedding,
+    ImageProjection,
+    ImageTimeEmbedding,
+    PositionNet,
+    TextImageProjection,
+    TextImageTimeEmbedding,
+    TextTimeEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
+from diffusers.models.modeling_utils import ModelMixin
+from src.unet_block_hacked_garmnet import (
+    UNetMidBlock2D,
+    UNetMidBlock2DCrossAttn,
+    UNetMidBlock2DSimpleCrossAttn,
+    get_down_block,
+    get_up_block,
+)
+from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D
+from diffusers.models.transformer_2d import Transformer2DModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def zero_module(module):
+    for p in module.parameters():
+        nn.init.zeros_(p)
+    return module
+
+@dataclass
+class UNet2DConditionOutput(BaseOutput):
+    """
+    The output of [`UNet2DConditionModel`].
+
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+
+    sample: torch.FloatTensor = None
+
+
+class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
+            `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
+        only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
+            Whether to include self-attention in the basic transformer blocks, see
+            [`~models.attention.BasicTransformerBlock`].
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers is skipped in post-processing.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+       reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
+            blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*):
+            The number of attention heads. If not defined, defaults to `attention_head_dim`
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
+            Dimension for the timestep embeddings.
+        num_class_embeds (`int`, *optional*, defaults to `None`):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        time_embedding_type (`str`, *optional*, defaults to `positional`):
+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+        time_embedding_dim (`int`, *optional*, defaults to `None`):
+            An optional override for the dimension of the projected time embedding.
+        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
+            Optional activation function to use only once on the time embeddings before they are passed to the rest of
+            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+        timestep_post_act (`str`, *optional*, defaults to `None`):
+            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `cond_proj` layer in the timestep embedding.
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`,
+        *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`,
+        *optional*): The dimension of the `class_labels` input when
+            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
+        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+            embeddings with the class embeddings.
+        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
+            Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
+            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
+            `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
+            otherwise.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        dropout: float = 0.0,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: Union[int, Tuple[int]] = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+        reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        resnet_skip_time_act: bool = False,
+        resnet_out_scale_factor: int = 1.0,
+        time_embedding_type: str = "positional",
+        time_embedding_dim: Optional[int] = None,
+        time_embedding_act_fn: Optional[str] = None,
+        timestep_post_act: Optional[str] = None,
+        time_cond_proj_dim: Optional[int] = None,
+        conv_in_kernel: int = 3,
+        conv_out_kernel: int = 3,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        attention_type: str = "default",
+        class_embeddings_concat: bool = False,
+        mid_block_only_cross_attention: Optional[bool] = None,
+        cross_attention_norm: Optional[str] = None,
+        addition_embed_type_num_heads=64,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        if num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+        if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+            for layer_number_per_block in transformer_layers_per_block:
+                if isinstance(layer_number_per_block, list):
+                    raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
+
+        # input
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        if time_embedding_type == "fourier":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
+            if time_embed_dim % 2 != 0:
+                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
+            self.time_proj = GaussianFourierProjection(
+                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = time_embed_dim
+        elif time_embedding_type == "positional":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            raise ValueError(
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
+            )
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+            post_act_fn=timestep_post_act,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2
+            self.encoder_hid_proj = ImageProjection(
+                image_embed_dim=encoder_hid_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None
+
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif class_embed_type == "simple_projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif addition_embed_type == "image":
+            # Kandinsky 2.2
+            self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type == "image_hint":
+            # Kandinsky 2.2 ControlNet
+            self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
+
+        if time_embedding_act_fn is None:
+            self.time_embed_act = None
+        else:
+            self.time_embed_act = get_activation(time_embedding_act_fn)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(only_cross_attention, bool):
+            if mid_block_only_cross_attention is None:
+                mid_block_only_cross_attention = only_cross_attention
+
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
+        if mid_block_only_cross_attention is None:
+            mid_block_only_cross_attention = False
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+        if class_embeddings_concat:
+            # The time embeddings are concatenated with the class embeddings. The dimension of the
+            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
+            # regular time embeddings
+            blocks_time_embed_dim = time_embed_dim * 2
+        else:
+            blocks_time_embed_dim = time_embed_dim
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block[i],
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim[i],
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        if mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim[-1],
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+                attention_type=attention_type,
+            )
+        elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
+            self.mid_block = UNetMidBlock2DSimpleCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim[-1],
+                attention_head_dim=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                skip_time_act=resnet_skip_time_act,
+                only_cross_attention=mid_block_only_cross_attention,
+                cross_attention_norm=cross_attention_norm,
+            )
+        elif mid_block_type == "UNetMidBlock2D":
+            self.mid_block = UNetMidBlock2D(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                num_layers=0,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                add_attention=False,
+            )
+        elif mid_block_type is None:
+            self.mid_block = None
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        reversed_transformer_layers_per_block = (
+            list(reversed(transformer_layers_per_block))
+            if reverse_transformer_layers_per_block is None
+            else reverse_transformer_layers_per_block
+        )
+        only_cross_attention = list(reversed(only_cross_attention))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=reversed_layers_per_block[i] + 1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resolution_idx=i,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=reversed_cross_attention_dim[i],
+                num_attention_heads=reversed_num_attention_heads[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+
+
+
+        # encode_output_chs = [
+        #     # 320,
+        #     # 320,
+        #     # 320,
+        #     1280, 
+        #     1280, 
+        #     1280, 
+        #     1280,
+        #     640,
+        #     640
+        # ]
+
+        # encode_output_chs2 = [
+        #     # 320,
+        #     # 320,
+        #     # 320,
+        #     1280, 
+        #     1280,
+        #     640, 
+        #     640, 
+        #     640,
+        #     320
+        # ]
+
+        # encode_num_head_chs3 = [
+        #     # 5,
+        #     # 5,
+        #     # 10,
+        #     20,
+        #     20, 
+        #     20,
+        #     10,
+        #     10, 
+        #     10 
+        # ]
+
+
+        # encode_num_layers_chs4 = [
+        #     # 1,
+        #     # 1,
+        #     # 2,
+        #     10,
+        #     10, 
+        #     10,
+        #     2,
+        #     2, 
+        #     2 
+        # ]
+
+
+        # self.warp_blks = nn.ModuleList([])
+        # self.warp_zeros = nn.ModuleList([])
+
+        # for in_ch, cont_ch,num_head,num_layers in zip(encode_output_chs, encode_output_chs2,encode_num_head_chs3,encode_num_layers_chs4):
+        #     # dim_head = in_ch // self.num_heads
+        #     # dim_head = dim_head // dim_head_denorm
+
+        #     self.warp_blks.append(Transformer2DModel(
+        #     num_attention_heads=num_head,
+        #     attention_head_dim=64,
+        #     in_channels=in_ch,
+        #     num_layers = num_layers,
+        #     cross_attention_dim = cont_ch,
+        #     ))
+            
+        #     self.warp_zeros.append(zero_module(nn.Conv2d(in_ch, in_ch, 1, padding=0)))
+
+
+
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+
+            self.conv_act = get_activation(act_fn)
+
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+
+        if attention_type in ["gated", "gated-text-image"]:
+            positive_len = 768
+            if isinstance(cross_attention_dim, int):
+                positive_len = cross_attention_dim
+            elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
+                positive_len = cross_attention_dim[0]
+
+            feature_type = "text-only" if attention_type == "gated" else "text-image"
+            self.position_net = PositionNet(
+                positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
+            )
+
+
+
+
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor, _remove_lora=_remove_lora)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor, _remove_lora=True)
+
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        The [`UNet2DConditionModel`] forward method.
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.FloatTensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added to UNet long skip connections from down blocks to up blocks for
+                example from ControlNet side model(s)
+            mid_block_additional_residual (`torch.Tensor`, *optional*):
+                additional residual to be added to UNet mid block output, for example from ControlNet side model
+            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
+
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
+                a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        for dim in sample.shape[-2:]:
+            if dim % default_overall_up_factor != 0:
+                # Forward upsample size to force interpolation output size.
+                forward_upsample_size = True
+                break
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+
+        if self.config.addition_embed_type == "text":
+            aug_emb = self.add_embedding(encoder_hidden_states)
+        elif self.config.addition_embed_type == "text_image":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+            aug_emb = self.add_embedding(text_embs, image_embs)
+        elif self.config.addition_embed_type == "text_time":
+            # SDXL - style
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            time_embeds = self.add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(emb.dtype)
+            aug_emb = self.add_embedding(add_embeds)
+        elif self.config.addition_embed_type == "image":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            aug_emb = self.add_embedding(image_embs)
+        elif self.config.addition_embed_type == "image_hint":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            hint = added_cond_kwargs.get("hint")
+            aug_emb, hint = self.add_embedding(image_embs, hint)
+            sample = torch.cat([sample, hint], dim=1)
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kadinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype)
+            encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+        garment_features=[]
+
+        # 2.5 GLIGEN position net
+        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            gligen_args = cross_attention_kwargs.pop("gligen")
+            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+
+
+        # 3. down
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+
+        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+        # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+        is_adapter = down_intrablock_additional_residuals is not None
+        # maintain backward compatibility for legacy usage, where
+        #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+        #       but can only use one or the other
+        if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+            deprecate(
+                "T2I should not use down_block_additional_residuals",
+                "1.3.0",
+                "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                       and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                       for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                standard_warn=False,
+            )
+            down_intrablock_additional_residuals = down_block_additional_residuals
+            is_adapter = True
+
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                # For t2i-adapter CrossAttnDownBlock2D
+                additional_residuals = {}
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+
+                sample, res_samples,out_garment_feat = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    **additional_residuals,
+                )
+                garment_features += out_garment_feat
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    sample += down_intrablock_additional_residuals.pop(0)
+
+            down_block_res_samples += res_samples
+        
+
+        if is_controlnet:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample,out_garment_feat = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+                garment_features += out_garment_feat
+
+            else:
+                sample = self.mid_block(sample, emb)
+
+            # To support T2I-Adapter-XL
+            if (
+                is_adapter
+                and len(down_intrablock_additional_residuals) > 0
+                and sample.shape == down_intrablock_additional_residuals[0].shape
+            ):
+                sample += down_intrablock_additional_residuals.pop(0)
+
+        if is_controlnet:
+            sample = sample + mid_block_additional_residual
+
+
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample,out_garment_feat = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+                garment_features += out_garment_feat
+
+
+        if not return_dict:
+            return (sample,),garment_features
+
+        return UNet2DConditionOutput(sample=sample),garment_features
diff --git a/src/unet_hacked_tryon.py b/src/unet_hacked_tryon.py
new file mode 100644
index 0000000000000000000000000000000000000000..596dd1dcc06066ab1bd36ab04560eddbf76b28d9
--- /dev/null
+++ b/src/unet_hacked_tryon.py
@@ -0,0 +1,1395 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import UNet2DConditionLoadersMixin
+from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.models.activations import get_activation
+from diffusers.models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from einops import rearrange
+
+from diffusers.models.embeddings import (
+    GaussianFourierProjection,
+    ImageHintTimeEmbedding,
+    ImageProjection,
+    ImageTimeEmbedding,
+    PositionNet,
+    TextImageProjection,
+    TextImageTimeEmbedding,
+    TextTimeEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
+
+
+from diffusers.models.modeling_utils import ModelMixin
+from src.unet_block_hacked_tryon import (
+    UNetMidBlock2D,
+    UNetMidBlock2DCrossAttn,
+    UNetMidBlock2DSimpleCrossAttn,
+    get_down_block,
+    get_up_block,
+)
+from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D
+from diffusers.models.transformer_2d import Transformer2DModel
+import math
+
+from ip_adapter.ip_adapter import Resampler
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# def FeedForward(dim, mult=4):
+#     inner_dim = int(dim * mult)
+#     return nn.Sequential(
+#         nn.LayerNorm(dim),
+#         nn.Linear(dim, inner_dim, bias=False),
+#         nn.GELU(),
+#         nn.Linear(inner_dim, dim, bias=False),
+#     )
+
+
+
+# def reshape_tensor(x, heads):
+#     bs, length, width = x.shape
+#     # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+#     x = x.view(bs, length, heads, -1)
+#     # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+#     x = x.transpose(1, 2)
+#     # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+#     x = x.reshape(bs, heads, length, -1)
+#     return x
+
+
+# class PerceiverAttention(nn.Module):
+#     def __init__(self, *, dim, dim_head=64, heads=8):
+#         super().__init__()
+#         self.scale = dim_head**-0.5
+#         self.dim_head = dim_head
+#         self.heads = heads
+#         inner_dim = dim_head * heads
+
+#         self.norm1 = nn.LayerNorm(dim)
+#         self.norm2 = nn.LayerNorm(dim)
+
+#         self.to_q = nn.Linear(dim, inner_dim, bias=False)
+#         self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+#         self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+#     def forward(self, x, latents):
+#         """
+#         Args:
+#             x (torch.Tensor): image features
+#                 shape (b, n1, D)
+#             latent (torch.Tensor): latent features
+#                 shape (b, n2, D)
+#         """
+#         x = self.norm1(x)
+#         latents = self.norm2(latents)
+
+#         b, l, _ = latents.shape
+
+#         q = self.to_q(latents)
+#         kv_input = torch.cat((x, latents), dim=-2)
+#         k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+#         q = reshape_tensor(q, self.heads)
+#         k = reshape_tensor(k, self.heads)
+#         v = reshape_tensor(v, self.heads)
+
+#         # attention
+#         scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+#         weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+#         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+#         out = weight @ v
+
+#         out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+#         return self.to_out(out)
+
+
+# class Resampler(nn.Module):
+#     def __init__(
+#         self,
+#         dim=1024,
+#         depth=8,
+#         dim_head=64,
+#         heads=16,
+#         num_queries=8,
+#         embedding_dim=768,
+#         output_dim=1024,
+#         ff_mult=4,
+#         max_seq_len: int = 257,  # CLIP tokens + CLS token
+#         apply_pos_emb: bool = False,
+#         num_latents_mean_pooled: int = 0,  # number of latents derived from mean pooled representation of the sequence
+#     ):
+#         super().__init__()
+
+#         self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+
+#         self.proj_in = nn.Linear(embedding_dim, dim)
+
+#         self.proj_out = nn.Linear(dim, output_dim)
+#         self.norm_out = nn.LayerNorm(output_dim)
+
+#         self.layers = nn.ModuleList([])
+#         for _ in range(depth):
+#             self.layers.append(
+#                 nn.ModuleList(
+#                     [
+#                         PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+#                         FeedForward(dim=dim, mult=ff_mult),
+#                     ]
+#                 )
+#             )
+
+#     def forward(self, x):
+
+#         latents = self.latents.repeat(x.size(0), 1, 1)
+
+#         x = self.proj_in(x)
+
+
+#         for attn, ff in self.layers:
+#             latents = attn(x, latents) + latents
+#             latents = ff(latents) + latents
+
+#         latents = self.proj_out(latents)
+#         return self.norm_out(latents)
+
+
+def zero_module(module):
+    for p in module.parameters():
+        nn.init.zeros_(p)
+    return module
+
+@dataclass
+class UNet2DConditionOutput(BaseOutput):
+    """
+    The output of [`UNet2DConditionModel`].
+
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+
+    sample: torch.FloatTensor = None
+
+
+class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+    shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
+            `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+            The tuple of upsample blocks to use.
+        only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
+            Whether to include self-attention in the basic transformer blocks, see
+            [`~models.attention.BasicTransformerBlock`].
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers is skipped in post-processing.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+       reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
+            blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*):
+            The number of attention heads. If not defined, defaults to `attention_head_dim`
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
+            Dimension for the timestep embeddings.
+        num_class_embeds (`int`, *optional*, defaults to `None`):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        time_embedding_type (`str`, *optional*, defaults to `positional`):
+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+        time_embedding_dim (`int`, *optional*, defaults to `None`):
+            An optional override for the dimension of the projected time embedding.
+        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
+            Optional activation function to use only once on the time embeddings before they are passed to the rest of
+            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+        timestep_post_act (`str`, *optional*, defaults to `None`):
+            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `cond_proj` layer in the timestep embedding.
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`,
+        *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`,
+        *optional*): The dimension of the `class_labels` input when
+            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
+        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+            embeddings with the class embeddings.
+        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
+            Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
+            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
+            `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
+            otherwise.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        dropout: float = 0.0,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: Union[int, Tuple[int]] = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+        reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        resnet_skip_time_act: bool = False,
+        resnet_out_scale_factor: int = 1.0,
+        time_embedding_type: str = "positional",
+        time_embedding_dim: Optional[int] = None,
+        time_embedding_act_fn: Optional[str] = None,
+        timestep_post_act: Optional[str] = None,
+        time_cond_proj_dim: Optional[int] = None,
+        conv_in_kernel: int = 3,
+        conv_out_kernel: int = 3,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        attention_type: str = "default",
+        class_embeddings_concat: bool = False,
+        mid_block_only_cross_attention: Optional[bool] = None,
+        cross_attention_norm: Optional[str] = None,
+        addition_embed_type_num_heads=64,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        if num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+        if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+            for layer_number_per_block in transformer_layers_per_block:
+                if isinstance(layer_number_per_block, list):
+                    raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
+
+        # input
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        if time_embedding_type == "fourier":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
+            if time_embed_dim % 2 != 0:
+                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
+            self.time_proj = GaussianFourierProjection(
+                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = time_embed_dim
+        elif time_embedding_type == "positional":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            raise ValueError(
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
+            )
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+            post_act_fn=timestep_post_act,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2
+            self.encoder_hid_proj = ImageProjection(
+                image_embed_dim=encoder_hid_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type == "ip_image_proj":
+            # Kandinsky 2.2
+            self.encoder_hid_proj = Resampler(
+                dim=1280,
+                depth=4,
+                dim_head=64,
+                heads=20,
+                num_queries=16,
+                embedding_dim=encoder_hid_dim,
+                output_dim=self.config.cross_attention_dim,
+                ff_mult=4,
+            )
+                                    
+            
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None
+
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif class_embed_type == "simple_projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif addition_embed_type == "image":
+            # Kandinsky 2.2
+            self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type == "image_hint":
+            # Kandinsky 2.2 ControlNet
+            self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
+
+        if time_embedding_act_fn is None:
+            self.time_embed_act = None
+        else:
+            self.time_embed_act = get_activation(time_embedding_act_fn)
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        if isinstance(only_cross_attention, bool):
+            if mid_block_only_cross_attention is None:
+                mid_block_only_cross_attention = only_cross_attention
+
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
+        if mid_block_only_cross_attention is None:
+            mid_block_only_cross_attention = False
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+        if class_embeddings_concat:
+            # The time embeddings are concatenated with the class embeddings. The dimension of the
+            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
+            # regular time embeddings
+            blocks_time_embed_dim = time_embed_dim * 2
+        else:
+            blocks_time_embed_dim = time_embed_dim
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block[i],
+                transformer_layers_per_block=transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim[i],
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        if mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim[-1],
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+                attention_type=attention_type,
+            )
+        elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
+            self.mid_block = UNetMidBlock2DSimpleCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim[-1],
+                attention_head_dim=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                skip_time_act=resnet_skip_time_act,
+                only_cross_attention=mid_block_only_cross_attention,
+                cross_attention_norm=cross_attention_norm,
+            )
+        elif mid_block_type == "UNetMidBlock2D":
+            self.mid_block = UNetMidBlock2D(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                num_layers=0,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                add_attention=False,
+            )
+        elif mid_block_type is None:
+            self.mid_block = None
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        reversed_transformer_layers_per_block = (
+            list(reversed(transformer_layers_per_block))
+            if reverse_transformer_layers_per_block is None
+            else reverse_transformer_layers_per_block
+        )
+        only_cross_attention = list(reversed(only_cross_attention))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=reversed_layers_per_block[i] + 1,
+                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resolution_idx=i,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=reversed_cross_attention_dim[i],
+                num_attention_heads=reversed_num_attention_heads[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
+            )
+
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+
+
+
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+
+            self.conv_act = get_activation(act_fn)
+
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+
+        if attention_type in ["gated", "gated-text-image"]:
+            positive_len = 768
+            if isinstance(cross_attention_dim, int):
+                positive_len = cross_attention_dim
+            elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
+                positive_len = cross_attention_dim[0]
+
+            feature_type = "text-only" if attention_type == "gated" else "text-image"
+            self.position_net = PositionNet(
+                positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
+            )
+
+
+
+        from ip_adapter.attention_processor import IPAttnProcessor2_0 as IPAttnProcessor, AttnProcessor2_0 as AttnProcessor
+
+        attn_procs = {}
+        for name in self.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else self.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = self.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = self.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor()
+            else:
+                layer_name = name.split(".processor")[0]
+                attn_procs[name] = IPAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, num_tokens=16)
+        self.set_attn_processor(attn_procs)
+
+
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor, _remove_lora=_remove_lora)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor, _remove_lora=True)
+
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        garment_features: Optional[Tuple[torch.Tensor]] = None,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        The [`UNet2DConditionModel`] forward method.
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
+            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.FloatTensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added to UNet long skip connections from down blocks to up blocks for
+                example from ControlNet side model(s)
+            mid_block_additional_residual (`torch.Tensor`, *optional*):
+                additional residual to be added to UNet mid block output, for example from ControlNet side model
+            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
+
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
+                a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        for dim in sample.shape[-2:]:
+            if dim % default_overall_up_factor != 0:
+                # Forward upsample size to force interpolation output size.
+                forward_upsample_size = True
+                break
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+
+        if self.config.addition_embed_type == "text":
+            aug_emb = self.add_embedding(encoder_hidden_states)
+        elif self.config.addition_embed_type == "text_image":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+            aug_emb = self.add_embedding(text_embs, image_embs)
+        elif self.config.addition_embed_type == "text_time":
+            # SDXL - style
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            time_embeds = self.add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(emb.dtype)
+            aug_emb = self.add_embedding(add_embeds)
+        elif self.config.addition_embed_type == "image":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            aug_emb = self.add_embedding(image_embs)
+        elif self.config.addition_embed_type == "image_hint":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            hint = added_cond_kwargs.get("hint")
+            aug_emb, hint = self.add_embedding(image_embs, hint)
+            sample = torch.cat([sample, hint], dim=1)
+
+        emb = emb + aug_emb if aug_emb is not None else emb
+
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kadinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            # print(image_embeds.shape)
+            image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype)
+            encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 2.5 GLIGEN position net
+        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            gligen_args = cross_attention_kwargs.pop("gligen")
+            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+
+
+        curr_garment_feat_idx = 0
+
+
+        # 3. down
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+
+        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+        # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+        is_adapter = down_intrablock_additional_residuals is not None
+        # maintain backward compatibility for legacy usage, where
+        #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+        #       but can only use one or the other
+        if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+            deprecate(
+                "T2I should not use down_block_additional_residuals",
+                "1.3.0",
+                "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                       and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                       for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                standard_warn=False,
+            )
+            down_intrablock_additional_residuals = down_block_additional_residuals
+            is_adapter = True
+
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                # For t2i-adapter CrossAttnDownBlock2D
+                additional_residuals = {}
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+
+                sample, res_samples,curr_garment_feat_idx = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                    **additional_residuals,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    sample += down_intrablock_additional_residuals.pop(0)
+
+            down_block_res_samples += res_samples
+
+
+        if is_controlnet:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample ,curr_garment_feat_idx= self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+
+            # To support T2I-Adapter-XL
+            if (
+                is_adapter
+                and len(down_intrablock_additional_residuals) > 0
+                and sample.shape == down_intrablock_additional_residuals[0].shape
+            ):
+                sample += down_intrablock_additional_residuals.pop(0)
+
+        if is_controlnet:
+            sample = sample + mid_block_additional_residual
+
+
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample ,curr_garment_feat_idx= upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    garment_features=garment_features,
+                    curr_garment_feat_idx=curr_garment_feat_idx,
+                )
+
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    scale=lora_scale,
+                )
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet2DConditionOutput(sample=sample)
diff --git a/utils_mask.py b/utils_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f55d52da8026a99c2a83a680f2bdaeda484360d
--- /dev/null
+++ b/utils_mask.py
@@ -0,0 +1,167 @@
+import numpy as np
+import cv2
+from PIL import Image, ImageDraw
+
+label_map = {
+    "background": 0,
+    "hat": 1,
+    "hair": 2,
+    "sunglasses": 3,
+    "upper_clothes": 4,
+    "skirt": 5,
+    "pants": 6,
+    "dress": 7,
+    "belt": 8,
+    "left_shoe": 9,
+    "right_shoe": 10,
+    "head": 11,
+    "left_leg": 12,
+    "right_leg": 13,
+    "left_arm": 14,
+    "right_arm": 15,
+    "bag": 16,
+    "scarf": 17,
+}
+
+def extend_arm_mask(wrist, elbow, scale):
+  wrist = elbow + scale * (wrist - elbow)
+  return wrist
+
+def hole_fill(img):
+    img = np.pad(img[1:-1, 1:-1], pad_width = 1, mode = 'constant', constant_values=0)
+    img_copy = img.copy()
+    mask = np.zeros((img.shape[0] + 2, img.shape[1] + 2), dtype=np.uint8)
+
+    cv2.floodFill(img, mask, (0, 0), 255)
+    img_inverse = cv2.bitwise_not(img)
+    dst = cv2.bitwise_or(img_copy, img_inverse)
+    return dst
+
+def refine_mask(mask):
+    contours, hierarchy = cv2.findContours(mask.astype(np.uint8),
+                                           cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
+    area = []
+    for j in range(len(contours)):
+        a_d = cv2.contourArea(contours[j], True)
+        area.append(abs(a_d))
+    refine_mask = np.zeros_like(mask).astype(np.uint8)
+    if len(area) != 0:
+        i = area.index(max(area))
+        cv2.drawContours(refine_mask, contours, i, color=255, thickness=-1)
+
+    return refine_mask
+
+def get_mask_location(model_type, category, model_parse: Image.Image, keypoint: dict, width=384,height=512):
+    im_parse = model_parse.resize((width, height), Image.NEAREST)
+    parse_array = np.array(im_parse)
+
+    if model_type == 'hd':
+        arm_width = 60
+    elif model_type == 'dc':
+        arm_width = 45
+    else:
+        raise ValueError("model_type must be \'hd\' or \'dc\'!")
+
+    parse_head = (parse_array == 1).astype(np.float32) + \
+                 (parse_array == 3).astype(np.float32) + \
+                 (parse_array == 11).astype(np.float32)
+
+    parser_mask_fixed = (parse_array == label_map["left_shoe"]).astype(np.float32) + \
+                        (parse_array == label_map["right_shoe"]).astype(np.float32) + \
+                        (parse_array == label_map["hat"]).astype(np.float32) + \
+                        (parse_array == label_map["sunglasses"]).astype(np.float32) + \
+                        (parse_array == label_map["bag"]).astype(np.float32)
+
+    parser_mask_changeable = (parse_array == label_map["background"]).astype(np.float32)
+
+    arms_left = (parse_array == 14).astype(np.float32)
+    arms_right = (parse_array == 15).astype(np.float32)
+
+    if category == 'dresses':
+        parse_mask = (parse_array == 7).astype(np.float32) + \
+                     (parse_array == 4).astype(np.float32) + \
+                     (parse_array == 5).astype(np.float32) + \
+                     (parse_array == 6).astype(np.float32)
+
+        parser_mask_changeable += np.logical_and(parse_array, np.logical_not(parser_mask_fixed))
+
+    elif category == 'upper_body':
+        parse_mask = (parse_array == 4).astype(np.float32) + (parse_array == 7).astype(np.float32)
+        parser_mask_fixed_lower_cloth = (parse_array == label_map["skirt"]).astype(np.float32) + \
+                                        (parse_array == label_map["pants"]).astype(np.float32)
+        parser_mask_fixed += parser_mask_fixed_lower_cloth
+        parser_mask_changeable += np.logical_and(parse_array, np.logical_not(parser_mask_fixed))
+    elif category == 'lower_body':
+        parse_mask = (parse_array == 6).astype(np.float32) + \
+                     (parse_array == 12).astype(np.float32) + \
+                     (parse_array == 13).astype(np.float32) + \
+                     (parse_array == 5).astype(np.float32)
+        parser_mask_fixed += (parse_array == label_map["upper_clothes"]).astype(np.float32) + \
+                             (parse_array == 14).astype(np.float32) + \
+                             (parse_array == 15).astype(np.float32)
+        parser_mask_changeable += np.logical_and(parse_array, np.logical_not(parser_mask_fixed))
+    else:
+        raise NotImplementedError
+
+    # Load pose points
+    pose_data = keypoint["pose_keypoints_2d"]
+    pose_data = np.array(pose_data)
+    pose_data = pose_data.reshape((-1, 2))
+
+    im_arms_left = Image.new('L', (width, height))
+    im_arms_right = Image.new('L', (width, height))
+    arms_draw_left = ImageDraw.Draw(im_arms_left)
+    arms_draw_right = ImageDraw.Draw(im_arms_right)
+    if category == 'dresses' or category == 'upper_body':
+        shoulder_right = np.multiply(tuple(pose_data[2][:2]), height / 512.0)
+        shoulder_left = np.multiply(tuple(pose_data[5][:2]), height / 512.0)
+        elbow_right = np.multiply(tuple(pose_data[3][:2]), height / 512.0)
+        elbow_left = np.multiply(tuple(pose_data[6][:2]), height / 512.0)
+        wrist_right = np.multiply(tuple(pose_data[4][:2]), height / 512.0)
+        wrist_left = np.multiply(tuple(pose_data[7][:2]), height / 512.0)
+        ARM_LINE_WIDTH = int(arm_width / 512 * height)
+        size_left = [shoulder_left[0] - ARM_LINE_WIDTH // 2, shoulder_left[1] - ARM_LINE_WIDTH // 2, shoulder_left[0] + ARM_LINE_WIDTH // 2, shoulder_left[1] + ARM_LINE_WIDTH // 2]
+        size_right = [shoulder_right[0] - ARM_LINE_WIDTH // 2, shoulder_right[1] - ARM_LINE_WIDTH // 2, shoulder_right[0] + ARM_LINE_WIDTH // 2,
+                      shoulder_right[1] + ARM_LINE_WIDTH // 2]
+        
+
+        if wrist_right[0] <= 1. and wrist_right[1] <= 1.:
+            im_arms_right = arms_right
+        else:
+            wrist_right = extend_arm_mask(wrist_right, elbow_right, 1.2)
+            arms_draw_right.line(np.concatenate((shoulder_right, elbow_right, wrist_right)).astype(np.uint16).tolist(), 'white', ARM_LINE_WIDTH, 'curve')
+            arms_draw_right.arc(size_right, 0, 360, 'white', ARM_LINE_WIDTH // 2)
+
+        if wrist_left[0] <= 1. and wrist_left[1] <= 1.:
+            im_arms_left = arms_left
+        else:
+            wrist_left = extend_arm_mask(wrist_left, elbow_left, 1.2)
+            arms_draw_left.line(np.concatenate((wrist_left, elbow_left, shoulder_left)).astype(np.uint16).tolist(), 'white', ARM_LINE_WIDTH, 'curve')
+            arms_draw_left.arc(size_left, 0, 360, 'white', ARM_LINE_WIDTH // 2)
+
+        hands_left = np.logical_and(np.logical_not(im_arms_left), arms_left)
+        hands_right = np.logical_and(np.logical_not(im_arms_right), arms_right)
+        parser_mask_fixed += hands_left + hands_right
+
+    parser_mask_fixed = np.logical_or(parser_mask_fixed, parse_head)
+    parse_mask = cv2.dilate(parse_mask, np.ones((5, 5), np.uint16), iterations=5)
+    if category == 'dresses' or category == 'upper_body':
+        neck_mask = (parse_array == 18).astype(np.float32)
+        neck_mask = cv2.dilate(neck_mask, np.ones((5, 5), np.uint16), iterations=1)
+        neck_mask = np.logical_and(neck_mask, np.logical_not(parse_head))
+        parse_mask = np.logical_or(parse_mask, neck_mask)
+        arm_mask = cv2.dilate(np.logical_or(im_arms_left, im_arms_right).astype('float32'), np.ones((5, 5), np.uint16), iterations=4)
+        parse_mask += np.logical_or(parse_mask, arm_mask)
+
+    parse_mask = np.logical_and(parser_mask_changeable, np.logical_not(parse_mask))
+
+    parse_mask_total = np.logical_or(parse_mask, parser_mask_fixed)
+    inpaint_mask = 1 - parse_mask_total
+    img = np.where(inpaint_mask, 255, 0)
+    dst = hole_fill(img.astype(np.uint8))
+    dst = refine_mask(dst)
+    inpaint_mask = dst / 255 * 1
+    mask = Image.fromarray(inpaint_mask.astype(np.uint8) * 255)
+    mask_gray = Image.fromarray(inpaint_mask.astype(np.uint8) * 127)
+
+    return mask, mask_gray